()
| 227 | |
| 228 | |
| 229 | def main(): |
| 230 | # See all possible arguments in src/transformers/training_args.py |
| 231 | # or by passing the --help flag to this script. |
| 232 | # We now keep distinct sets of args, for a cleaner separation of concerns. |
| 233 | |
| 234 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) |
| 235 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): |
| 236 | # If we pass only one argument to the script and it's the path to a json file, |
| 237 | # let's parse it to get our arguments. |
| 238 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) |
| 239 | else: |
| 240 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() |
| 241 | |
| 242 | # Setup logging |
| 243 | logging.basicConfig( |
| 244 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", |
| 245 | datefmt="%m/%d/%Y %H:%M:%S", |
| 246 | handlers=[logging.StreamHandler(sys.stdout)], |
| 247 | ) |
| 248 | |
| 249 | if training_args.should_log: |
| 250 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. |
| 251 | transformers.utils.logging.set_verbosity_info() |
| 252 | |
| 253 | log_level = training_args.get_process_log_level() |
| 254 | logger.setLevel(log_level) |
| 255 | datasets.utils.logging.set_verbosity(log_level) |
| 256 | transformers.utils.logging.set_verbosity(log_level) |
| 257 | transformers.utils.logging.enable_default_handler() |
| 258 | transformers.utils.logging.enable_explicit_format() |
| 259 | |
| 260 | # Log on each process the small summary: |
| 261 | logger.warning( |
| 262 | f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " |
| 263 | + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" |
| 264 | ) |
| 265 | logger.info(f"Training/evaluation parameters {training_args}") |
| 266 | |
| 267 | # Set seed before initializing model. |
| 268 | set_seed(training_args.seed) |
| 269 | |
| 270 | # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) |
| 271 | # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). |
| 272 | # |
| 273 | # For CSV/JSON files, this script will use as labels the column called 'label' and as the pair of sentences the |
| 274 | # sentences in columns called 'sentence1' and 'sentence2' if such columns exist, or the first two columns not named |
| 275 | # 'label' if at least two columns are provided. |
| 276 | # |
| 277 | # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this |
| 278 | # single column. You can easily tweak this behavior (see below) |
| 279 | # |
| 280 | # In distributed training, the load_dataset function guarantees that only one local process can concurrently |
| 281 | # download the dataset. |
| 282 | if data_args.task_name is not None: |
| 283 | # Downloading and loading a dataset from the hub. |
| 284 | raw_datasets = load_dataset( |
| 285 | "nyu-mll/glue", |
| 286 | data_args.task_name, |
no test coverage detected