This function will write out a dataframe to a file, if the user would like to proceed, and also offer a fine-tuning command with the newly created file. For classification, it will optionally ask the user whether they would like to split the data into train/valid files, and modify the suggested command to include the valid set.
(df: pd.DataFrame, fname: str, any_remediations: bool, auto_accept: bool)
| 638 | |
| 639 | |
| 640 | def write_out_file(df: pd.DataFrame, fname: str, any_remediations: bool, auto_accept: bool) -> None: |
| 641 | """ |
| 642 | This function will write out a dataframe to a file, if the user would like to proceed, and also offer a fine-tuning command with the newly created file. |
| 643 | For classification it will optionally ask the user if they would like to split the data into train/valid files, and modify the suggested command to include the valid set. |
| 644 | """ |
| 645 | ft_format = infer_task_type(df) |
| 646 | common_prompt_suffix = get_common_xfix(df.prompt, xfix="suffix") |
| 647 | common_completion_suffix = get_common_xfix(df.completion, xfix="suffix") |
| 648 | |
| 649 | split = False |
| 650 | input_text = "- [Recommended] Would you like to split into training and validation set? [Y/n]: " |
| 651 | if ft_format == "classification": |
| 652 | if accept_suggestion(input_text, auto_accept): |
| 653 | split = True |
| 654 | |
| 655 | additional_params = "" |
| 656 | common_prompt_suffix_new_line_handled = common_prompt_suffix.replace("\n", "\\n") |
| 657 | common_completion_suffix_new_line_handled = common_completion_suffix.replace("\n", "\\n") |
| 658 | optional_ending_string = ( |
| 659 | f' Make sure to include `stop=["{common_completion_suffix_new_line_handled}"]` so that the generated texts ends at the expected place.' |
| 660 | if len(common_completion_suffix_new_line_handled) > 0 |
| 661 | else "" |
| 662 | ) |
| 663 | |
| 664 | input_text = "\n\nYour data will be written to a new JSONL file. Proceed [Y/n]: " |
| 665 | |
| 666 | if not any_remediations and not split: |
| 667 | sys.stdout.write( |
| 668 | f'\nYou can use your file for fine-tuning:\n> openai api fine_tunes.create -t "{fname}"{additional_params}\n\nAfter you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `{common_prompt_suffix_new_line_handled}` for the model to start generating completions, rather than continuing with the prompt.{optional_ending_string}\n' |
| 669 | ) |
| 670 | estimate_fine_tuning_time(df) |
| 671 | |
| 672 | elif accept_suggestion(input_text, auto_accept): |
| 673 | fnames = get_outfnames(fname, split) |
| 674 | if split: |
| 675 | assert len(fnames) == 2 and "train" in fnames[0] and "valid" in fnames[1] |
| 676 | MAX_VALID_EXAMPLES = 1000 |
| 677 | n_train = max(len(df) - MAX_VALID_EXAMPLES, int(len(df) * 0.8)) |
| 678 | df_train = df.sample(n=n_train, random_state=42) |
| 679 | df_valid = df.drop(df_train.index) |
| 680 | df_train[["prompt", "completion"]].to_json( # type: ignore |
| 681 | fnames[0], lines=True, orient="records", force_ascii=False, indent=None |
| 682 | ) |
| 683 | df_valid[["prompt", "completion"]].to_json( |
| 684 | fnames[1], lines=True, orient="records", force_ascii=False, indent=None |
| 685 | ) |
| 686 | |
| 687 | n_classes, pos_class = get_classification_hyperparams(df) |
| 688 | additional_params += " --compute_classification_metrics" |
| 689 | if n_classes == 2: |
| 690 | additional_params += f' --classification_positive_class "{pos_class}"' |
| 691 | else: |
| 692 | additional_params += f" --classification_n_classes {n_classes}" |
| 693 | else: |
| 694 | assert len(fnames) == 1 |
| 695 | df[["prompt", "completion"]].to_json( |
| 696 | fnames[0], lines=True, orient="records", force_ascii=False, indent=None |
| 697 | ) |
nothing calls this directly
no test coverage detected