This validator suggests adding a common suffix to the completions, if one doesn't already exist, in the case of classification or conditional generation.
(df: pd.DataFrame)
| 334 | |
| 335 | |
| 336 | def common_completion_suffix_validator(df: pd.DataFrame) -> Remediation: |
| 337 | """ |
| 338 | This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation. |
| 339 | """ |
| 340 | error_msg = None |
| 341 | immediate_msg = None |
| 342 | optional_msg = None |
| 343 | optional_fn = None # type: ignore |
| 344 | |
| 345 | ft_type = infer_task_type(df) |
| 346 | if ft_type == "open-ended generation" or ft_type == "classification": |
| 347 | return Remediation(name="common_suffix") |
| 348 | |
| 349 | common_suffix = get_common_xfix(df.completion, xfix="suffix") |
| 350 | if (df.completion == common_suffix).all(): |
| 351 | error_msg = f"All completions are identical: `{common_suffix}`\nEnsure completions are different, otherwise the model will just repeat `{common_suffix}`" |
| 352 | return Remediation(name="common_suffix", error_msg=error_msg) |
| 353 | |
| 354 | # Find a suffix which is not contained within the completion otherwise |
| 355 | suggested_suffix = " [END]" |
| 356 | suffix_options = [ |
| 357 | "\n", |
| 358 | ".", |
| 359 | " END", |
| 360 | "***", |
| 361 | "+++", |
| 362 | "&&&", |
| 363 | "$$$", |
| 364 | "@@@", |
| 365 | "%%%", |
| 366 | ] |
| 367 | for suffix_option in suffix_options: |
| 368 | if df.completion.str.contains(suffix_option, regex=False).any(): |
| 369 | continue |
| 370 | suggested_suffix = suffix_option |
| 371 | break |
| 372 | display_suggested_suffix = suggested_suffix.replace("\n", "\\n") |
| 373 | |
| 374 | def add_suffix(x: Any, suffix: Any) -> Any: |
| 375 | x["completion"] += suffix |
| 376 | return x |
| 377 | |
| 378 | if common_suffix != "": |
| 379 | common_suffix_new_line_handled = common_suffix.replace("\n", "\\n") |
| 380 | immediate_msg = f"\n- All completions end with suffix `{common_suffix_new_line_handled}`" |
| 381 | if len(common_suffix) > 10: |
| 382 | immediate_msg += f". This suffix seems very long. Consider replacing with a shorter suffix, such as `{display_suggested_suffix}`" |
| 383 | if df.completion.str[: -len(common_suffix)].str.contains(common_suffix, regex=False).any(): |
| 384 | immediate_msg += f"\n WARNING: Some of your completions contain the suffix `{common_suffix}` more than once. We suggest that you review your completions and add a unique ending" |
| 385 | |
| 386 | else: |
| 387 | immediate_msg = "\n- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples." |
| 388 | |
| 389 | if common_suffix == "": |
| 390 | optional_msg = f"Add a suffix ending `{display_suggested_suffix}` to all completions" |
| 391 | |
| 392 | def optional_fn(x: Any) -> Any: |
| 393 | return add_suffix(x, suggested_suffix) |
nothing calls this directly
no test coverage detected