This validator suggests adding a common suffix to the prompts if one doesn't already exist. It applies only to classification and conditional generation tasks.
(df: pd.DataFrame)
| 191 | |
| 192 | |
| 193 | def common_prompt_suffix_validator(df: pd.DataFrame) -> Remediation: |
| 194 | """ |
| 195 | This validator will suggest to add a common suffix to the prompt if one doesn't already exist in case of classification or conditional generation. |
| 196 | """ |
| 197 | error_msg = None |
| 198 | immediate_msg = None |
| 199 | optional_msg = None |
| 200 | optional_fn = None # type: ignore |
| 201 | |
| 202 | # Find a suffix which is not contained within the prompt otherwise |
| 203 | suggested_suffix = "\n\n### =>\n\n" |
| 204 | suffix_options = [ |
| 205 | " ->", |
| 206 | "\n\n###\n\n", |
| 207 | "\n\n===\n\n", |
| 208 | "\n\n---\n\n", |
| 209 | "\n\n===>\n\n", |
| 210 | "\n\n--->\n\n", |
| 211 | ] |
| 212 | for suffix_option in suffix_options: |
| 213 | if suffix_option == " ->": |
| 214 | if df.prompt.str.contains("\n").any(): |
| 215 | continue |
| 216 | if df.prompt.str.contains(suffix_option, regex=False).any(): |
| 217 | continue |
| 218 | suggested_suffix = suffix_option |
| 219 | break |
| 220 | display_suggested_suffix = suggested_suffix.replace("\n", "\\n") |
| 221 | |
| 222 | ft_type = infer_task_type(df) |
| 223 | if ft_type == "open-ended generation": |
| 224 | return Remediation(name="common_suffix") |
| 225 | |
| 226 | def add_suffix(x: Any, suffix: Any) -> Any: |
| 227 | x["prompt"] += suffix |
| 228 | return x |
| 229 | |
| 230 | common_suffix = get_common_xfix(df.prompt, xfix="suffix") |
| 231 | if (df.prompt == common_suffix).all(): |
| 232 | error_msg = f"All prompts are identical: `{common_suffix}`\nConsider leaving the prompts blank if you want to do open-ended generation, otherwise ensure prompts are different" |
| 233 | return Remediation(name="common_suffix", error_msg=error_msg) |
| 234 | |
| 235 | if common_suffix != "": |
| 236 | common_suffix_new_line_handled = common_suffix.replace("\n", "\\n") |
| 237 | immediate_msg = f"\n- All prompts end with suffix `{common_suffix_new_line_handled}`" |
| 238 | if len(common_suffix) > 10: |
| 239 | immediate_msg += f". This suffix seems very long. Consider replacing with a shorter suffix, such as `{display_suggested_suffix}`" |
| 240 | if df.prompt.str[: -len(common_suffix)].str.contains(common_suffix, regex=False).any(): |
| 241 | immediate_msg += f"\n WARNING: Some of your prompts contain the suffix `{common_suffix}` more than once. We strongly suggest that you review your prompts and add a unique suffix" |
| 242 | |
| 243 | else: |
| 244 | immediate_msg = "\n- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty" |
| 245 | |
| 246 | if common_suffix == "": |
| 247 | optional_msg = f"Add a suffix separator `{display_suggested_suffix}` to all prompts" |
| 248 | |
| 249 | def optional_fn(x: Any) -> Any: |
| 250 | return add_suffix(x, suggested_suffix) |
nothing calls this directly
no test coverage detected