This validator suggests lowercasing the column values if more than a third of the letters are uppercase.
(df: pd.DataFrame, column: Any)
| 427 | |
| 428 | |
def lower_case_validator(df: pd.DataFrame, column: Any) -> Remediation | None:
    """
    Suggest lowercasing the column values if more than a third of the letters are uppercase.

    Args:
        df: The dataset being validated.
        column: Label of the column whose text is inspected.

    Returns:
        A ``Remediation`` offering to lowercase the column when uppercase
        letters make up more than a third of all counted letters, otherwise
        ``None``.
    """

    def lower_case(x: Any) -> Any:
        # Offered as the optional fix: assigns the lowercased column back onto
        # the frame it receives and returns that same frame.
        x[column] = x[column].str.lower()
        return x

    count_upper = 0
    count_lower = 0
    for value in df[column]:
        # Missing or non-text cells (e.g. NaN/None) carry no letters; skip
        # them instead of failing when trying to iterate over them.
        if not isinstance(value, str):
            continue
        for c in value:
            # Only alphabetic characters take part in the ratio.
            if not c.isalpha():
                continue
            if c.isupper():
                count_upper += 1
            elif c.islower():
                count_lower += 1

    # upper * 2 > lower  <=>  upper / (upper + lower) > 1/3
    if count_upper * 2 > count_lower:
        return Remediation(
            name="lower_case",
            immediate_msg=f"\n- More than a third of your `{column}` column/key is uppercase. Uppercase {column}s tends to perform worse than a mixture of case encountered in normal language. We recommend to lower case the data if that makes sense in your domain. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details",
            optional_msg=f"Lowercase all your data in column/key `{column}`",
            optional_fn=lower_case,
        )
    return None
| 449 | |
| 450 | |
| 451 | def read_any_format( |
no test coverage detected