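This validator checks for columns other than the required fields (by default `prompt` and `completion`) and returns a remediation that removes them from the dataframe.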
(df: pd.DataFrame, fields: list[str] = ["prompt", "completion"])
| 73 | |
| 74 | |
def additional_column_validator(df: pd.DataFrame, fields: list[str] = ["prompt", "completion"]) -> Remediation:
    """
    Build a remediation that removes every column not listed in `fields`.

    The check only triggers when the dataframe has more than two columns. In
    that case the columns that are not in `fields` are reported, and the
    returned remediation carries a `necessary_fn` that keeps only `fields`.
    An extra warning is appended for each required field whose name appears
    inside the name of one of the additional columns.

    Args:
        df: The dataframe to inspect.
        fields: Names of the columns to keep. The default list is shared
            between calls but is never mutated here.

    Returns:
        A `Remediation` named "additional_column". Its messages and
        `necessary_fn` are None when `df` has two columns or fewer.
    """
    additional_columns = []
    necessary_msg = None
    immediate_msg = None
    necessary_fn = None  # type: ignore

    if len(df.columns) > 2:
        additional_columns = [c for c in df.columns if c not in fields]
        warn_message = ""
        # Iterate over the required fields, not over the additional columns:
        # comparing the additional columns against themselves always matched
        # (every name contains itself), so the warning fired for every extra
        # column and named a column that was about to be removed.
        for field in fields:
            # str() keeps the substring test from raising on non-string labels.
            dups = [c for c in additional_columns if field in str(c)]
            if dups:
                warn_message += f"\n  WARNING: Some of the additional columns/keys contain `{field}` in their name. These will be ignored, and the column/key `{field}` will be used instead. This could also result from a duplicate column/key in the provided file."
        immediate_msg = f"\n- The input file should contain exactly two columns/keys per row. Additional columns/keys present are: {additional_columns}{warn_message}"
        necessary_msg = f"Remove additional columns/keys: {additional_columns}"

        def necessary_fn(x: Any) -> Any:
            # Selecting with a list of labels keeps only the required columns.
            return x[fields]

    return Remediation(
        name="additional_column",
        immediate_msg=immediate_msg,
        necessary_msg=necessary_msg,
        necessary_fn=necessary_fn,
    )
| 103 | |
| 104 | |
| 105 | def non_empty_field_validator(df: pd.DataFrame, field: str = "completion") -> Remediation: |
nothing calls this directly
no test coverage detected