This validator will suggest to the user to remove examples that are too long.
(df: pd.DataFrame)
| 154 | |
| 155 | |
def long_examples_validator(df: pd.DataFrame) -> Remediation:
    """
    This validator will suggest to the user to remove examples that are too long.

    An example counts as long when the combined character length of its
    `prompt` and `completion` exceeds 10000. The check is skipped entirely
    when `infer_task_type(df)` reports "open-ended generation".

    Returns a `Remediation` named "long_examples". Its messages and
    `optional_fn` stay `None` unless at least one long example is found; in
    that case `optional_fn` takes a frame and returns it without the long rows.
    """
    immediate_msg = None
    optional_msg = None
    optional_fn = None  # type: ignore

    ft_type = infer_task_type(df)
    if ft_type != "open-ended generation":

        def get_long_indexes(d: pd.DataFrame) -> Any:
            # The result holds 0-based row POSITIONS, not index labels:
            # reset_index() yields a fresh 0..n-1 index that the boolean
            # mask is applied to.
            long_examples = d.apply(lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1)
            return d.reset_index().index[long_examples].tolist()

        long_indexes = get_long_indexes(df)

        if long_indexes:
            immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
            optional_msg = f"Remove {len(long_indexes)} long examples"

            def optional_fn(x: Any) -> Any:
                # Recompute on the frame actually being remediated: earlier
                # remediations may have removed rows and shifted positions.
                long_indexes_to_drop = get_long_indexes(x)
                if long_indexes != long_indexes_to_drop:
                    sys.stdout.write(
                        f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}\n"
                    )
                # Remove by position. DataFrame.drop() interprets its argument
                # as index labels, which only coincide with positions while the
                # index is still a default 0..n-1 range; after a previous row
                # removal it would drop the wrong rows or raise KeyError.
                positions_to_drop = set(long_indexes_to_drop)
                positions_to_keep = [pos for pos in range(len(x)) if pos not in positions_to_drop]
                return x.iloc[positions_to_keep]

    return Remediation(
        name="long_examples",
        immediate_msg=immediate_msg,
        optional_msg=optional_msg,
        optional_fn=optional_fn,
    )
| 191 | |
| 192 | |
| 193 | def common_prompt_suffix_validator(df: pd.DataFrame) -> Remediation: |
nothing calls this directly
no test coverage detected