(config_class, processors)
| 1260 | |
| 1261 | |
| 1262 | def get_config_overrides(config_class, processors): |
| 1263 | # `Bark` configuration is too special. Let's just not handle this for now. |
| 1264 | if config_class.__name__ == "BarkConfig": |
| 1265 | return {} |
| 1266 | |
| 1267 | config_overrides = {} |
| 1268 | |
| 1269 | # Check if there is any tokenizer (prefer fast version if any) |
| 1270 | tokenizer = None |
| 1271 | for processor in processors: |
| 1272 | if isinstance(processor, PreTrainedTokenizerFast): |
| 1273 | tokenizer = processor |
| 1274 | break |
| 1275 | elif isinstance(processor, PythonBackend): |
| 1276 | tokenizer = processor |
| 1277 | |
| 1278 | if tokenizer is None: |
| 1279 | return config_overrides |
| 1280 | |
| 1281 | # Get some properties of the (already converted) tokenizer (smaller vocab size, special token ids, etc.) |
| 1282 | # We use `len(tokenizer)` instead of `tokenizer.vocab_size` to avoid potential issues for tokenizers with non-empty |
| 1283 | # `added_tokens_encoder`. One example is the `DebertaV2Tokenizer` where the mask token is the extra token. |
| 1284 | vocab_size = len(tokenizer) |
| 1285 | |
| 1286 | # The original checkpoint has length `35998`, but instead of ids `30400` and `30514` it has `35998` and `35999`, |
| 1287 | # so the vocab size must be 2 larger to cover them. |
| 1288 | if config_class.__name__ == "GPTSanJapaneseConfig": |
| 1289 | vocab_size += 2 |
| 1290 | |
| 1291 | config_overrides["vocab_size"] = vocab_size |
| 1292 | |
| 1293 | # Used to create a new model tester with the `vocab_size` computed above (from `len(tokenizer)`, not `tokenizer.vocab_size`) in order to get the (updated) special token ids. |
| 1294 | model_tester_kwargs = {"vocab_size": vocab_size} |
| 1295 | # `FSMTModelTester` accepts `src_vocab_size` and `tgt_vocab_size` but not `vocab_size`. |
| 1296 | if config_class.__name__ == "FSMTConfig": |
| 1297 | del model_tester_kwargs["vocab_size"] |
| 1298 | model_tester_kwargs["src_vocab_size"] = tokenizer.src_vocab_size |
| 1299 | model_tester_kwargs["tgt_vocab_size"] = tokenizer.tgt_vocab_size |
| 1300 | |
| 1301 | _tiny_config = get_tiny_config(config_class, **model_tester_kwargs) |
| 1302 | |
| 1303 | # handle the possibility of `text_config` inside `_tiny_config` for clip-like models (`owlvit`, `groupvit`, etc.) |
| 1304 | if hasattr(_tiny_config, "text_config"): |
| 1305 | _tiny_config = _tiny_config.text_config |
| 1306 | |
| 1307 | # Collect values of some special token ids |
| 1308 | for attr in dir(_tiny_config): |
| 1309 | if attr.endswith("_token_id"): |
| 1310 | token_id = getattr(_tiny_config, attr) |
| 1311 | if token_id is not None: |
| 1312 | # Using the token id values from `tokenizer` instead of from `_tiny_config`. |
| 1313 | token_id = get_token_id_from_tokenizer(attr, tokenizer, original_token_id=token_id) |
| 1314 | config_overrides[attr] = token_id |
| 1315 | |
| 1316 | if config_class.__name__ == "FSMTConfig": |
| 1317 | config_overrides["src_vocab_size"] = tokenizer.src_vocab_size |
| 1318 | config_overrides["tgt_vocab_size"] = tokenizer.tgt_vocab_size |
| 1319 |
no test coverage detected