MCPcopy
hub / github.com/huggingface/transformers / get_config_overrides

Function get_config_overrides

utils/create_dummy_models.py:1262–1331  ·  view source on GitHub ↗
(config_class, processors)

Source from the content-addressed store, hash-verified

1260
1261
1262def get_config_overrides(config_class, processors):
1263 # `Bark` configuration is too special. Let's just not handle this for now.
1264 if config_class.__name__ == "BarkConfig":
1265 return {}
1266
1267 config_overrides = {}
1268
1269 # Check if there is any tokenizer (prefer fast version if any)
1270 tokenizer = None
1271 for processor in processors:
1272 if isinstance(processor, PreTrainedTokenizerFast):
1273 tokenizer = processor
1274 break
1275 elif isinstance(processor, PythonBackend):
1276 tokenizer = processor
1277
1278 if tokenizer is None:
1279 return config_overrides
1280
1281 # Get some properties of the (already converted) tokenizer (smaller vocab size, special token ids, etc.)
1282 # We use `len(tokenizer)` instead of `tokenizer.vocab_size` to avoid potential issues for tokenizers with non-empty
1283 # `added_tokens_encoder`. One example is the `DebertaV2Tokenizer` where the mask token is the extra token.
1284 vocab_size = len(tokenizer)
1285
1286 # The original checkpoint has length `35998`, but it doesn't have ids `30400` and `30514` but instead `35998` and
1287 # `35999`.
1288 if config_class.__name__ == "GPTSanJapaneseConfig":
1289 vocab_size += 2
1290
1291 config_overrides["vocab_size"] = vocab_size
1292
1293 # Used to create a new model tester with `tokenizer.vocab_size` in order to get the (updated) special token ids.
1294 model_tester_kwargs = {"vocab_size": vocab_size}
1295 # `FSMTModelTester` accepts `src_vocab_size` and `tgt_vocab_size` but not `vocab_size`.
1296 if config_class.__name__ == "FSMTConfig":
1297 del model_tester_kwargs["vocab_size"]
1298 model_tester_kwargs["src_vocab_size"] = tokenizer.src_vocab_size
1299 model_tester_kwargs["tgt_vocab_size"] = tokenizer.tgt_vocab_size
1300
1301 _tiny_config = get_tiny_config(config_class, **model_tester_kwargs)
1302
1303 # handle the possibility of `text_config` inside `_tiny_config` for clip-like models (`owlvit`, `groupvit`, etc.)
1304 if hasattr(_tiny_config, "text_config"):
1305 _tiny_config = _tiny_config.text_config
1306
1307 # Collect values of some special token ids
1308 for attr in dir(_tiny_config):
1309 if attr.endswith("_token_id"):
1310 token_id = getattr(_tiny_config, attr)
1311 if token_id is not None:
1312 # Using the token id values from `tokenizer` instead of from `_tiny_config`.
1313 token_id = get_token_id_from_tokenizer(attr, tokenizer, original_token_id=token_id)
1314 config_overrides[attr] = token_id
1315
1316 if config_class.__name__ == "FSMTConfig":
1317 config_overrides["src_vocab_size"] = tokenizer.src_vocab_size
1318 config_overrides["tgt_vocab_size"] = tokenizer.tgt_vocab_size
1319

Callers 1

build · Function · 0.85

Calls 2

get_tiny_config · Function · 0.85

Tested by

no test coverage detected