MCPcopy
hub / github.com/huggingface/transformers / convert_processors

Function convert_processors

utils/create_dummy_models.py:782–1013  ·  view source on GitHub ↗

Change a processor to work with smaller inputs. For tokenizers, we try to reduce their vocabulary size. For feature extractors, we use a smaller image size or change other attributes using the values from `tiny_config`. See `convert_feature_extractor`. This method should not fail: we catch the errors and put them in `result["warnings"]` with descriptive messages.

(processors, tiny_config, output_folder, result)

Source from the content-addressed store, hash-verified

780
781
782def convert_processors(processors, tiny_config, output_folder, result):
783 """Change a processor to work with smaller inputs.
784
785 For tokenizers, we try to reduce their vocabulary size.
786
787 For feature extractor, we use smaller image size or change
788 other attributes using the values from `tiny_config`. See `convert_feature_extractor`.
789
790 This method should not fail: we catch the errors and put them in `result["warnings"]` with descriptive messages.
791 """
792
def _sanity_check(fast_tokenizer, slow_tokenizer, keep_fast_tokenizer=False):
    """Set tokenizer(s) to `None` if the fast/slow tokenizers have different values for `vocab_size` or `length`.

    Every mismatch found is reported by appending a descriptive message to `result["warnings"]`
    (`result` is taken from the enclosing scope).

    Args:
        fast_tokenizer: The fast tokenizer, or `None` if there is none.
        slow_tokenizer: The slow tokenizer, or `None` if there is none.
        keep_fast_tokenizer: If `True`, the fast tokenizer is kept on a mismatch and only the slow
            tokenizer is dropped.

    Returns:
        The `(fast_tokenizer, slow_tokenizer)` pair, with dropped tokenizers replaced by `None`.
    """
    # sanity check 1: fast and slow tokenizers should be compatible (vocab_size)
    if fast_tokenizer is not None and slow_tokenizer is not None:
        if fast_tokenizer.vocab_size != slow_tokenizer.vocab_size:
            warning_message = (
                "The fast/slow tokenizers "
                f"({fast_tokenizer.__class__.__name__}/{slow_tokenizer.__class__.__name__}) have different "
                "vocabulary size: "
                f"fast_tokenizer.vocab_size = {fast_tokenizer.vocab_size} and "
                f"slow_tokenizer.vocab_size = {slow_tokenizer.vocab_size}."
            )
            result["warnings"].append(warning_message)
            if not keep_fast_tokenizer:
                fast_tokenizer = None
            slow_tokenizer = None

    # sanity check 2: fast and slow tokenizers should be compatible (length)
    # This is skipped automatically when check 1 already dropped a tokenizer.
    if fast_tokenizer is not None and slow_tokenizer is not None:
        if len(fast_tokenizer) != len(slow_tokenizer):
            # Name both classes here as well, so the warning identifies the offending pair
            # (the message previously contained empty parentheses).
            warning_message = (
                "The fast/slow tokenizers "
                f"({fast_tokenizer.__class__.__name__}/{slow_tokenizer.__class__.__name__}) have different "
                "length: "
                f"len(fast_tokenizer) = {len(fast_tokenizer)} and "
                f"len(slow_tokenizer) = {len(slow_tokenizer)}."
            )
            result["warnings"].append(warning_message)
            if not keep_fast_tokenizer:
                fast_tokenizer = None
            slow_tokenizer = None

    return fast_tokenizer, slow_tokenizer
827
828 tokenizers = []
829 feature_extractors = []
830 for processor in processors:
831 if isinstance(processor, PreTrainedTokenizerBase):
832 if processor.__class__.__name__ not in {x.__class__.__name__ for x in tokenizers}:
833 tokenizers.append(processor)
834 elif isinstance(processor, BaseImageProcessor):
835 if processor.__class__.__name__ not in {x.__class__.__name__ for x in feature_extractors}:
836 feature_extractors.append(processor)
837 elif isinstance(processor, FeatureExtractionMixin):
838 if processor.__class__.__name__ not in {x.__class__.__name__ for x in feature_extractors}:
839 feature_extractors.append(processor)

Callers 1

buildFunction · 0.85

Calls 4

_sanity_checkFunction · 0.85
save_pretrainedMethod · 0.45
from_pretrainedMethod · 0.45

Tested by

no test coverage detected