Change a processor to work with smaller inputs. For tokenizers, we try to reduce their vocabulary size. For feature extractors, we use a smaller image size or change other attributes using the values from `tiny_config`. See `convert_feature_extractor`. This method should not fail: we catch the errors and put them in `result["warnings"]` with descriptive messages.
(processors, tiny_config, output_folder, result)
| 780 | |
| 781 | |
| 782 | def convert_processors(processors, tiny_config, output_folder, result): |
| 783 | """Change a processor to work with smaller inputs. |
| 784 | |
| 785 | For tokenizers, we try to reduce their vocabulary size. |
| 786 | |
| 787 | For feature extractor, we use smaller image size or change |
| 788 | other attributes using the values from `tiny_config`. See `convert_feature_extractor`. |
| 789 | |
| 790 | This method should not fail: we catch the errors and put them in `result["warnings"]` with descriptive messages. |
| 791 | """ |
| 792 | |
def _sanity_check(fast_tokenizer, slow_tokenizer, keep_fast_tokenizer=False):
    """Set tokenizer(s) to `None` if the fast/slow tokenizers have different values for `vocab_size` or `length`.

    Two checks run in order: `vocab_size` first, then `len()`. On a mismatch, a descriptive message is
    appended to `result["warnings"]` (`result` is taken from the enclosing scope), the slow tokenizer is
    set to `None`, and the fast tokenizer is set to `None` as well unless `keep_fast_tokenizer=True`.
    Because the slow tokenizer is `None` after a failed check, the second check is skipped in that case.

    Args:
        fast_tokenizer: The fast tokenizer, or `None`.
        slow_tokenizer: The slow tokenizer, or `None`.
        keep_fast_tokenizer: If `True`, the fast tokenizer will be kept even when a check fails.

    Returns:
        The `(fast_tokenizer, slow_tokenizer)` pair, where either element may have been replaced by `None`.
    """
    # sanity check 1: fast and slow tokenizers should be compatible (vocab_size)
    if fast_tokenizer is not None and slow_tokenizer is not None:
        if fast_tokenizer.vocab_size != slow_tokenizer.vocab_size:
            warning_message = (
                "The fast/slow tokenizers "
                f"({fast_tokenizer.__class__.__name__}/{slow_tokenizer.__class__.__name__}) have different "
                "vocabulary size: "
                f"fast_tokenizer.vocab_size = {fast_tokenizer.vocab_size} and "
                f"slow_tokenizer.vocab_size = {slow_tokenizer.vocab_size}."
            )
            result["warnings"].append(warning_message)
            if not keep_fast_tokenizer:
                fast_tokenizer = None
            slow_tokenizer = None

    # sanity check 2: fast and slow tokenizers should be compatible (length)
    if fast_tokenizer is not None and slow_tokenizer is not None:
        if len(fast_tokenizer) != len(slow_tokenizer):
            # Name both classes here too, so the warning is as informative as the vocab-size one above.
            warning_message = (
                "The fast/slow tokenizers "
                f"({fast_tokenizer.__class__.__name__}/{slow_tokenizer.__class__.__name__}) have different "
                "length: "
                f"len(fast_tokenizer) = {len(fast_tokenizer)} and "
                f"len(slow_tokenizer) = {len(slow_tokenizer)}."
            )
            result["warnings"].append(warning_message)
            if not keep_fast_tokenizer:
                fast_tokenizer = None
            slow_tokenizer = None

    return fast_tokenizer, slow_tokenizer
| 827 | |
| 828 | tokenizers = [] |
| 829 | feature_extractors = [] |
| 830 | for processor in processors: |
| 831 | if isinstance(processor, PreTrainedTokenizerBase): |
| 832 | if processor.__class__.__name__ not in {x.__class__.__name__ for x in tokenizers}: |
| 833 | tokenizers.append(processor) |
| 834 | elif isinstance(processor, BaseImageProcessor): |
| 835 | if processor.__class__.__name__ not in {x.__class__.__name__ for x in feature_extractors}: |
| 836 | feature_extractors.append(processor) |
| 837 | elif isinstance(processor, FeatureExtractionMixin): |
| 838 | if processor.__class__.__name__ not in {x.__class__.__name__ for x in feature_extractors}: |
| 839 | feature_extractors.append(processor) |
no test coverage detected