Serializes this instance to a Python dictionary. Returns: `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
(self)
| 999 | return proper_class |
| 1000 | |
| 1001 | def to_dict(self) -> dict[str, Any]: |
| 1002 | """ |
| 1003 | Serializes this instance to a Python dictionary. |
| 1004 | |
| 1005 | Returns: |
| 1006 | `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance. |
| 1007 | """ |
| 1008 | # Exclude tokenizer attributes before deepcopying to avoid copying large vocab/token structures. |
| 1009 | tokenizer_attributes = set() |
| 1010 | for attribute in self.__class__.get_attributes(): |
| 1011 | if attribute in self.__dict__: |
| 1012 | modality = _get_modality_for_attribute(attribute) |
| 1013 | if modality == "tokenizer": |
| 1014 | tokenizer_attributes.add(attribute) |
| 1015 | |
| 1016 | dict_to_copy = {k: v for k, v in self.__dict__.items() if k not in tokenizer_attributes} |
| 1017 | output = copy.deepcopy(dict_to_copy) |
| 1018 | |
| 1019 | # Get the kwargs in `__init__`. |
| 1020 | sig = inspect.signature(self.__init__) |
| 1021 | # Only save the attributes that are present in the kwargs of `__init__` |
| 1022 | # or returned by `get_attributes()`. |
| 1023 | attrs_to_save = list(sig.parameters) + self.__class__.get_attributes() |
| 1024 | # extra attributes to be kept |
| 1025 | attrs_to_save += ["auto_map"] |
| 1026 | |
| 1027 | if "chat_template" in output: |
| 1028 | del output["chat_template"] |
| 1029 | |
| 1030 | def cast_array_to_list(dictionary): |
| 1031 | """ |
| 1032 | Numpy arrays are not serializable but can be in pre-processing dicts. |
| 1033 | This function casts arrays to lists, recursing through the nested configs as well. |
| 1034 | """ |
| 1035 | for key, value in dictionary.items(): |
| 1036 | if isinstance(value, np.ndarray): |
| 1037 | dictionary[key] = value.tolist() |
| 1038 | elif isinstance(value, dict): |
| 1039 | dictionary[key] = cast_array_to_list(value) |
| 1040 | return dictionary |
| 1041 | |
| 1042 | # Special case, add `audio_tokenizer` dict which points to model weights and path |
| 1043 | if "audio_tokenizer" in output: |
| 1044 | audio_tokenizer_dict = { |
| 1045 | "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__, |
| 1046 | "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path, |
| 1047 | } |
| 1048 | output["audio_tokenizer"] = audio_tokenizer_dict |
| 1049 | |
| 1050 | # Serialize attributes as a dict |
| 1051 | output = { |
| 1052 | k: v.to_dict() if isinstance(v, PushToHubMixin) else v |
| 1053 | for k, v in output.items() |
| 1054 | if ( |
| 1055 | k in attrs_to_save # keep all attributes that have to be serialized |
| 1056 | and v.__class__.__name__ != "BeamSearchDecoderCTC" # remove attributes that are objects |
| 1057 | ) |
| 1058 | } |
no test coverage detected