Converts categorical columns to Categorical type.
(
self,
data: DataFrame,
value_label_dict: dict[str, dict[int, str]],
lbllist: Sequence[str],
order_categoricals: bool,
)
| 1882 | return data[columns] |
| 1883 | |
| 1884 | def _do_convert_categoricals( |
| 1885 | self, |
| 1886 | data: DataFrame, |
| 1887 | value_label_dict: dict[str, dict[int, str]], |
| 1888 | lbllist: Sequence[str], |
| 1889 | order_categoricals: bool, |
| 1890 | ) -> DataFrame: |
| 1891 | """ |
| 1892 | Converts categorical columns to Categorical type. |
| 1893 | """ |
| 1894 | if not value_label_dict: |
| 1895 | return data |
| 1896 | cat_converted_data = [] |
| 1897 | for col, label in zip(data, lbllist, strict=True): |
| 1898 | if label in value_label_dict: |
| 1899 | # Explicit call with ordered=order_categoricals |
| 1900 | vl = value_label_dict[label] |
| 1901 | keys = np.array(list(vl.keys())) |
| 1902 | column = data[col] |
| 1903 | key_matches = column.isin(keys) |
| 1904 | if self._using_iterator and key_matches.all(): |
| 1905 | initial_categories: np.ndarray | None = keys |
| 1906 | # If all categories are in the keys and we are iterating, |
| 1907 | # use the same keys for all chunks. If some are missing |
| 1908 | # value labels, then we will fall back to the categories |
| 1909 | # varying across chunks. |
| 1910 | else: |
| 1911 | if self._using_iterator: |
| 1912 | # warn if using an iterator |
| 1913 | warnings.warn( |
| 1914 | categorical_conversion_warning, |
| 1915 | CategoricalConversionWarning, |
| 1916 | stacklevel=find_stack_level(), |
| 1917 | ) |
| 1918 | initial_categories = None |
| 1919 | cat_data = Categorical( |
| 1920 | column, categories=initial_categories, ordered=order_categoricals |
| 1921 | ) |
| 1922 | if initial_categories is None: |
| 1923 | # If None here, then we need to match the cats in the Categorical |
| 1924 | categories = [] |
| 1925 | for category in cat_data.categories: |
| 1926 | if category in vl: |
| 1927 | categories.append(vl[category]) |
| 1928 | else: |
| 1929 | categories.append(category) |
| 1930 | else: |
| 1931 | # If all cats are matched, we can use the values |
| 1932 | categories = list(vl.values()) |
| 1933 | try: |
| 1934 | # Try to catch duplicate categories |
| 1935 | # TODO: if we get a non-copying rename_categories, use that |
| 1936 | cat_data = cat_data.rename_categories(categories) |
| 1937 | except ValueError as err: |
| 1938 | vc = Series(categories, copy=False).value_counts() |
| 1939 | repeated_cats = list(vc.index[vc > 1]) |
| 1940 | repeats = "-" * 80 + "\n" + "\n".join(repeated_cats) |
| 1941 | # GH 25772 |
no test coverage detected