| 1777 | return data |
| 1778 | |
| 1779 | def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame: |
| 1780 | # missing code for double was different in version 105 and prior |
| 1781 | old_missingdouble = float.fromhex("0x1.0p333") |
| 1782 | |
| 1783 | # Check for missing values, and replace if found |
| 1784 | replacements = {} |
| 1785 | for i in range(len(data.columns)): |
| 1786 | fmt = self._typlist[i] |
| 1787 | # recode instances of the old missing code to the currently used value |
| 1788 | if self._format_version <= 105 and fmt == "d": |
| 1789 | data.iloc[:, i] = data.iloc[:, i].replace( |
| 1790 | old_missingdouble, self.MISSING_VALUES["d"] |
| 1791 | ) |
| 1792 | |
| 1793 | if self._format_version <= 111: |
| 1794 | if fmt not in self.OLD_VALID_RANGE: |
| 1795 | continue |
| 1796 | |
| 1797 | fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE |
| 1798 | nmin, nmax = self.OLD_VALID_RANGE[fmt] |
| 1799 | else: |
| 1800 | if fmt not in self.VALID_RANGE: |
| 1801 | continue |
| 1802 | |
| 1803 | fmt = cast(str, fmt) # only strs in VALID_RANGE |
| 1804 | nmin, nmax = self.VALID_RANGE[fmt] |
| 1805 | series = data.iloc[:, i] |
| 1806 | |
| 1807 | # appreciably faster to do this with ndarray instead of Series |
| 1808 | svals = series._values |
| 1809 | missing = (svals < nmin) | (svals > nmax) |
| 1810 | |
| 1811 | if not missing.any(): |
| 1812 | continue |
| 1813 | |
| 1814 | if convert_missing: # Replacement follows Stata notation |
| 1815 | missing_loc = np.nonzero(np.asarray(missing))[0] |
| 1816 | umissing, umissing_loc = np.unique(series[missing], return_inverse=True) |
| 1817 | replacement = Series(series, dtype=object) |
| 1818 | for j, um in enumerate(umissing): |
| 1819 | if self._format_version <= 111: |
| 1820 | missing_value = StataMissingValue( |
| 1821 | float(self.MISSING_VALUES[fmt]) |
| 1822 | ) |
| 1823 | else: |
| 1824 | missing_value = StataMissingValue(um) |
| 1825 | |
| 1826 | loc = missing_loc[umissing_loc == j] |
| 1827 | replacement.iloc[loc] = missing_value |
| 1828 | else: # All replacements are identical |
| 1829 | dtype = series.dtype |
| 1830 | if dtype not in (np.float32, np.float64): |
| 1831 | dtype = np.float64 |
| 1832 | replacement = Series(series, dtype=dtype) |
| 1833 | # Note: operating on ._values is much faster than operating on the Series directly |
| 1834 | # TODO: can we fix that? |
| 1835 | replacement._values[missing] = np.nan |
| 1836 | replacements[i] = replacement |