MCPcopy
hub / github.com/pandas-dev/pandas / _do_convert_missing

Method _do_convert_missing

pandas/io/stata.py:1779–1840  ·  view source on GitHub ↗
(self, data: DataFrame, convert_missing: bool)

Source from the content-addressed store, hash-verified

1777 return data
1778
1779 def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
1780 # missing code for double was different in version 105 and prior
1781 old_missingdouble = float.fromhex("0x1.0p333")
1782
1783 # Check for missing values, and replace if found
1784 replacements = {}
1785 for i in range(len(data.columns)):
1786 fmt = self._typlist[i]
1787 # recode instances of the old missing code to the currently used value
1788 if self._format_version <= 105 and fmt == "d":
1789 data.iloc[:, i] = data.iloc[:, i].replace(
1790 old_missingdouble, self.MISSING_VALUES["d"]
1791 )
1792
1793 if self._format_version <= 111:
1794 if fmt not in self.OLD_VALID_RANGE:
1795 continue
1796
1797 fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE
1798 nmin, nmax = self.OLD_VALID_RANGE[fmt]
1799 else:
1800 if fmt not in self.VALID_RANGE:
1801 continue
1802
1803 fmt = cast(str, fmt) # only strs in VALID_RANGE
1804 nmin, nmax = self.VALID_RANGE[fmt]
1805 series = data.iloc[:, i]
1806
1807 # appreciably faster to do this with ndarray instead of Series
1808 svals = series._values
1809 missing = (svals < nmin) | (svals > nmax)
1810
1811 if not missing.any():
1812 continue
1813
1814 if convert_missing: # Replacement follows Stata notation
1815 missing_loc = np.nonzero(np.asarray(missing))[0]
1816 umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
1817 replacement = Series(series, dtype=object)
1818 for j, um in enumerate(umissing):
1819 if self._format_version <= 111:
1820 missing_value = StataMissingValue(
1821 float(self.MISSING_VALUES[fmt])
1822 )
1823 else:
1824 missing_value = StataMissingValue(um)
1825
1826 loc = missing_loc[umissing_loc == j]
1827 replacement.iloc[loc] = missing_value
1828 else: # All replacements are identical
1829 dtype = series.dtype
1830 if dtype not in (np.float32, np.float64):
1831 dtype = np.float64
1832 replacement = Series(series, dtype=dtype)
1833 # Note: operating on ._values is much faster than directly
1834 # TODO: can we fix that?
1835 replacement._values[missing] = np.nan
1836 replacements[i] = replacement

Callers 1

read (Method) · 0.95

Calls 8

Series (Class) · 0.90
StataMissingValue (Class) · 0.85
nonzero (Method) · 0.80
isetitem (Method) · 0.80
replace (Method) · 0.45
any (Method) · 0.45
unique (Method) · 0.45
items (Method) · 0.45

Tested by

no test coverage detected