hub / github.com/pandas-dev/pandas / _do_convert_missing

Method _do_convert_missing

pandas/io/stata.py:1779–1840 · view source on GitHub ↗

(self, data: DataFrame, convert_missing: bool)

Source from the content-addressed store, hash-verified

1777	return data
1778
1779	def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
1780	# missing code for double was different in version 105 and prior
1781	old_missingdouble = float.fromhex("0x1.0p333")
1782
1783	# Check for missing values, and replace if found
1784	replacements = {}
1785	for i in range(len(data.columns)):
1786	fmt = self._typlist[i]
1787	# recode instances of the old missing code to the currently used value
1788	if self._format_version <= 105 and fmt == "d":
1789	data.iloc[:, i] = data.iloc[:, i].replace(
1790	old_missingdouble, self.MISSING_VALUES["d"]
1791	)
1792
1793	if self._format_version <= 111:
1794	if fmt not in self.OLD_VALID_RANGE:
1795	continue
1796
1797	fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE
1798	nmin, nmax = self.OLD_VALID_RANGE[fmt]
1799	else:
1800	if fmt not in self.VALID_RANGE:
1801	continue
1802
1803	fmt = cast(str, fmt) # only strs in VALID_RANGE
1804	nmin, nmax = self.VALID_RANGE[fmt]
1805	series = data.iloc[:, i]
1806
1807	# appreciably faster to do this with ndarray instead of Series
1808	svals = series._values
1809	missing = (svals < nmin) \| (svals > nmax)
1810
1811	if not missing.any():
1812	continue
1813
1814	if convert_missing: # Replacement follows Stata notation
1815	missing_loc = np.nonzero(np.asarray(missing))[0]
1816	umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
1817	replacement = Series(series, dtype=object)
1818	for j, um in enumerate(umissing):
1819	if self._format_version <= 111:
1820	missing_value = StataMissingValue(
1821	float(self.MISSING_VALUES[fmt])
1822	)
1823	else:
1824	missing_value = StataMissingValue(um)
1825
1826	loc = missing_loc[umissing_loc == j]
1827	replacement.iloc[loc] = missing_value
1828	else: # All replacements are identical
1829	dtype = series.dtype
1830	if dtype not in (np.float32, np.float64):
1831	dtype = np.float64
1832	replacement = Series(series, dtype=dtype)
1833	# Note: operating on ._values is much faster than directly
1834	# TODO: can we fix that?
1835	replacement._values[missing] = np.nan
1836	replacements[i] = replacement

Callers 1

read · Method · 0.95

Calls 8

Series · Class · 0.90

StataMissingValue · Class · 0.85

nonzero · Method · 0.80

isetitem · Method · 0.80

replace · Method · 0.45

any · Method · 0.45

unique · Method · 0.45

items · Method · 0.45

Tested by

no test coverage detected