Concatenate chunks of data read with low_memory=True. The tricky part is handling Categoricals, where different chunks may have different inferred categories.
(
chunks: list[dict[int, ArrayLike]], column_names: list[str]
)
| 334 | |
| 335 | |
| 336 | def _concatenate_chunks( |
| 337 | chunks: list[dict[int, ArrayLike]], column_names: list[str] |
| 338 | ) -> dict: |
| 339 | """ |
| 340 | Concatenate chunks of data read with low_memory=True. |
| 341 | |
| 342 | The tricky part is handling Categoricals, where different chunks |
| 343 | may have different inferred categories. |
| 344 | """ |
| 345 | names = list(chunks[0].keys()) |
| 346 | warning_columns = [] |
| 347 | |
| 348 | result: dict = {} |
| 349 | for name in names: |
| 350 | arrs = [chunk.pop(name) for chunk in chunks] |
| 351 | # Check each arr for consistent types. |
| 352 | dtypes = {a.dtype for a in arrs} |
| 353 | non_cat_dtypes = {x for x in dtypes if not isinstance(x, CategoricalDtype)} |
| 354 | |
| 355 | dtype = dtypes.pop() |
| 356 | if isinstance(dtype, CategoricalDtype): |
| 357 | result[name] = union_categoricals(arrs, sort_categories=False) |
| 358 | else: |
| 359 | result[name] = concat_compat(arrs) |
| 360 | if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object): |
| 361 | warning_columns.append(column_names[name]) |
| 362 | |
| 363 | if warning_columns: |
| 364 | warning_names = ", ".join( |
| 365 | [f"{index}: {name}" for index, name in enumerate(warning_columns)] |
| 366 | ) |
| 367 | warning_message = " ".join( |
| 368 | [ |
| 369 | f"Columns ({warning_names}) have mixed types. " |
| 370 | f"Specify dtype option on import or set low_memory=False." |
| 371 | ] |
| 372 | ) |
| 373 | warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level()) |
| 374 | return result |
| 375 | |
| 376 | |
| 377 | def ensure_dtype_objs( |