This function reads a file saved in .csv, .tsv, .xlsx, .txt, .json or .jsonl format using pandas. For .xlsx files, only the first sheet is read. For .txt files, the content is assumed to consist of completions and is split on newlines, one completion per line.
(
fname: str, fields: list[str] = ["prompt", "completion"]
)
| 449 | |
| 450 | |
| 451 | def read_any_format( |
| 452 | fname: str, fields: list[str] = ["prompt", "completion"] |
| 453 | ) -> tuple[pd.DataFrame | None, Remediation]: |
| 454 | """ |
| 455 | This function will read a file saved in .csv, .json, .txt, .xlsx or .tsv format using pandas. |
| 456 | - for .xlsx it will read the first sheet |
| 457 | - for .txt it will assume completions and split on newline |
| 458 | """ |
| 459 | remediation = None |
| 460 | necessary_msg = None |
| 461 | immediate_msg = None |
| 462 | error_msg = None |
| 463 | df = None |
| 464 | |
| 465 | if os.path.isfile(fname): |
| 466 | try: |
| 467 | if fname.lower().endswith(".csv") or fname.lower().endswith(".tsv"): |
| 468 | file_extension_str, separator = ("CSV", ",") if fname.lower().endswith(".csv") else ("TSV", "\t") |
| 469 | immediate_msg = ( |
| 470 | f"\n- Based on your file extension, your file is formatted as a {file_extension_str} file" |
| 471 | ) |
| 472 | necessary_msg = f"Your format `{file_extension_str}` will be converted to `JSONL`" |
| 473 | df = pd.read_csv(fname, sep=separator, dtype=str).fillna("") |
| 474 | elif fname.lower().endswith(".xlsx"): |
| 475 | immediate_msg = "\n- Based on your file extension, your file is formatted as an Excel file" |
| 476 | necessary_msg = "Your format `XLSX` will be converted to `JSONL`" |
| 477 | xls = pd.ExcelFile(fname) |
| 478 | sheets = xls.sheet_names |
| 479 | if len(sheets) > 1: |
| 480 | immediate_msg += "\n- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet..." |
| 481 | df = pd.read_excel(fname, dtype=str).fillna("") |
| 482 | elif fname.lower().endswith(".txt"): |
| 483 | immediate_msg = "\n- Based on your file extension, you provided a text file" |
| 484 | necessary_msg = "Your format `TXT` will be converted to `JSONL`" |
| 485 | with open(fname, "r") as f: |
| 486 | content = f.read() |
| 487 | df = pd.DataFrame( |
| 488 | [["", line] for line in content.split("\n")], |
| 489 | columns=fields, |
| 490 | dtype=str, |
| 491 | ).fillna("") |
| 492 | elif fname.lower().endswith(".jsonl"): |
| 493 | df = pd.read_json(fname, lines=True, dtype=str).fillna("") # type: ignore |
| 494 | if len(df) == 1: # type: ignore |
| 495 | # this is NOT what we expect for a .jsonl file |
| 496 | immediate_msg = "\n- Your JSONL file appears to be in a JSON format. Your file will be converted to JSONL format" |
| 497 | necessary_msg = "Your format `JSON` will be converted to `JSONL`" |
| 498 | df = pd.read_json(fname, dtype=str).fillna("") # type: ignore |
| 499 | else: |
| 500 | pass # this is what we expect for a .jsonl file |
| 501 | elif fname.lower().endswith(".json"): |
| 502 | try: |
| 503 | # to handle case where .json file is actually a .jsonl file |
| 504 | df = pd.read_json(fname, lines=True, dtype=str).fillna("") # type: ignore |
| 505 | if len(df) == 1: # type: ignore |
| 506 | # this code path corresponds to a .json file that has one line |
| 507 | df = pd.read_json(fname, dtype=str).fillna("") # type: ignore |
| 508 | else: |
nothing calls this directly
no test coverage detected