The delimiter /should/ occur the same number of times on each row. However, due to malformed data, it may not. We don't want an all or nothing approach, so we allow for small variations in this number. 1) build a table of the frequency of each character on every line.
(self, data, delimiters)
| 345 | |
| 346 | |
| 347 | def _guess_delimiter(self, data, delimiters): |
| 348 | """ |
| 349 | The delimiter /should/ occur the same number of times on |
| 350 | each row. However, due to malformed data, it may not. We don't want |
| 351 | an all or nothing approach, so we allow for small variations in this |
| 352 | number. |
| 353 | 1) build a table of the frequency of each character on every line. |
| 354 | 2) build a table of frequencies of this frequency (meta-frequency?), |
| 355 | e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows, |
| 356 | 7 times in 2 rows' |
| 357 | 3) use the mode of the meta-frequency to determine the /expected/ |
| 358 | frequency for that character |
| 359 | 4) find out how often the character actually meets that goal |
| 360 | 5) the character that best meets its goal is the delimiter |
| 361 | For performance reasons, the data is evaluated in chunks, so it can |
| 362 | try and evaluate the smallest portion of the data possible, evaluating |
| 363 | additional chunks as necessary. |
| 364 | """ |
| 365 | from collections import Counter, defaultdict |
| 366 | |
| 367 | data = list(filter(None, data.split('\n'))) |
| 368 | |
| 369 | # build frequency tables |
| 370 | chunkLength = min(10, len(data)) |
| 371 | iteration = 0 |
| 372 | num_lines = 0 |
| 373 | # {char -> {count_per_line -> num_lines_with_that_count}} |
| 374 | char_frequency = defaultdict(Counter) |
| 375 | modes = {} |
| 376 | delims = {} |
| 377 | start, end = 0, chunkLength |
| 378 | while start < len(data): |
| 379 | iteration += 1 |
| 380 | for line in data[start:end]: |
| 381 | num_lines += 1 |
| 382 | for char, count in Counter(line).items(): |
| 383 | if char.isascii(): |
| 384 | char_frequency[char][count] += 1 |
| 385 | |
| 386 | for char, counts in char_frequency.items(): |
| 387 | items = list(counts.items()) |
| 388 | missed_lines = num_lines - sum(counts.values()) |
| 389 | if missed_lines: |
| 390 | # Store the number of lines 'char' was missing from. |
| 391 | items.append((0, missed_lines)) |
| 392 | if len(items) == 1 and items[0][0] == 0: |
| 393 | continue |
| 394 | # get the mode of the frequencies |
| 395 | if len(items) > 1: |
| 396 | modes[char] = max(items, key=lambda x: x[1]) |
| 397 | # adjust the mode - subtract the sum of all |
| 398 | # other frequencies |
| 399 | items.remove(modes[char]) |
| 400 | modes[char] = (modes[char][0], modes[char][1] |
| 401 | - sum(item[1] for item in items)) |
| 402 | else: |
| 403 | modes[char] = items[0] |
| 404 |