Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass Loose reference https://en.wikipedia.org/wiki/Trie
| 43 | |
| 44 | |
| 45 | class Trie: |
| 46 | """ |
| 47 | Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass |
| 48 | Loose reference https://en.wikipedia.org/wiki/Trie |
| 49 | """ |
| 50 | |
| 51 | def __init__(self, *args): |
| 52 | self.data = {} |
| 53 | self._tokens = set() |
| 54 | self._termination_char = "" |
| 55 | self.update(*args) |
| 56 | |
| 57 | def update(self, *args): |
| 58 | """ |
| 59 | Updates the Trie with new tokens provided as arguments. |
| 60 | |
| 61 | Args: |
| 62 | *args: Variable number of words to be added to the Trie. |
| 63 | """ |
| 64 | for token in tuple(*args): |
| 65 | self.add(token) |
| 66 | |
| 67 | def add(self, word: str): |
| 68 | """ |
| 69 | Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. |
| 70 | The special key `""` in `self._termination_char` is used to represent termination. |
| 71 | |
| 72 | This function is idempotent, adding twice the same word will leave the trie unchanged |
| 73 | |
| 74 | Example: |
| 75 | |
| 76 | ```python |
| 77 | >>> trie = Trie() |
| 78 | >>> trie.add("Hello 友達") |
| 79 | >>> trie.data |
| 80 | {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} |
| 81 | |
| 82 | >>> trie.add("Hello") |
| 83 | >>> trie.data |
| 84 | {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} |
| 85 | ``` |
| 86 | """ |
| 87 | if not word: |
| 88 | # Prevent empty string |
| 89 | return |
| 90 | |
| 91 | self._tokens.add(word) |
| 92 | ref = self.data |
| 93 | for char in word: |
| 94 | ref[char] = ref.setdefault(char, {}) |
| 95 | ref = ref[char] |
| 96 | ref[self._termination_char] = 1 |
| 97 | |
| 98 | def split(self, text: str) -> list[str]: |
| 99 | """ |
| 100 | Will look for the words added to the trie within `text`. Output is the original string split along the |
| 101 | boundaries of the words found. |
| 102 |
no outgoing calls