Extract raw HTML from text. The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text is stored in `cleandoc` as a list of strings.
| 118 | |
| 119 | |
| 120 | class HTMLExtractor(htmlparser.HTMLParser): |
| 121 | """ |
| 122 | Extract raw HTML from text. |
| 123 | |
| 124 | The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the |
| 125 | [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text |
| 126 | is stored in `cleandoc` as a list of strings. |
| 127 | """ |
| 128 | |
def __init__(self, md: Markdown, *args, **kwargs):
    """
    Create an extractor bound to the `Markdown` instance `md`.

    `convert_charrefs` is forced to `False` unless the caller supplies it
    explicitly. Every other positional and keyword argument is handed on
    unchanged to the parent parser's constructor.
    """
    # Only fill in the default; an explicit caller choice always wins.
    kwargs.setdefault('convert_charrefs', False)

    # Block tags that should contain no content (self closing)
    self.empty_tags = {'hr'}

    # Seed the per-line start-offset cache: the first line begins at index 0.
    self.lineno_start_cache = [0]

    # This calls self.reset
    super().__init__(*args, **kwargs)
    self.md = md
| 141 | |
def reset(self):
    """Restore the initial parser state, discarding any unprocessed data."""
    # Text collected so far, and the fragments that `close` joins and stashes.
    self.cleandoc: list[str] = []
    self._cache: list[str] = []

    # When `inraw==True`, stack contains a list of tags
    self.stack: list[str] = []
    self.inraw = self.intail = False

    # Start over with only the first line's offset known.
    self.lineno_start_cache = [0]

    super().reset()
| 152 | |
def close(self):
    """Finish parsing and flush whatever is still buffered."""
    super().close()

    leftover = self.rawdata
    if leftover:
        # Temp fix for https://bugs.python.org/issue41989
        # TODO: remove this when the bug is fixed in all supported Python versions.
        if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
            leftover = htmlparser.unescape(leftover)
        self.handle_data(leftover)

    # Handle any unclosed tags.
    pending = self._cache
    if pending:
        # Join the cached fragments, stash them, and record the stash result
        # in the cleaned document before emptying the cache.
        stored = self.md.htmlStash.store(''.join(pending))
        self.cleandoc.append(stored)
        self._cache = []
| 167 | |
| 168 | @property |
| 169 | def line_offset(self) -> int: |
| 170 | """Returns char index in `self.rawdata` for the start of the current line. """ |
| 171 | for ii in range(len(self.lineno_start_cache)-1, self.lineno-1): |
| 172 | last_line_start_pos = self.lineno_start_cache[ii] |
| 173 | lf_pos = self.rawdata.find('\n', last_line_start_pos) |
| 174 | if lf_pos == -1: |
| 175 | # No more newlines found. Use end of raw data as start of line beyond end. |
| 176 | lf_pos = len(self.rawdata) |
| 177 | self.lineno_start_cache.append(lf_pos+1) |
no outgoing calls
no test coverage detected
searching dependent graphs…