Take a string that contains HTML and turn it into a Python object structure that can be easily compared with other HTML for semantic equivalence. Purely syntactic differences, such as which quotation marks are used around attribute values, are ignored.
(html)
| 256 | |
| 257 | |
def parse_html(html):
    """
    Parse a string of HTML into a Python object structure suited to
    checking two pieces of HTML for semantic equivalence.

    Purely syntactic differences, such as which quotation marks are used,
    are ignored.

    The markup is fed to a ``Parser``, the parser is closed, and the
    resulting root element is finalized. If that root has exactly one child
    and the child is not a plain string, the child is returned instead of
    the root wrapper; otherwise the root itself is returned.
    """
    html_parser = Parser()
    html_parser.feed(html)
    html_parser.close()

    root = html_parser.root
    root.finalize()

    # Keep the ROOT wrapper unless it holds exactly one non-text child;
    # in that single-element case the wrapper adds nothing.
    if len(root.children) != 1 or isinstance(root.children[0], str):
        return root
    return root.children[0]