Return an iterator of Selectors over all nodes of an XML document, given the name of the node to iterate. Useful for parsing XML feeds. obj can be: - a Response object - a unicode string - a string encoded as utf-8
(obj: Response | str | bytes, nodename: str)
| 21 | |
| 22 | |
def xmliter(obj: Response | str | bytes, nodename: str) -> Iterator[Selector]:
    """Return an iterator of Selectors over all nodes of an XML document,
    given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    Each yielded Selector is built from the document's XML declaration (if
    any), the text of one matched node with the collected ``xmlns``
    declarations injected into its opening tag, and the document text that
    follows the located closing ``nodename`` tag.

    Deprecated: a ScrapyDeprecationWarning is emitted once iteration starts
    (this is a generator function, so nothing runs before the first item is
    requested).
    """
    warn(
        (
            "xmliter is deprecated and its use strongly discouraged because "
            "it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
            "https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
        ),
        ScrapyDeprecationWarning,
        stacklevel=2,
    )

    # nodename is caller-supplied text, so it is escaped before being embedded
    # in any pattern.
    nodename_patt = re.escape(nodename)

    DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.DOTALL)
    HEADER_END_RE = re.compile(rf"<\s*/{nodename_patt}\s*>", re.DOTALL)
    END_TAG_RE = re.compile(r"<\s*/([^\s>]+)\s*>", re.DOTALL)
    NAMESPACE_RE = re.compile(r"((xmlns[:A-Za-z]*)=[^>\s]+)", re.DOTALL)
    # NOTE(review): _body_or_str is defined elsewhere; presumably it
    # normalises every accepted input type to str -- confirm.
    text = _body_or_str(obj)

    document_header_match = DOCUMENT_HEADER_RE.search(text)
    document_header = (
        document_header_match.group().strip() if document_header_match else ""
    )

    # NOTE(review): re_rsearch is defined elsewhere; presumably it returns the
    # (start, end) span of the last closing ``nodename`` tag, or a falsy value
    # when there is none -- confirm.
    header_end_idx = re_rsearch(HEADER_END_RE, text)
    if header_end_idx:
        # Everything up to and including the located closing tag; ancestor
        # opening tags are looked up in here.
        preceding_text = text[: header_end_idx[1]]
        # Everything after it, i.e. the closing tags of the enclosing elements.
        header_end = text[header_end_idx[1] :].strip()
    else:
        preceding_text = ""
        header_end = ""

    # Maps an xmlns attribute name to its full ``name=value`` declaration.
    namespaces: dict[str, str] = {}
    if header_end:
        for tagname in reversed(END_TAG_RE.findall(header_end)):
            # tagname comes straight from the document and may contain regex
            # metacharacters ("." is legal in XML names), so it must be
            # escaped before being used as a pattern.
            opening_tag = re.search(
                rf"<\s*{re.escape(tagname)}.*?xmlns[:=][^>]*>",
                preceding_text,
                re.DOTALL,
            )
            if opening_tag:
                for declaration, attr_name in NAMESPACE_RE.findall(
                    opening_tag.group()
                ):
                    namespaces[attr_name] = declaration

    # The replacement for the node's opening name is the same for every match,
    # so build it once.
    nodename_with_ns = f"{nodename} {' '.join(namespaces.values())}"

    r = re.compile(rf"<{nodename_patt}[\s>].*?</{nodename_patt}>", re.DOTALL)
    for match in r.finditer(text):
        # Only the first occurrence (the opening tag name) is replaced.
        nodetext = (
            document_header
            + match.group().replace(nodename, nodename_with_ns, 1)
            + header_end
        )
        yield Selector(text=nodetext, type="xml")
| 79 | |
| 80 |