MCPcopy
hub / github.com/scrapy/scrapy / xmliter

Function xmliter

scrapy/utils/iterators.py:23–78  ·  view source on GitHub ↗

Return an iterator of Selectors over all nodes of an XML document, given the name of the node to iterate. Useful for parsing XML feeds. obj can be: - a Response object - a unicode string - a string encoded as utf-8

(obj: Response | str | bytes, nodename: str)

Source from the content-addressed store, hash-verified

21
22
def xmliter(obj: Response | str | bytes, nodename: str) -> Iterator[Selector]:
    """Return an iterator of Selectors over all nodes of an XML document,
    given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    Each yielded Selector wraps one ``<nodename>...</nodename>`` fragment,
    prefixed with the document's ``<?xml ...?>`` header (if any), decorated
    with the ``xmlns`` declarations found on the enclosing elements, and
    followed by whatever text trails the last ``</nodename>`` in the
    document (the closing tags of the enclosing elements).

    This function is deprecated: it emits a ScrapyDeprecationWarning because
    its regex-based scanning is vulnerable to ReDoS attacks. Use
    ``xmliter_lxml`` instead.
    """
    warn(
        (
            "xmliter is deprecated and its use strongly discouraged because "
            "it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
            "https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
        ),
        ScrapyDeprecationWarning,
        stacklevel=2,
    )

    nodename_patt = re.escape(nodename)

    DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.DOTALL)
    HEADER_END_RE = re.compile(rf"<\s*/{nodename_patt}\s*>", re.DOTALL)
    END_TAG_RE = re.compile(r"<\s*/([^\s>]+)\s*>", re.DOTALL)
    NAMESPACE_RE = re.compile(r"((xmlns[:A-Za-z]*)=[^>\s]+)", re.DOTALL)
    text = _body_or_str(obj)

    document_header_match = re.search(DOCUMENT_HEADER_RE, text)
    document_header = (
        document_header_match.group().strip() if document_header_match else ""
    )
    # Everything after the last closing </nodename> tag: the closing tags of
    # the elements that enclose the iterated nodes.
    header_end_idx = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end_idx[1] :].strip() if header_end_idx else ""

    # Collect the xmlns declarations of every enclosing element so they can
    # be re-attached to each extracted node.
    namespaces: dict[str, str] = {}
    if header_end_idx and header_end:
        # The opening tags of the enclosing elements can only appear before
        # the end of the last iterated node; slice once, not per tag.
        head = text[: header_end_idx[1]]
        for tagname in reversed(re.findall(END_TAG_RE, header_end)):
            # tagname comes straight from the document, so it must be escaped
            # before being embedded in a pattern: otherwise a name such as
            # "a.b" matches the wrong tag and one such as "a(b" raises
            # re.error.
            tag = re.search(
                rf"<\s*{re.escape(tagname)}.*?xmlns[:=][^>]*>",
                head,
                re.DOTALL,
            )
            if tag:
                for x in re.findall(NAMESPACE_RE, tag.group()):
                    # x is (full 'xmlns[:prefix]=value' text, 'xmlns[:prefix]').
                    namespaces[x[1]] = x[0]

    # Loop-invariant: the opening tag name followed by the inherited
    # namespace declarations.
    decorated_nodename = f"{nodename} {' '.join(namespaces.values())}"

    r = re.compile(rf"<{nodename_patt}[\s>].*?</{nodename_patt}>", re.DOTALL)
    for match in r.finditer(text):
        nodetext = (
            document_header
            # Only the first occurrence (the opening tag) is decorated.
            + match.group().replace(nodename, decorated_nodename, 1)
            + header_end
        )
        yield Selector(text=nodetext, type="xml")
79
80

Callers 1

xmliterMethod · 0.90

Calls 6

re_rsearchFunction · 0.90
SelectorClass · 0.90
_body_or_strFunction · 0.85
valuesMethod · 0.80
replaceMethod · 0.45
joinMethod · 0.45

Tested by 1

xmliterMethod · 0.72