MCPcopy
hub / github.com/scrapy/scrapy / LxmlParserLinkExtractor

Class LxmlParserLinkExtractor

scrapy/linkextractors/lxmlhtml.py:60–157  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

58
59
60class LxmlParserLinkExtractor:
def __init__(
    self,
    tag: str | Callable[[str], bool] = "a",
    attr: str | Callable[[str], bool] = "href",
    process: Callable[[Any], Any] | None = None,
    unique: bool = False,
    strip: bool = True,
    canonicalized: bool = False,
):
    """Set up the tag/attribute predicates and the link de-duplication key.

    Args:
        tag: Either a predicate taking a tag name, or a literal tag name
            that is turned into an equality predicate.
        attr: Either a predicate taking an attribute name, or a literal
            attribute name that is turned into an equality predicate.
        process: Callable stored as ``process_attr``; when it is not
            callable (e.g. ``None``), ``_identity`` is stored instead.
        unique: Stored unchanged on the instance as ``unique``.
        strip: Stored unchanged on the instance as ``strip``.
        canonicalized: When true, a link's ``url`` attribute is used
            directly as its key; otherwise ``_canonicalize_link_url``
            computes the key.
    """
    # mypy doesn't infer types for operator.* and also for partial()
    tag_predicate: Callable[[str], bool]
    if callable(tag):
        tag_predicate = tag
    else:
        # A literal name matches by plain equality.
        tag_predicate = cast("Callable[[str], bool]", partial(operator.eq, tag))
    self.scan_tag: Callable[[str], bool] = tag_predicate

    attr_predicate: Callable[[str], bool]
    if callable(attr):
        attr_predicate = attr
    else:
        attr_predicate = cast("Callable[[str], bool]", partial(operator.eq, attr))
    self.scan_attr: Callable[[str], bool] = attr_predicate

    value_processor: Callable[[Any], Any]
    if callable(process):
        value_processor = process
    else:
        value_processor = _identity
    self.process_attr: Callable[[Any], Any] = value_processor

    self.unique: bool = unique
    self.strip: bool = strip

    key_func: Callable[[Link], str]
    if canonicalized:
        # URLs are declared canonical already, so the raw URL is the key.
        key_func = cast("Callable[[Link], str]", operator.attrgetter("url"))
    else:
        key_func = _canonicalize_link_url
    self.link_key: Callable[[Link], str] = key_func
91
def _iter_links(
    self, document: HtmlElement
) -> Iterable[tuple[HtmlElement, str, str]]:
    """Yield ``(element, attribute name, attribute value)`` candidates.

    Walks the elements returned by ``document.iter(etree.Element)``. An
    element is considered when ``self.scan_tag`` accepts its tag name
    (after passing it through ``_nons``); for such an element, one tuple
    is yielded per attribute whose name ``self.scan_attr`` accepts.
    """
    for node in document.iter(etree.Element):
        if self.scan_tag(_nons(node.tag)):
            node_attrs = node.attrib
            for attr_name in node_attrs:
                if self.scan_attr(attr_name):
                    yield node, attr_name, node_attrs[attr_name]
103
104 def _extract_links(
105 self,
106 selector: Selector,
107 response_url: str,
108 response_encoding: str,
109 base_url: str,
110 ) -> list[Link]:
111 links: list[Link] = []
112 # hacky way to get the underlying lxml parsed document
113 for el, _, attr_val in self._iter_links(selector.root):
114 # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
115 try:
116 if self.strip:
117 attr_val = strip_html5_whitespace(attr_val) # noqa: PLW2901 this is intended

Callers 1

__init__ · Method · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected