hub / github.com/scrapy/scrapy / LxmlParserLinkExtractor

Class LxmlParserLinkExtractor

scrapy/linkextractors/lxmlhtml.py:60–157 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

58
59
60	class LxmlParserLinkExtractor:
def __init__(
    self,
    tag: str | Callable[[str], bool] = "a",
    attr: str | Callable[[str], bool] = "href",
    process: Callable[[Any], Any] | None = None,
    unique: bool = False,
    strip: bool = True,
    canonicalized: bool = False,
):
    """Set up the tag/attribute matchers and the link-key function.

    ``tag`` and ``attr`` are each either a predicate, stored as-is, or a
    literal value, wrapped so that the stored matcher tests a candidate
    for equality with it.  ``process`` is stored as ``process_attr`` and
    replaced by ``_identity`` when it is not callable.  ``unique`` and
    ``strip`` are stored unchanged.  ``canonicalized`` selects
    ``link_key``: the link's ``url`` attribute when true, otherwise
    ``_canonicalize_link_url``.
    """
    # operator.* and partial() are opaque to mypy, so the equality
    # matchers are cast to the predicate type explicitly.
    if callable(tag):
        tag_matcher = tag
    else:
        tag_matcher = cast("Callable[[str], bool]", partial(operator.eq, tag))
    self.scan_tag: Callable[[str], bool] = tag_matcher

    if callable(attr):
        attr_matcher = attr
    else:
        attr_matcher = cast("Callable[[str], bool]", partial(operator.eq, attr))
    self.scan_attr: Callable[[str], bool] = attr_matcher

    if callable(process):
        value_hook = process
    else:
        value_hook = _identity
    self.process_attr: Callable[[Any], Any] = value_hook

    self.unique: bool = unique
    self.strip: bool = strip

    if canonicalized:
        # URLs are declared canonical already: key on the URL itself.
        key_func = cast("Callable[[Link], str]", operator.attrgetter("url"))
    else:
        key_func = _canonicalize_link_url
    self.link_key: Callable[[Link], str] = key_func
91
def _iter_links(
    self, document: HtmlElement
) -> Iterable[tuple[HtmlElement, str, str]]:
    """Yield ``(element, attribute name, attribute value)`` for each match.

    Walks ``document.iter(etree.Element)``.  An element is considered
    when ``self.scan_tag`` accepts its tag after it has been passed
    through ``_nons`` (presumably namespace stripping -- confirm against
    that helper).  For such an element, one triple is yielded per
    attribute name accepted by ``self.scan_attr``.
    """
    for node in document.iter(etree.Element):
        if self.scan_tag(_nons(node.tag)):
            attributes = node.attrib
            for name in attributes:
                if self.scan_attr(name):
                    # Value is looked up at yield time, not snapshotted.
                    yield node, name, attributes[name]
103
104	def _extract_links(
105	self,
106	selector: Selector,
107	response_url: str,
108	response_encoding: str,
109	base_url: str,
110	) -> list[Link]:
111	links: list[Link] = []
112	# hacky way to get the underlying lxml parsed document
113	for el, _, attr_val in self._iter_links(selector.root):
114	# pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
115	try:
116	if self.strip:
117	attr_val = strip_html5_whitespace(attr_val) # noqa: PLW2901 this is intended

Callers 1

__init__Method · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected