| 58 | |
| 59 | |
| 60 | class LxmlParserLinkExtractor: |
def __init__(
    self,
    tag: str | Callable[[str], bool] = "a",
    attr: str | Callable[[str], bool] = "href",
    process: Callable[[Any], Any] | None = None,
    unique: bool = False,
    strip: bool = True,
    canonicalized: bool = False,
):
    """Set up the predicates and options used when scanning a document.

    Args:
        tag: Either a tag name or a predicate. A predicate is stored as-is
            in ``scan_tag``; anything else becomes an equality test against
            the given value.
        attr: Either an attribute name or a predicate, handled the same way
            as ``tag`` and stored in ``scan_attr``.
        process: Callable stored in ``process_attr``. Any non-callable value
            (including ``None``) is replaced by ``_identity``.
        unique: Stored unchanged in ``self.unique``.
        strip: Stored unchanged in ``self.strip``.
        canonicalized: When true, ``link_key`` reads a link's ``url``
            attribute directly; otherwise ``link_key`` is
            ``_canonicalize_link_url``.
    """
    self.unique: bool = unique
    self.strip: bool = strip

    # mypy cannot infer types through operator.* or partial(), so the
    # equality-based predicates below are cast explicitly.
    tag_predicate: Callable[[str], bool]
    if callable(tag):
        tag_predicate = tag
    else:
        tag_predicate = cast("Callable[[str], bool]", partial(operator.eq, tag))
    self.scan_tag: Callable[[str], bool] = tag_predicate

    attr_predicate: Callable[[str], bool]
    if callable(attr):
        attr_predicate = attr
    else:
        attr_predicate = cast("Callable[[str], bool]", partial(operator.eq, attr))
    self.scan_attr: Callable[[str], bool] = attr_predicate

    value_processor: Callable[[Any], Any]
    if callable(process):
        value_processor = process
    else:
        value_processor = _identity
    self.process_attr: Callable[[Any], Any] = value_processor

    # NOTE(review): presumably `canonicalized` means the URLs are already in
    # canonical form, so the raw `url` can serve as the key -- confirm.
    key_getter: Callable[[Link], str]
    if canonicalized:
        key_getter = cast("Callable[[Link], str]", operator.attrgetter("url"))
    else:
        key_getter = _canonicalize_link_url
    self.link_key: Callable[[Link], str] = key_getter
| 91 | |
def _iter_links(
    self, document: HtmlElement
) -> Iterable[tuple[HtmlElement, str, str]]:
    """Yield ``(element, attribute name, attribute value)`` for every match.

    Walks the elements returned by ``document.iter(etree.Element)``. An
    element is considered only when ``self.scan_tag`` accepts its tag (after
    the tag has been passed through ``_nons``). For each such element, one
    tuple is produced per attribute whose name ``self.scan_attr`` accepts.
    """
    for node in document.iter(etree.Element):
        if self.scan_tag(_nons(node.tag)):
            node_attrs = node.attrib
            for attr_name in node_attrs:
                if self.scan_attr(attr_name):
                    yield node, attr_name, node_attrs[attr_name]
| 103 | |
| 104 | def _extract_links( |
| 105 | self, |
| 106 | selector: Selector, |
| 107 | response_url: str, |
| 108 | response_encoding: str, |
| 109 | base_url: str, |
| 110 | ) -> list[Link]: |
| 111 | links: list[Link] = [] |
| 112 | # hacky way to get the underlying lxml parsed document |
| 113 | for el, _, attr_val in self._iter_links(selector.root): |
| 114 | # pseudo lxml.html.HtmlElement.make_links_absolute(base_url) |
| 115 | try: |
| 116 | if self.strip: |
| 117 | attr_val = strip_html5_whitespace(attr_val) # noqa: PLW2901 this is intended |