(
self,
allow: _RegexOrSeveral = (),
deny: _RegexOrSeveral = (),
allow_domains: str | Iterable[str] = (),
deny_domains: str | Iterable[str] = (),
restrict_xpaths: str | Iterable[str] = (),
tags: str | Iterable[str] = ("a", "area"),
attrs: str | Iterable[str] = ("href",),
canonicalize: bool = False,
unique: bool = True,
process_value: Callable[[Any], Any] | None = None,
deny_extensions: str | Iterable[str] | None = None,
restrict_css: str | Iterable[str] = (),
strip: bool = True,
restrict_text: _RegexOrSeveral | None = None,
)
| 165 | _csstranslator = HTMLTranslator() |
| 166 | |
def __init__(
    self,
    allow: _RegexOrSeveral = (),
    deny: _RegexOrSeveral = (),
    allow_domains: str | Iterable[str] = (),
    deny_domains: str | Iterable[str] = (),
    restrict_xpaths: str | Iterable[str] = (),
    tags: str | Iterable[str] = ("a", "area"),
    attrs: str | Iterable[str] = ("href",),
    canonicalize: bool = False,
    unique: bool = True,
    process_value: Callable[[Any], Any] | None = None,
    deny_extensions: str | Iterable[str] | None = None,
    restrict_css: str | Iterable[str] = (),
    strip: bool = True,
    restrict_text: _RegexOrSeveral | None = None,
):
    """Set up the parser-level extractor and store the link filters.

    :param allow: pattern(s) compiled into ``self.allow_res``.
    :param deny: pattern(s) compiled into ``self.deny_res``.
    :param allow_domains: one domain or several, stored as a set.
    :param deny_domains: one domain or several, stored as a set.
    :param restrict_xpaths: XPath expression(s), stored as a tuple.
    :param tags: tag name(s); turned into a membership predicate that is
        handed to ``LxmlParserLinkExtractor`` as ``tag``.
    :param attrs: attribute name(s); turned into a membership predicate
        that is handed to ``LxmlParserLinkExtractor`` as ``attr``.
    :param canonicalize: stored on the instance; its negation is passed
        to ``LxmlParserLinkExtractor`` as ``canonicalized``.
    :param unique: forwarded to ``LxmlParserLinkExtractor``.
    :param process_value: forwarded to ``LxmlParserLinkExtractor`` as
        ``process``.
    :param deny_extensions: extension(s) without the leading dot;
        ``None`` selects ``IGNORED_EXTENSIONS``.
    :param restrict_css: CSS selector(s); each one is translated to
        XPath and appended after ``restrict_xpaths``.
    :param strip: forwarded to ``LxmlParserLinkExtractor``.
    :param restrict_text: pattern(s) compiled into ``self.restrict_text``.
    """
    tag_names = set(arg_to_iter(tags))
    attr_names = set(arg_to_iter(attrs))

    # NOTE(review): the parser-level extractor receives the *negation* of
    # ``canonicalize`` -- presumably "links are already canonical"; confirm
    # against LxmlParserLinkExtractor before changing.
    parser_kwargs = {
        "tag": partial(operator.contains, tag_names),
        "attr": partial(operator.contains, attr_names),
        "unique": unique,
        "process": process_value,
        "strip": strip,
        "canonicalized": not canonicalize,
    }
    self.link_extractor = LxmlParserLinkExtractor(**parser_kwargs)

    # URL pattern filters.
    self.allow_res: list[re.Pattern[str]] = self._compile_regexes(allow)
    self.deny_res: list[re.Pattern[str]] = self._compile_regexes(deny)

    # Domain filters.
    self.allow_domains: set[str] = set(arg_to_iter(allow_domains))
    self.deny_domains: set[str] = set(arg_to_iter(deny_domains))

    # Explicit XPaths come first, followed by the CSS selectors translated
    # to XPath, so a single tuple describes every restricted region.
    explicit_xpaths = tuple(arg_to_iter(restrict_xpaths))
    translated_css = tuple(
        self._csstranslator.css_to_xpath(selector)
        for selector in arg_to_iter(restrict_css)
    )
    self.restrict_xpaths: tuple[str, ...] = explicit_xpaths + translated_css

    extensions = IGNORED_EXTENSIONS if deny_extensions is None else deny_extensions
    self.canonicalize: bool = canonicalize
    # Extensions are kept with their leading dot.
    self.deny_extensions: set[str] = {"." + ext for ext in arg_to_iter(extensions)}
    self.restrict_text: list[re.Pattern[str]] = self._compile_regexes(restrict_text)
| 210 | @staticmethod |
| 211 | def _compile_regexes(value: _RegexOrSeveral | None) -> list[re.Pattern[str]]: |
nothing calls this directly
no test coverage detected