hub / github.com/scrapy/scrapy / __init__

Method __init__

scrapy/linkextractors/lxmlhtml.py:167–208 · view source on GitHub ↗

(
        self,
        allow: _RegexOrSeveral = (),
        deny: _RegexOrSeveral = (),
        allow_domains: str | Iterable[str] = (),
        deny_domains: str | Iterable[str] = (),
        restrict_xpaths: str | Iterable[str] = (),
        tags: str | Iterable[str] = ("a", "area"),
        attrs: str | Iterable[str] = ("href",),
        canonicalize: bool = False,
        unique: bool = True,
        process_value: Callable[[Any], Any] | None = None,
        deny_extensions: str | Iterable[str] | None = None,
        restrict_css: str | Iterable[str] = (),
        strip: bool = True,
        restrict_text: _RegexOrSeveral | None = None,
    )

Source from the content-addressed store, hash-verified

165	_csstranslator = HTMLTranslator()
166
def __init__(
    self,
    allow: _RegexOrSeveral = (),
    deny: _RegexOrSeveral = (),
    allow_domains: str | Iterable[str] = (),
    deny_domains: str | Iterable[str] = (),
    restrict_xpaths: str | Iterable[str] = (),
    tags: str | Iterable[str] = ("a", "area"),
    attrs: str | Iterable[str] = ("href",),
    canonicalize: bool = False,
    unique: bool = True,
    process_value: Callable[[Any], Any] | None = None,
    deny_extensions: str | Iterable[str] | None = None,
    restrict_css: str | Iterable[str] = (),
    strip: bool = True,
    restrict_text: _RegexOrSeveral | None = None,
):
    """Set up the parser-level extractor and the filters stored on this instance.

    ``tags`` and ``attrs`` are each collected into a set and handed to
    ``LxmlParserLinkExtractor`` as membership predicates; ``unique``,
    ``process_value`` and ``strip`` are forwarded to it unchanged.

    ``allow``, ``deny`` and ``restrict_text`` are compiled through
    ``_compile_regexes``. ``allow_domains`` and ``deny_domains`` are stored
    as sets. ``restrict_css`` selectors are translated to XPath with the
    class-level ``_csstranslator`` and appended after ``restrict_xpaths``.

    When ``deny_extensions`` is ``None`` it falls back to
    ``IGNORED_EXTENSIONS``; every entry is stored with a leading ``"."``.
    """
    # Sets give the membership predicates below constant-time lookups.
    tag_names = set(arg_to_iter(tags))
    attr_names = set(arg_to_iter(attrs))

    # NOTE(review): the parser-level extractor receives the *negation* of
    # ``canonicalize`` -- presumably so canonicalization is not applied
    # twice; confirm against LxmlParserLinkExtractor.
    parser_extractor = LxmlParserLinkExtractor(
        tag=partial(operator.contains, tag_names),
        attr=partial(operator.contains, attr_names),
        unique=unique,
        process=process_value,
        strip=strip,
        canonicalized=not canonicalize,
    )
    self.link_extractor = parser_extractor

    # URL pattern filters.
    self.allow_res: list[re.Pattern[str]] = self._compile_regexes(allow)
    self.deny_res: list[re.Pattern[str]] = self._compile_regexes(deny)

    # Domain filters.
    self.allow_domains: set[str] = set(arg_to_iter(allow_domains))
    self.deny_domains: set[str] = set(arg_to_iter(deny_domains))

    # Explicit XPath restrictions first, then CSS selectors converted to XPath.
    self.restrict_xpaths: tuple[str, ...] = tuple(arg_to_iter(restrict_xpaths))
    css_as_xpaths = tuple(
        self._csstranslator.css_to_xpath(selector)
        for selector in arg_to_iter(restrict_css)
    )
    self.restrict_xpaths = self.restrict_xpaths + css_as_xpaths

    extensions = IGNORED_EXTENSIONS if deny_extensions is None else deny_extensions
    self.canonicalize: bool = canonicalize
    self.deny_extensions: set[str] = {"." + ext for ext in arg_to_iter(extensions)}
    self.restrict_text: list[re.Pattern[str]] = self._compile_regexes(restrict_text)
209
210	@staticmethod
211	def _compile_regexes(value: _RegexOrSeveral \| None) -> list[re.Pattern[str]]:

Callers

nothing calls this directly

Calls 3

_compile_regexesMethod · 0.95

arg_to_iterFunction · 0.90

LxmlParserLinkExtractorClass · 0.85

Tested by

no test coverage detected

Method __init__

Source from the content-addressed store, hash-verified

Callers

Calls 3

Tested by

Method __init__