MCPcopy
hub / github.com/scrapy/scrapy / __init__

Method __init__

scrapy/linkextractors/lxmlhtml.py:167–208  ·  view source on GitHub ↗
(
        self,
        allow: _RegexOrSeveral = (),
        deny: _RegexOrSeveral = (),
        allow_domains: str | Iterable[str] = (),
        deny_domains: str | Iterable[str] = (),
        restrict_xpaths: str | Iterable[str] = (),
        tags: str | Iterable[str] = ("a", "area"),
        attrs: str | Iterable[str] = ("href",),
        canonicalize: bool = False,
        unique: bool = True,
        process_value: Callable[[Any], Any] | None = None,
        deny_extensions: str | Iterable[str] | None = None,
        restrict_css: str | Iterable[str] = (),
        strip: bool = True,
        restrict_text: _RegexOrSeveral | None = None,
    )

Source from the content-addressed store, hash-verified

# Class-level translator; __init__ uses its css_to_xpath method to turn each
# restrict_css selector into an XPath expression.
_csstranslator = HTMLTranslator()
166
def __init__(
    self,
    allow: _RegexOrSeveral = (),
    deny: _RegexOrSeveral = (),
    allow_domains: str | Iterable[str] = (),
    deny_domains: str | Iterable[str] = (),
    restrict_xpaths: str | Iterable[str] = (),
    tags: str | Iterable[str] = ("a", "area"),
    attrs: str | Iterable[str] = ("href",),
    canonicalize: bool = False,
    unique: bool = True,
    process_value: Callable[[Any], Any] | None = None,
    deny_extensions: str | Iterable[str] | None = None,
    restrict_css: str | Iterable[str] = (),
    strip: bool = True,
    restrict_text: _RegexOrSeveral | None = None,
):
    """Set up the underlying parser extractor and store the link filters.

    * ``tags`` / ``attrs`` are collected into sets; membership tests on
      those sets are handed to ``LxmlParserLinkExtractor`` as its ``tag``
      and ``attr`` predicates, together with ``unique``, ``process_value``
      (as ``process``) and ``strip``.
    * ``allow``, ``deny`` and ``restrict_text`` are passed through
      ``self._compile_regexes`` and stored as ``allow_res``, ``deny_res``
      and ``restrict_text``.
    * ``allow_domains`` / ``deny_domains`` are stored as sets.
    * ``restrict_css`` selectors are translated to XPath and appended
      after the entries of ``restrict_xpaths``.
    * ``deny_extensions`` falls back to ``IGNORED_EXTENSIONS`` when it is
      ``None``; every entry is stored with a leading ``"."``.
    """
    tag_names = set(arg_to_iter(tags))
    attr_names = set(arg_to_iter(attrs))

    # The parser extractor receives the *negation* of ``canonicalize`` as
    # its ``canonicalized`` flag; the raw value is kept on ``self`` below.
    self.link_extractor = LxmlParserLinkExtractor(
        tag=partial(operator.contains, tag_names),
        attr=partial(operator.contains, attr_names),
        unique=unique,
        process=process_value,
        strip=strip,
        canonicalized=not canonicalize,
    )

    # URL pattern filters.
    self.allow_res: list[re.Pattern[str]] = self._compile_regexes(allow)
    self.deny_res: list[re.Pattern[str]] = self._compile_regexes(deny)

    # Domain filters.
    self.allow_domains: set[str] = {*arg_to_iter(allow_domains)}
    self.deny_domains: set[str] = {*arg_to_iter(deny_domains)}

    # Explicit XPath restrictions first, then the CSS selectors converted
    # to XPath, all in a single tuple.
    self.restrict_xpaths: tuple[str, ...] = (
        *arg_to_iter(restrict_xpaths),
        *(
            self._csstranslator.css_to_xpath(selector)
            for selector in arg_to_iter(restrict_css)
        ),
    )

    self.canonicalize: bool = canonicalize

    # ``None`` means "use the module default list of ignored extensions".
    extensions = IGNORED_EXTENSIONS if deny_extensions is None else deny_extensions
    self.deny_extensions: set[str] = {"." + ext for ext in arg_to_iter(extensions)}

    self.restrict_text: list[re.Pattern[str]] = self._compile_regexes(restrict_text)
209
210 @staticmethod
211 def _compile_regexes(value: _RegexOrSeveral | None) -> list[re.Pattern[str]]:

Callers

nothing calls this directly

Calls 3

_compile_regexes — Method · 0.95
arg_to_iter — Function · 0.90

Tested by

no test coverage detected