MCPcopy
hub / github.com/scrapy/scrapy / _link_allowed

Method _link_allowed

scrapy/linkextractors/lxmlhtml.py:217–235  ·  view source on GitHub ↗
(self, link: Link)

Source from the content-addressed store, hash-verified

215 ]
216
def _link_allowed(self, link: Link) -> bool:
    """Return whether *link* passes every filter configured on this extractor.

    Filters are applied in a fixed order and evaluation stops at the first
    one that rejects the link:

    1. ``_is_valid_url(link.url)`` must be truthy.
    2. If ``allow_res`` is set, the URL must match it; if ``deny_res`` is
       set, the URL must not match it.
    3. The URL is parsed once with ``urlparse`` and the parsed result is
       checked against ``allow_domains``, ``deny_domains`` and
       ``deny_extensions`` (each only when non-empty).
    4. If ``restrict_text`` is set, ``link.text`` must match it.

    An empty/falsy filter attribute disables that particular check.
    """
    url = link.url
    if not _is_valid_url(url):
        return False

    # Regex-based filters run on the raw URL string, before any parsing.
    rejected_by_pattern = (
        (self.allow_res and not _matches(url, self.allow_res))
        or (self.deny_res and _matches(url, self.deny_res))
    )
    if rejected_by_pattern:
        return False

    # Parse once and share the result between the domain/extension filters.
    url_parts = urlparse(url)
    rejected_by_location = (
        (
            self.allow_domains
            and not url_is_from_any_domain(url_parts, self.allow_domains)
        )
        or (
            self.deny_domains
            and url_is_from_any_domain(url_parts, self.deny_domains)
        )
        or (
            self.deny_extensions
            and url_has_any_extension(url_parts, self.deny_extensions)
        )
    )
    if rejected_by_location:
        return False

    # The link text filter is the last gate; with no restriction, accept.
    if not self.restrict_text:
        return True
    return _matches(link.text, self.restrict_text)
236
237 def matches(self, url: str) -> bool:
238 if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):

Calls 4

_is_valid_url — Function · 0.90
_matches — Function · 0.90
url_is_from_any_domain — Function · 0.90
url_has_any_extension — Function · 0.90