MCPcopy
hub / github.com/scrapy/scrapy / process_request

Method process_request

scrapy/downloadermiddlewares/offsite.py:47–65  ·  view source on GitHub ↗
(self, request: Request, spider: Spider | None = None)

Source from the content-addressed store, hash-verified

45
@_warn_spider_arg
def process_request(self, request: Request, spider: Spider | None = None) -> None:
    """Let on-site requests through and reject everything else.

    Returns ``None`` without further action when any of the following holds,
    checked in this order:

    * ``request.dont_filter`` is truthy,
    * ``request.meta`` has a truthy ``"allow_offsite"`` entry,
    * ``self.should_follow`` accepts the request for the crawler's spider.

    Otherwise the request is counted under the ``offsite/filtered`` stat and
    :exc:`IgnoreRequest` is raised. The first time a given hostname is
    rejected, a debug message is logged and ``offsite/domains`` is
    incremented; the hostname is remembered in ``self.domains_seen`` so that
    later rejections for it stay quiet.

    The ``spider`` argument is not read here; the spider is always taken
    from ``self.crawler``.
    NOTE(review): presumably ``_warn_spider_arg`` exists to flag callers that
    still pass ``spider`` -- confirm against its definition.

    Raises:
        IgnoreRequest: if the request is not allowed through.
    """
    active_spider = self.crawler.spider
    assert active_spider

    # Explicit opt-outs are checked before the host is matched at all.
    if request.dont_filter or request.meta.get("allow_offsite"):
        return
    if self.should_follow(request, active_spider):
        return

    host = urlparse_cached(request).hostname
    if host and host not in self.domains_seen:
        # Report each offending host only once.
        self.domains_seen.add(host)
        logger.debug(
            "Filtered offsite request to %(domain)r: %(request)s",
            {"domain": host, "request": request},
            extra={"spider": active_spider},
        )
        self.stats.inc_value("offsite/domains")

    self.stats.inc_value("offsite/filtered")
    raise IgnoreRequest
66
67 def should_follow(self, request: Request, spider: Spider) -> bool:
68 regex = self.host_regex

Callers 1

request_scheduled (Method) · 0.95

Calls 4

should_follow (Method) · 0.95
urlparse_cached (Function) · 0.90
get (Method) · 0.45
inc_value (Method) · 0.45

Tested by

no test coverage detected