(self, request: Request, spider: Spider | None = None)
| 45 | |
@_warn_spider_arg
def process_request(self, request: Request, spider: Spider | None = None) -> None:
    """Reject a request unless it is exempt from, or passes, the offsite check.

    The request is let through (the method returns ``None``) when any of the
    following holds, checked in this order:

    * ``request.dont_filter`` is truthy;
    * ``request.meta`` has a truthy ``"allow_offsite"`` entry;
    * ``self.should_follow(request, self.crawler.spider)`` is truthy.

    Otherwise the ``offsite/filtered`` stat is incremented and
    ``IgnoreRequest`` is raised. The first time a given hostname is rejected
    it is also recorded in ``self.domains_seen``, a debug message is logged
    and the ``offsite/domains`` stat is incremented.

    Args:
        request: The outgoing request to check.
        spider: Not used by this method's body; the spider is read from
            ``self.crawler.spider`` instead.

    Raises:
        IgnoreRequest: If none of the pass-through conditions hold.
    """
    assert self.crawler.spider

    # Guard clauses, evaluated in the same short-circuit order as a single
    # ``or`` chain would be.
    if request.dont_filter:
        return
    if request.meta.get("allow_offsite"):
        return
    if self.should_follow(request, self.crawler.spider):
        return

    hostname = urlparse_cached(request).hostname
    first_rejection = bool(hostname) and hostname not in self.domains_seen
    if first_rejection:
        # Log and count each rejected hostname only once.
        self.domains_seen.add(hostname)
        log_args = {"domain": hostname, "request": request}
        logger.debug(
            "Filtered offsite request to %(domain)r: %(request)s",
            log_args,
            extra={"spider": self.crawler.spider},
        )
        self.stats.inc_value("offsite/domains")

    # Every rejected request is counted, whether or not its host was new.
    self.stats.inc_value("offsite/filtered")
    raise IgnoreRequest
| 66 | |
| 67 | def should_follow(self, request: Request, spider: Spider) -> bool: |
| 68 | regex = self.host_regex |
no test coverage detected