| 67 | yield from entries |
| 68 | |
def _parse_sitemap(self, response: Response) -> Iterable[Request]:
    """Turn a robots.txt or sitemap response into follow-up requests.

    Three kinds of response are handled:

    * a URL ending in ``/robots.txt``: every sitemap URL it lists is
      requested and routed back to this method;
    * a ``sitemapindex`` document: every child sitemap selected by
      ``_get_urls_from_sitemapindex`` is requested and routed back to
      this method;
    * a ``urlset`` document: every ``(url, callback)`` pair produced by
      ``_get_urls_and_callbacks_from_urlset`` becomes a request.

    Entries are collected into a list before the generator is returned,
    so ``sitemap_filter`` and the URL helpers run during this call rather
    than when the result is consumed.

    :param response: the downloaded robots.txt or sitemap response.
    :return: an iterable of requests; an empty tuple (after logging a
        warning) when the sitemap body is missing/empty or its type is
        neither ``sitemapindex`` nor ``urlset``.
    """
    if response.url.endswith("/robots.txt"):
        # sitemap_urls_from_robots parses robots.txt as text (str), so hand
        # it the decoded body rather than the raw bytes in response.body.
        robots_sitemaps = list(
            sitemap_urls_from_robots(response.text, base_url=response.url)
        )
        return (
            Request(location, callback=self._parse_sitemap)
            for location in robots_sitemaps
        )

    body = self._get_sitemap_body(response)
    if body:
        sitemap = Sitemap(body)

        if sitemap.type == "sitemapindex":
            child_sitemaps = list(
                self._get_urls_from_sitemapindex(self.sitemap_filter(sitemap))
            )
            return (
                Request(location, callback=self._parse_sitemap)
                for location in child_sitemaps
            )

        if sitemap.type == "urlset":
            targets = list(
                self._get_urls_and_callbacks_from_urlset(
                    self.sitemap_filter(sitemap)
                )
            )
            return (
                Request(location, callback=callback)
                for location, callback in targets
            )

    # Reached for a missing/empty body as well as for an unrecognised
    # sitemap type; both cases are reported identically.
    logger.warning(
        "Ignoring invalid sitemap: %(response)s",
        {"response": response},
        extra={"spider": self},
    )
    return ()
| 102 | |
| 103 | def _get_urls_from_sitemapindex( |
| 104 | self, it: Iterable[dict[str, Any]] |