MCPcopy
hub / github.com/scrapy/scrapy / _parse_sitemap

Method _parse_sitemap

scrapy/spiders/sitemap.py:69–101  ·  view source on GitHub ↗
(self, response: Response)

Source from the content-addressed store, hash-verified

67 yield from entries
68
def _parse_sitemap(self, response: Response) -> Iterable[Request]:
    """Turn a robots.txt or sitemap response into follow-up requests.

    If the response URL ends in ``/robots.txt``, one request is produced
    for every URL that ``sitemap_urls_from_robots`` extracts from the
    response body, each routed back to this method.

    Otherwise the body returned by ``_get_sitemap_body`` is wrapped in a
    ``Sitemap`` and dispatched on its ``type``:

    * ``"sitemapindex"``: requests for the locations returned by
      ``_get_urls_from_sitemapindex``, routed back to this method.
    * ``"urlset"``: requests for the ``(location, callback)`` pairs
      returned by ``_get_urls_and_callbacks_from_urlset``.

    In both cases the sitemap is passed through ``sitemap_filter`` first.
    When ``_get_sitemap_body`` returns a falsy value, or the sitemap type
    is neither of the above, a warning is logged and an empty tuple is
    returned.

    Locations are collected into a list before this method returns; only
    the ``Request`` objects are built lazily, as the result is iterated.
    """
    if response.url.endswith("/robots.txt"):
        robots_locs = list(
            sitemap_urls_from_robots(response.body, base_url=response.url)
        )
        return (
            Request(robots_loc, callback=self._parse_sitemap)
            for robots_loc in robots_locs
        )

    body = self._get_sitemap_body(response)
    if body:
        sitemap = Sitemap(body)

        if sitemap.type == "sitemapindex":
            index_locs = list(
                self._get_urls_from_sitemapindex(self.sitemap_filter(sitemap))
            )
            return (
                Request(index_loc, callback=self._parse_sitemap)
                for index_loc in index_locs
            )

        if sitemap.type == "urlset":
            targets = list(
                self._get_urls_and_callbacks_from_urlset(
                    self.sitemap_filter(sitemap)
                )
            )
            return (
                Request(target_loc, callback=target_cb)
                for target_loc, target_cb in targets
            )

    # Reached both when no body could be extracted and when the sitemap
    # type is unrecognised; the two cases log the same warning.
    logger.warning(
        "Ignoring invalid sitemap: %(response)s",
        {"response": response},
        extra={"spider": self},
    )
    return ()
102
103 def _get_urls_from_sitemapindex(
104 self, it: Iterable[dict[str, Any]]

Calls 7

_get_sitemap_body — Method · 0.95
sitemap_filter — Method · 0.95
sitemap_urls_from_robots — Function · 0.90
Request — Class · 0.90
Sitemap — Class · 0.90