Method _parse_sitemap

scrapy/spiders/sitemap.py:69–101 · view source on GitHub ↗

(self, response: Response)

Source from the content-addressed store, hash-verified

67	yield from entries
68
def _parse_sitemap(self, response: Response) -> Iterable[Request]:
    """Turn a robots.txt or sitemap response into follow-up requests.

    Dispatch, in order:

    * URL ends with ``/robots.txt``: every URL produced by
      ``sitemap_urls_from_robots`` is requested with this method as
      the callback.
    * ``_get_sitemap_body`` returns a falsy body: a warning is logged
      and an empty tuple is returned.
    * Parsed sitemap of type ``"sitemapindex"``: every location from
      ``_get_urls_from_sitemapindex`` is requested with this method as
      the callback.
    * Parsed sitemap of type ``"urlset"``: every ``(location, callback)``
      pair from ``_get_urls_and_callbacks_from_urlset`` becomes a request.
    * Any other type: a warning is logged and an empty tuple is returned.

    In the sitemap branches the entries first pass through
    ``self.sitemap_filter``. URLs are collected into a list before
    returning, so the helpers are fully consumed during this call; the
    ``Request`` objects themselves are only built as the returned
    generator is iterated.
    """
    if response.url.endswith("/robots.txt"):
        # NOTE(review): the raw ``response.body`` is handed over here -
        # confirm sitemap_urls_from_robots accepts that type.
        robots_entries = sitemap_urls_from_robots(
            response.body, base_url=response.url
        )
        sitemap_locs = [*robots_entries]
        return (
            Request(sitemap_loc, callback=self._parse_sitemap)
            for sitemap_loc in sitemap_locs
        )

    sitemap_body = self._get_sitemap_body(response)
    if not sitemap_body:
        logger.warning(
            "Ignoring invalid sitemap: %(response)s",
            {"response": response},
            extra={"spider": self},
        )
        return ()

    sitemap = Sitemap(sitemap_body)

    if sitemap.type == "sitemapindex":
        # Nested sitemaps are fed back into this same method.
        index_entries = self.sitemap_filter(sitemap)
        child_locs = [*self._get_urls_from_sitemapindex(index_entries)]
        return (
            Request(child_loc, callback=self._parse_sitemap)
            for child_loc in child_locs
        )

    if sitemap.type == "urlset":
        # Each location arrives paired with the callback to use for it.
        url_entries = self.sitemap_filter(sitemap)
        targets = [*self._get_urls_and_callbacks_from_urlset(url_entries)]
        return (
            Request(page_loc, callback=page_callback)
            for page_loc, page_callback in targets
        )

    # Parsed, but neither a sitemap index nor a URL set.
    logger.warning(
        "Ignoring invalid sitemap: %(response)s",
        {"response": response},
        extra={"spider": self},
    )

    return ()
102
103	def _get_urls_from_sitemapindex(
104	self, it: Iterable[dict[str, Any]]

test_get_sitemap_urls_from_robotstxt Method · 0.80

test_get_sitemap_urls_from_robotstxt_skips_invalid_utf8_urls Method · 0.80

test_alternate_url_locs Method · 0.80

test_sitemap_filter Method · 0.80

test_sitemap_filter_with_alternate_links Method · 0.80

test_sitemapindex_filter Method · 0.80

test_sitemap_filter_with_rule Method · 0.80

test_parse_sitemap_empty_body Method · 0.80

test_parse_sitemap_not_sitemap Method · 0.80

test_sitemap_follow Method · 0.80

_get_sitemap_body Method · 0.95

_get_urls_from_sitemapindex Method · 0.95

sitemap_filter Method · 0.95

_get_urls_and_callbacks_from_urlset Method · 0.95

sitemap_urls_from_robots Function · 0.90

Request Class · 0.90

Sitemap Class · 0.90

test_get_sitemap_urls_from_robotstxt Method · 0.64

test_get_sitemap_urls_from_robotstxt_skips_invalid_utf8_urls Method · 0.64

test_alternate_url_locs Method · 0.64

test_sitemap_filter Method · 0.64

test_sitemap_filter_with_alternate_links Method · 0.64

test_sitemapindex_filter Method · 0.64

test_sitemap_filter_with_rule Method · 0.64

test_parse_sitemap_empty_body Method · 0.64

test_parse_sitemap_not_sitemap Method · 0.64

test_sitemap_follow Method · 0.64