| 20 | |
| 21 | |
| 22 | class TestSitemapSpider(TestSpider): |
| 23 | spider_class = SitemapSpider |
| 24 | |
# Plain sitemap payload and its gzip-compressed counterpart used as
# response bodies by the tests of this class.
BODY = b"SITEMAP"
f = BytesIO()
# The context manager finalizes the gzip stream (flushing the trailer into
# ``f``) even if the write raises. Closing a GzipFile does not close the
# underlying ``fileobj``, so ``f`` can still be read afterwards.
with gzip.GzipFile(fileobj=f, mode="wb") as g:
    g.write(BODY)
GZBODY = f.getvalue()
| 31 | |
def assertSitemapBody(self, response: Response, body: bytes | None) -> None:
    """Assert that a fresh spider extracts ``body`` from ``response``.

    A new spider of ``self.spider_class`` is built from a new crawler for
    every call, and the result of its ``_get_sitemap_body`` is compared
    against the expected ``body`` (``None`` included).
    """
    spider = self.spider_class.from_crawler(get_crawler(), "example.com")
    extracted = spider._get_sitemap_body(response)
    assert extracted == body
| 36 | |
def test_get_sitemap_body(self):
    """Check sitemap body extraction across response classes.

    Each case is (response class, URL, response body, expected result);
    the cases run in order and the first mismatch fails the test.
    """
    cases = (
        # XML response: the body is expected back unchanged.
        (XmlResponse, "http://www.example.com/", self.BODY, self.BODY),
        # HTML response: no sitemap body is expected.
        (HtmlResponse, "http://www.example.com/", self.BODY, None),
        # Plain response for a non-sitemap URL: no sitemap body is expected.
        (Response, "http://www.example.com/favicon.ico", self.BODY, None),
        # Empty XML response: expected result is empty bytes, not None.
        (XmlResponse, "http://www.example.com/", b"", b""),
    )
    for response_cls, url, payload, expected in cases:
        self.assertSitemapBody(response_cls(url=url, body=payload), expected)
| 49 | |
def test_get_sitemap_body_gzip_headers(self):
    """A gzipped body announced via ``content-type`` yields the plain body."""
    sitemap_url = "http://www.example.com/sitemap"
    gzipped_response = Response(
        url=sitemap_url,
        body=self.GZBODY,
        headers={"content-type": "application/gzip"},
        request=Request(sitemap_url),
    )
    # The URL carries no extension; only the header marks the body as gzip.
    self.assertSitemapBody(gzipped_response, self.BODY)
| 58 | |
def test_get_sitemap_body_xml_url(self):
    """A text response whose URL ends in ``.xml`` yields its body as-is."""
    self.assertSitemapBody(
        TextResponse(url="http://www.example.com/sitemap.xml", body=self.BODY),
        self.BODY,
    )
| 62 | |
def test_get_sitemap_body_xml_url_compressed(self):
    """Both compressed and plain bodies at a ``.xml.gz`` URL yield BODY."""
    gz_url = "http://www.example.com/sitemap.xml.gz"

    # Body is still gzip-compressed when it reaches the spider.
    compressed = Response(
        url=gz_url,
        body=self.GZBODY,
        request=Request("http://www.example.com/sitemap"),
    )
    self.assertSitemapBody(compressed, self.BODY)

    # .xml.gz but body decoded by HttpCompression middleware already
    already_decoded = Response(url=gz_url, body=self.BODY)
    self.assertSitemapBody(already_decoded, self.BODY)
| 74 | |
| 75 | def test_get_sitemap_urls_from_robotstxt(self): |
| 76 | robots = b"""# Sitemap files |
| 77 | Sitemap: http://example.com/sitemap.xml |
| 78 | Sitemap: http://example.com/sitemap-product-index.xml |
| 79 | Sitemap: HTTP://example.com/sitemap-uppercase.xml |