MCPcopy
hub / github.com/scrapy/scrapy / _get_sitemap_body

Method _get_sitemap_body

scrapy/spiders/sitemap.py:119–150  ·  view source on GitHub ↗

Return the sitemap body contained in the given response, or None if the response is not a sitemap.

(self, response: Response)

Source from the content-addressed store, hash-verified

117 break
118
def _get_sitemap_body(self, response: Response) -> bytes | None:
    """Extract the sitemap payload from *response*.

    Returns the raw body for XML responses, the decompressed body for
    responses that start with the gzip magic number, and the raw body
    for any other response whose URL ends in ``.xml`` or ``.xml.gz``.
    Returns ``None`` when the response is not recognised as a sitemap,
    or when decompression raises ``_DecompressionMaxSizeExceeded``.
    """
    if isinstance(response, XmlResponse):
        return response.body

    if not gzip_magic_number(response):
        # Real gzipped sitemap files are handled by the gzip branch below.
        # Reaching this point with a ".xml.gz" URL usually means the body
        # was already gunzipped by the HttpCompression middleware: the
        # server sent "Content-Encoding: gzip" for XML compressed on the
        # fly rather than serving an actual .xml.gz file. In other words,
        # what we hold here is plain XML.
        if response.url.endswith((".xml", ".xml.gz")):
            return response.body
        return None

    # Size of the body as received, i.e. before decompression.
    compressed_size = len(response.body)
    # Per-request meta values take precedence over the spider-level limits.
    size_limit = response.meta.get("download_maxsize", self._max_size)
    warn_size = response.meta.get("download_warnsize", self._warn_size)

    try:
        decompressed = gunzip(response.body, max_size=size_limit)
    except _DecompressionMaxSizeExceeded:
        return None

    decompressed_size = len(decompressed)
    # Warn only when decompression is what pushed the body to or past the
    # warning threshold; a body that was already that large on the wire
    # does not trigger this message.
    if compressed_size < warn_size <= decompressed_size:
        logger.warning(
            f"{response} body size after decompression ({decompressed_size} B) "
            f"is larger than the download warning size ({warn_size} B)."
        )
    return decompressed
151
152
153def regex(x: re.Pattern[str] | str) -> re.Pattern[str]:

Calls 3

gzip_magic_numberFunction · 0.90
gunzipFunction · 0.90
getMethod · 0.45