Return the sitemap body contained in the given response, or None if the response is not a sitemap.
(self, response: Response)
| 117 | break |
| 118 | |
def _get_sitemap_body(self, response: Response) -> bytes | None:
    """Extract the sitemap payload carried by *response*.

    Returns the response body as-is for XML responses, the gunzipped
    body when the payload starts with the gzip magic number, and the
    raw body when the URL ends in ``.xml`` or ``.xml.gz``. Returns
    ``None`` when none of these apply, or when ``gunzip`` raises
    ``_DecompressionMaxSizeExceeded``.
    """
    if isinstance(response, XmlResponse):
        return response.body

    if not gzip_magic_number(response):
        # Genuinely gzipped sitemap files are handled further down.
        # Reaching this point with a ".xml.gz" URL usually means the
        # HttpCompression middleware already gunzipped the body: the
        # server sent plain XML with "Content-Encoding: gzip"
        # (compressed on the fly) rather than an actual .xml.gz file.
        # In other words, what we hold here is plain XML.
        has_sitemap_suffix = response.url.endswith((".xml", ".xml.gz"))
        return response.body if has_sitemap_suffix else None

    # Size of the body as received, i.e. before decompression.
    compressed_len = len(response.body)
    # Per-request meta values take precedence over the spider-level ones.
    size_limit = response.meta.get("download_maxsize", self._max_size)
    warn_size = response.meta.get("download_warnsize", self._warn_size)

    try:
        body = gunzip(response.body, max_size=size_limit)
    except _DecompressionMaxSizeExceeded:
        return None

    # Warn only when decompression is what pushed the size to or past
    # the threshold; a body that was already that large is not reported
    # here.
    if compressed_len < warn_size and warn_size <= len(body):
        logger.warning(
            f"{response} body size after decompression ({len(body)} B) "
            f"is larger than the download warning size ({warn_size} B)."
        )
    return body
| 151 | |
| 152 | |
| 153 | def regex(x: re.Pattern[str] | str) -> re.Pattern[str]: |