| 22 | |
| 23 | |
def decode_robotstxt(
    robotstxt_body: bytes, spider: Spider | None, to_native_str_type: bool = False
) -> str:
    """Decode the raw bytes of a robots.txt response into text.

    When ``to_native_str_type`` is true, decoding is delegated to
    ``to_unicode``. Otherwise the body is decoded with the ``utf-8-sig``
    codec (so a leading UTF-8 BOM is dropped) and bytes that cannot be
    decoded are ignored.

    If decoding raises ``UnicodeDecodeError``, a warning is logged with
    ``spider`` attached as logging context and an empty string is returned.
    """
    try:
        if not to_native_str_type:
            return robotstxt_body.decode("utf-8-sig", errors="ignore")
        return to_unicode(robotstxt_body)
    except UnicodeDecodeError:
        # The body is garbage or uses an encoding other than UTF-8: disregard
        # it and fall back to an empty file (the 'allow all' state).
        logger.warning(
            "Failure while parsing robots.txt. File either contains garbage or "
            "is in an encoding other than UTF-8, treating it as an empty file.",
            exc_info=sys.exc_info(),
            extra={"spider": spider},
        )
        return ""
| 43 | |
| 44 | |
| 45 | class RobotParser(metaclass=ABCMeta): |