| 422 | self.assertRaises(HTTPInputError, HTTPHeaders.parse, data) |
| 423 | |
def test_unicode_newlines(self):
    # Header lines must be split on \r\n only; the other newline-like
    # unicode characters have to survive as ordinary header-value data.
    # Likely troublemakers are catalogued in
    # http://unicode.org/standard/reports/tr13/tr13-5.html
    # and in cpython's unicodeobject.c (the implementation behind
    # unicode_type.splitlines(), whose list differs from TR13's).
    #
    # Some ascii control characters are also sometimes treated as
    # newline-like, but they're disallowed in HTTP headers, so they are
    # deliberately not part of this test:
    #   "\u000b"  VERTICAL TAB
    #   "\u001c"  FILE SEPARATOR
    #   "\u001d"  GROUP SEPARATOR
    #   "\u001e"  RECORD SEPARATOR
    # Only characters that are permitted in headers (under the obs-text
    # rule) are exercised below.
    candidates = (
        "\u0085",  # NEXT LINE
        "\u2028",  # LINE SEPARATOR
        "\u2029",  # PARAGRAPH SEPARATOR
    )
    for char in candidates:
        # Check both the utf8 and the latin1 byte form of each character.
        for codec in ("utf8", "latin1"):
            try:
                raw = char.encode(codec)
            except UnicodeEncodeError:
                # Not every character has a latin1 representation.
                continue
            try:
                wire = b"Cookie: foo=" + raw + b"bar"
                # parse() takes a native_str, so go through latin1 in the
                # same way the real parser does.
                parsed = HTTPHeaders.parse(native_str(wire.decode("latin1")))
                # The candidate character must come back untouched, in the
                # middle of a single Cookie header value.
                value = "foo=" + native_str(raw.decode("latin1")) + "bar"
                wanted = [("Cookie", value)]
                self.assertEqual(wanted, list(parsed.get_all()))
            except Exception:
                # Record which character/encoding pair broke, then let the
                # original failure propagate to the test runner.
                gen_log.warning("failed while trying %r in %s", char, codec)
                raise
| 466 | |
| 467 | def test_unicode_whitespace(self): |
| 468 | # Only tabs and spaces are to be stripped according to the HTTP standard. |