As per RFC 3987 Section 3.2, step three of converting a URI into an IRI, repercent-encode any octet produced that is not part of a strictly legal UTF-8 octet sequence.
(path)
| 211 | |
| 212 | |
def repercent_broken_unicode(path):
    """
    Percent-encode again every octet of ``path`` that is not part of a
    strictly legal UTF-8 octet sequence, leaving valid sequences untouched.

    This is step three of converting a URI into an IRI as described in
    RFC 3987 Section 3.2. ``path`` must support ``decode()`` and slicing;
    the result is returned as bytes.
    """
    safe_chars = b"/#%[]=:;$&()+,!?*@'~"
    pieces = []
    remainder = path
    while True:
        try:
            remainder.decode()
        except UnicodeDecodeError as exc:
            # CVE-2019-14235: walk the input iteratively instead of
            # recursing, since handling the exception recursively uses
            # massive amounts of memory.
            bad_start, bad_end = exc.start, exc.end
            escaped = quote(remainder[bad_start:bad_end], safe=safe_chars)
            # Keep the valid prefix as is, followed by the escaped octets.
            pieces.append(remainder[:bad_start])
            pieces.append(escaped.encode())
            # Only the not-yet-checked tail is decoded on the next pass.
            remainder = remainder[bad_end:]
            continue
        # The rest decoded cleanly, so it is appended unchanged.
        pieces.append(remainder)
        return b"".join(pieces)
| 231 | |
| 232 | |
| 233 | def filepath_to_uri(path): |