unstructured = (*([FWS] vchar) *WSP) / obs-unstruct; obs-unstruct = *((*LF *CR *(obs-utext *LF *CR)) / FWS); obs-utext = %d0 / obs-NO-WS-CTL / LF / CR. obs-NO-WS-CTL is the set of control characters except WSP/CR/LF. So, basically, we have printable runs, plus control characters or nulls in the obsolete syntax, separated by whitespace.
(value)
| 1122 | return ew, value |
| 1123 | |
| 1124 | def get_unstructured(value): |
| 1125 | """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct |
| 1126 | obs-unstruct = *((*LF *CR *(obs-utext *LF *CR)) / FWS) |
| 1127 | obs-utext = %d0 / obs-NO-WS-CTL / LF / CR |
| 1128 | |
| 1129 | obs-NO-WS-CTL is control characters except WSP/CR/LF. |
| 1130 | |
| 1131 | So, basically, we have printable runs, plus control characters or nulls in |
| 1132 | the obsolete syntax, separated by whitespace. Since RFC 2047 uses the |
| 1133 | obsolete syntax in its specification, but requires whitespace on either |
| 1134 | side of the encoded words, I can see no reason to need to separate the |
| 1135 | non-printable-non-whitespace from the printable runs if they occur, so we |
| 1136 | parse this into xtext tokens separated by WSP tokens. |
| 1137 | |
| 1138 | Because an 'unstructured' value must by definition constitute the entire |
| 1139 | value, this 'get' routine does not return a remaining value, only the |
| 1140 | parsed TokenList. |
| 1141 | |
| 1142 | """ |
| 1143 | # XXX: but what about bare CR and LF? They might signal the start or |
| 1144 | # end of an encoded word. YAGNI for now, since our current parsers |
| 1145 | # will never send us strings with bare CR or LF. |
| 1146 | |
| 1147 | unstructured = UnstructuredTokenList() |
| 1148 | while value: |
| 1149 | if value[0] in WSP: |
| 1150 | token, value = get_fws(value) |
| 1151 | unstructured.append(token) |
| 1152 | continue |
| 1153 | valid_ew = True |
| 1154 | if value.startswith('=?'): |
| 1155 | try: |
| 1156 | token, value = get_encoded_word(value, 'utext') |
| 1157 | except _InvalidEwError: |
| 1158 | valid_ew = False |
| 1159 | except errors.HeaderParseError: |
| 1160 | # XXX: Need to figure out how to register defects when |
| 1161 | # appropriate here. |
| 1162 | pass |
| 1163 | else: |
| 1164 | have_ws = True |
| 1165 | if len(unstructured) > 0: |
| 1166 | if unstructured[-1].token_type != 'fws': |
| 1167 | unstructured.defects.append(errors.InvalidHeaderDefect( |
| 1168 | "missing whitespace before encoded word")) |
| 1169 | have_ws = False |
| 1170 | if have_ws and len(unstructured) > 1: |
| 1171 | if unstructured[-2].token_type == 'encoded-word': |
| 1172 | unstructured[-1] = EWWhiteSpaceTerminal( |
| 1173 | unstructured[-1], 'fws') |
| 1174 | unstructured.append(token) |
| 1175 | continue |
| 1176 | tok, *remainder = _wsp_splitter(value, 1) |
| 1177 | # Split in the middle of an atom if there is a rfc2047 encoded word |
| 1178 | # which does not have WSP on both sides. The defect will be registered |
| 1179 | # the next time through the loop. |
| 1180 | # This needs to only be performed when the encoded word is valid; |
| 1181 | # otherwise, performing it on an invalid encoded word can cause |
no test coverage detected
searching dependent graphs…