From d1ff8a86dda93c0ccb414965cb06273b2e2a4d01 Mon Sep 17 00:00:00 2001 From: Sei Lisa Date: Thu, 19 Jan 2017 07:00:06 +0100 Subject: [PATCH] Additional fixes to llBase64ToString behaviour. llBase64ToString hid another surprise: characters in range from U+0000 to U+001F are substituted by "?" except for tabs (\x09), line feeds (\x0A), shift ins (\x0F) and unit separators (\x1F), which are kept verbatim. So, mimic this behaviour. --- lslopt/lslbasefuncs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lslopt/lslbasefuncs.py b/lslopt/lslbasefuncs.py index 63dc88b..0e1971e 100644 --- a/lslopt/lslbasefuncs.py +++ b/lslopt/lslbasefuncs.py @@ -925,7 +925,7 @@ b64tos_re = re.compile( b'(' # Those pass through and are caught by InternalUTF8toString: b'\x00$' # NUL at last position (zstr removes it) - b'|[\x01-\x7F\xFE\xFF]|[\xC2-\xDF][\x80-\xBF]' + b'|[\x09\x0A\x0F\x1F-\x7F\xFE\xFF]|[\xC2-\xDF][\x80-\xBF]' b'|(?:\xE0[\xA0-\xBF]|[\xE1-\xEF][\x80-\xBF])[\x80-\xBF]' b'|(?:\xF0[\x90-\xBF]|[\xF1-\xF7][\x80-\xBF])[\x80-\xBF]{2}' b'|(?:\xF8[\x88-\xBF]|[\xF9-\xFB][\x80-\xBF])[\x80-\xBF]{3}' @@ -933,7 +933,7 @@ b64tos_re = re.compile( b')|(' # Those are caught here and substituted by a single "?" # (greediness is important here): - b'[\x00\x80-\xBF]' + b'[\x00-\x1F\x80-\xBF]' b'|[\xC0-\xDF][\x80-\xBF]?' b'|[\xE0-\xEF][\x80-\xBF]{0,2}' b'|[\xF0-\xF7][\x80-\xBF]{0,3}' @@ -958,7 +958,8 @@ def llBase64ToString(s): # UTF-8 does. This causes inconsistencies in the number of ?'s returned. # In llBase64ToString, trailing NUL is stripped, and embedded NULs are - # converted to "?". + # converted to "?". In addition, characters in range 00-1F are also + # converted to "?" except for \x09, \x0A, \x0F, \x1F. byteseq = bytearray(b64decode(s + u'=' * (-len(s) & 3)))