From 0b266db7582591c421fa6e3b5f52f3390eff864b Mon Sep 17 00:00:00 2001 From: Sei Lisa Date: Thu, 19 Nov 2020 22:26:42 +0100 Subject: [PATCH] Fix InternalUTF8toString to consider U+FFFE as invalid This commit also adds the foldtabs option as default for the eval test suite, and substantially improves the Unicode valid/invalid character test. --- lslopt/lslbasefuncs.py | 21 ++++++++++++++------- run-tests.py | 2 +- unit_tests/expr.suite/unicode.lsl | 10 +++++++++- unit_tests/expr.suite/unicode.out | 10 +++++++++- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/lslopt/lslbasefuncs.py b/lslopt/lslbasefuncs.py index fda8ad1..3f3fe8f 100644 --- a/lslopt/lslbasefuncs.py +++ b/lslopt/lslbasefuncs.py @@ -555,8 +555,9 @@ def InternalList2Strings(val): good_utf8_re = re.compile(b'(?:' b'[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]' - b'|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}' + b'|[\xE1-\xEC\xEE][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}' b'|\xE0[\xA0-\xBF][\x80-\xBF]|\xED[\x80-\x9F][\x80-\xBF]' + b'|\xEF[\x80-\xBE][\x80-\xBF]|\xEF\xBF[\x80-\xBD\xBF]' b'|\xF0[\x90-\xBF][\x80-\xBF]{2}|\xF4[\x80-\x8F][\x80-\xBF]{2}' b')+') @@ -573,19 +574,25 @@ def InternalUTF8toString(s): # present, we need to write our own algorithm. # We reproduce the following observed behaviour: - # - A valid UTF-8 sequence is a UTF-8/1993 sequence excluding overlong - # sequences, the range U+D800-U+DFFF (UTF-16 surrogate pairs) and the - # range above 0x10FFFF. Any sequence that is not valid is invalid. + # - A valid UTF-8 sequence is an RFC-2279 UTF-8 sequence excluding overlong + # sequences, the range U+D800-U+DFFF (UTF-16 surrogate pairs), everything + # above U+10FFFF and the codepoint U+FFFE. Any sequence that is not valid + # is invalid. # - If an invalid sequence is detected, each byte in the sequence is # treated as an invalid character and replaced with a ? character. 
# - # Notes: + # The invalid sequences are the ones starting with the following: # - \xC0 and \xC1 generate overlong codes in the range 0-7F. # - \xE0\x80 through \xE0\x9F generate overlong codes in the range 0-7FF. - # - \xF0\x80 through \xF0\x8F generate overlong codes in the range 0-FFFF. # - \xED\xA0 through \xED\xBF generate high and low UTF-16 surrogates. + # - \xEF\xBF\xBE is the sequence for the invalid code U+FFFE. + # - \xF0\x80 through \xF0\x8F generate overlong codes in the range 0-FFFF. # - \xF4\x90 through \xF4\xBF generate codes above U+10FFFF. - # - All of the above are invalid, as well as start bytes >= \xF5. + # - \xF5 through \xFD generate even bigger codes. + # - \xFE and \xFF were never valid UTF-8. + # + # Reminder: Start codes \xC0-\xDF have length 2; \xE0-\xEF have length 3 + # and \xF0-\xF4, length 4. # # Examples: # b'\xC0\x81' is invalid because it represents an overlong U+0001. diff --git a/run-tests.py b/run-tests.py index b684447..d7e55e2 100755 --- a/run-tests.py +++ b/run-tests.py @@ -743,7 +743,7 @@ def generateScriptTests(): else ['main.py', # Defaults for Expr: '-O', 'clear,optimize,constfold' - ',addstrings,expr', + ',addstrings,foldtabs,expr', '-y', '-'])) werr(u"\nRunning test %s: " % any2u(fbase)) diff --git a/unit_tests/expr.suite/unicode.lsl b/unit_tests/expr.suite/unicode.lsl index 83a4b43..5a57de8 100644 --- a/unit_tests/expr.suite/unicode.lsl +++ b/unit_tests/expr.suite/unicode.lsl @@ -1 +1,9 @@ -llUnescapeURL("a%C3%A1%FC%80%80%E8%B0%F0%A0") +[ llEscapeURL(llUnescapeURL("%01%09%80%BF%C2%C3 a?%C0%80%C1%BF%C2%80%DF%BF")) +, llEscapeURL(llUnescapeURL("%E0%80%80%E0%9F%BF%E0%A0%80%EC%BF%BF%ED%9F%BF")) +, llEscapeURL(llUnescapeURL("%ED%A0%80%ED%BF%BF%EE%80%80%EF%BE%BF%EF%BF%80")) +, llEscapeURL(llUnescapeURL("%EF%BF%BD%EF%BF%BE%EF%BF%BF")) +, llEscapeURL(llUnescapeURL("%F0%80%80%80%F0%8F%BF%BF%F0%90%80%80%F3%BF%BF%BF")) +, llEscapeURL(llUnescapeURL("%F4%80%80%80%F4%8F%BF%BF%F4%90%80%80%F4%BF%BF%BF")) +, 
 llEscapeURL(llUnescapeURL("%F5%80%80%80%F7%BF%BF%BF")) +, llEscapeURL(llUnescapeURL("%F8%80%80%80%80%F9%FA%FB%FC%FD%FE%FF%E1%80")) +] diff --git a/unit_tests/expr.suite/unicode.out b/unit_tests/expr.suite/unicode.out index 34102fc..2eee479 100644 --- a/unit_tests/expr.suite/unicode.out +++ b/unit_tests/expr.suite/unicode.out @@ -1 +1,9 @@ -"aá???????" \ No newline at end of file +[ "%01%09%3F%3F%3F%3F%20a%3F%3F%3F%3F%3F%C2%80%DF%BF" +, "%3F%3F%3F%3F%3F%3F%E0%A0%80%EC%BF%BF%ED%9F%BF" +, "%3F%3F%3F%3F%3F%3F%EE%80%80%EF%BE%BF%EF%BF%80" +, "%EF%BF%BD%3F%3F%3F%EF%BF%BF" +, "%3F%3F%3F%3F%3F%3F%3F%3F%F0%90%80%80%F3%BF%BF%BF" +, "%F4%80%80%80%F4%8F%BF%BF%3F%3F%3F%3F%3F%3F%3F%3F" +, "%3F%3F%3F%3F%3F%3F%3F%3F" +, "%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F" +] \ No newline at end of file