mirror of
https://github.com/Sei-Lisa/LSL-PyOptimizer
synced 2025-07-01 23:58:20 +00:00
Fix InternalUTF8toString to consider U+FFFE as invalid
This commit also adds the foldtabs option as default for the eval test suite, and substantially improves the Unicode valid/invalid character test.
This commit is contained in:
parent
4771c76d85
commit
0b266db758
4 changed files with 33 additions and 10 deletions
|
@ -555,8 +555,9 @@ def InternalList2Strings(val):
|
||||||
|
|
||||||
good_utf8_re = re.compile(b'(?:'
|
good_utf8_re = re.compile(b'(?:'
|
||||||
b'[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]'
|
b'[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]'
|
||||||
b'|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}'
|
b'|[\xE1-\xEC\xEE][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}'
|
||||||
b'|\xE0[\xA0-\xBF][\x80-\xBF]|\xED[\x80-\x9F][\x80-\xBF]'
|
b'|\xE0[\xA0-\xBF][\x80-\xBF]|\xED[\x80-\x9F][\x80-\xBF]'
|
||||||
|
b'|\xEF[\x80-\xBE][\x80-\xBF]|\xEF\xBF[\x80-\xBD\xBF]'
|
||||||
b'|\xF0[\x90-\xBF][\x80-\xBF]{2}|\xF4[\x80-\x8F][\x80-\xBF]{2}'
|
b'|\xF0[\x90-\xBF][\x80-\xBF]{2}|\xF4[\x80-\x8F][\x80-\xBF]{2}'
|
||||||
b')+')
|
b')+')
|
||||||
|
|
||||||
|
@ -573,19 +574,25 @@ def InternalUTF8toString(s):
|
||||||
# present, we need to write our own algorithm.
|
# present, we need to write our own algorithm.
|
||||||
|
|
||||||
# We reproduce the following observed behaviour:
|
# We reproduce the following observed behaviour:
|
||||||
# - A valid UTF-8 sequence is a UTF-8/1993 sequence excluding overlong
|
# - A valid UTF-8 sequence is an RFC 2279 UTF-8 sequence excluding overlong
|
||||||
# sequences, the range U+D800-U+DFFF (UTF-16 surrogate pairs) and the
|
# sequences, the range U+D800-U+DFFF (UTF-16 surrogates), everything
|
||||||
# range above 0x10FFFF. Any sequence that is not valid is invalid.
|
# above U+10FFFF and the codepoint U+FFFE. Any sequence that is not valid
|
||||||
|
# is invalid.
|
||||||
# - If an invalid sequence is detected, each byte in the sequence is
|
# - If an invalid sequence is detected, each byte in the sequence is
|
||||||
# treated as an invalid character and replaced with a ? character.
|
# treated as an invalid character and replaced with a ? character.
|
||||||
#
|
#
|
||||||
# Notes:
|
# The invalid sequences are the ones starting with the following:
|
||||||
# - \xC0 and \xC1 generate overlong codes in the range 0-7F.
|
# - \xC0 and \xC1 generate overlong codes in the range 0-7F.
|
||||||
# - \xE0\x80 through \xE0\x9F generate overlong codes in the range 0-7FF.
|
# - \xE0\x80 through \xE0\x9F generate overlong codes in the range 0-7FF.
|
||||||
# - \xF0\x80 through \xF0\x8F generate overlong codes in the range 0-FFFF.
|
|
||||||
# - \xED\xA0 through \xED\xBF generate high and low UTF-16 surrogates.
|
# - \xED\xA0 through \xED\xBF generate high and low UTF-16 surrogates.
|
||||||
|
# - \xEF\xBF\xBE is the sequence for the invalid code U+FFFE.
|
||||||
|
# - \xF0\x80 through \xF0\x8F generate overlong codes in the range 0-FFFF.
|
||||||
# - \xF4\x90 through \xF4\xBF generate codes above U+10FFFF.
|
# - \xF4\x90 through \xF4\xBF generate codes above U+10FFFF.
|
||||||
# - All of the above are invalid, as well as start bytes >= \xF5.
|
# - \xF5 through \xFD generate even bigger codes.
|
||||||
|
# - \xFE and \xFF were never valid UTF-8.
|
||||||
|
#
|
||||||
|
# Reminder: Start codes \xC0-\xDF have length 2; \xE0-\xEF have length 3
|
||||||
|
# and \xF0-\xF4, length 4.
|
||||||
#
|
#
|
||||||
# Examples:
|
# Examples:
|
||||||
# b'\xC0\x81' is invalid because it represents an overlong U+0001.
|
# b'\xC0\x81' is invalid because it represents an overlong U+0001.
|
||||||
|
|
|
@ -743,7 +743,7 @@ def generateScriptTests():
|
||||||
else ['main.py',
|
else ['main.py',
|
||||||
# Defaults for Expr:
|
# Defaults for Expr:
|
||||||
'-O', 'clear,optimize,constfold'
|
'-O', 'clear,optimize,constfold'
|
||||||
',addstrings,expr',
|
',addstrings,foldtabs,expr',
|
||||||
'-y',
|
'-y',
|
||||||
'-']))
|
'-']))
|
||||||
werr(u"\nRunning test %s: " % any2u(fbase))
|
werr(u"\nRunning test %s: " % any2u(fbase))
|
||||||
|
|
|
@ -1 +1,9 @@
|
||||||
llUnescapeURL("a%C3%A1%FC%80%80%E8%B0%F0%A0")
|
[ llEscapeURL(llUnescapeURL("%01%09%80%BF%C2%C3 a?%C0%80%C1%BF%C2%80%DF%BF"))
|
||||||
|
, llEscapeURL(llUnescapeURL("%E0%80%80%E0%9F%BF%E0%A0%80%EC%BF%BF%ED%9F%BF"))
|
||||||
|
, llEscapeURL(llUnescapeURL("%ED%A0%80%ED%BF%BF%EE%80%80%EF%BE%BF%EF%BF%80"))
|
||||||
|
, llEscapeURL(llUnescapeURL("%EF%BF%BD%EF%BF%BE%EF%BF%BF"))
|
||||||
|
, llEscapeURL(llUnescapeURL("%F0%80%80%80%F0%8F%BF%BF%F0%90%80%80%F3%BF%BF%BF"))
|
||||||
|
, llEscapeURL(llUnescapeURL("%F4%80%80%80%F4%8F%BF%BF%F4%90%80%80%F4%BF%BF%BF"))
|
||||||
|
, llEscapeURL(llUnescapeURL("%F5%80%80%80%F7%BF%BF%BF"))
|
||||||
|
, llEscapeURL(llUnescapeURL("%F8%80%80%80%80%F9%FA%FB%FC%FD%FE%FF%E1%80"))
|
||||||
|
]
|
||||||
|
|
|
@ -1 +1,9 @@
|
||||||
"aá???????"
|
[ "%01%09%3F%3F%3F%3F%20a%3F%3F%3F%3F%3F%C2%80%DF%BF"
|
||||||
|
, "%3F%3F%3F%3F%3F%3F%E0%A0%80%EC%BF%BF%ED%9F%BF"
|
||||||
|
, "%3F%3F%3F%3F%3F%3F%EE%80%80%EF%BE%BF%EF%BF%80"
|
||||||
|
, "%EF%BF%BD%3F%3F%3F%EF%BF%BF"
|
||||||
|
, "%3F%3F%3F%3F%3F%3F%3F%3F%F0%90%80%80%F3%BF%BF%BF"
|
||||||
|
, "%F4%80%80%80%F4%8F%BF%BF%3F%3F%3F%3F%3F%3F%3F%3F"
|
||||||
|
, "%3F%3F%3F%3F%3F%3F%3F%3F"
|
||||||
|
, "%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F%3F"
|
||||||
|
]
|
Loading…
Add table
Add a link
Reference in a new issue