Deal with changes in llChar, strengthen some tests

Previously, llChar formed a UTF-8-1993 string with the given code and converted that, resulting in multiple question marks when the conversion to Unicode forced by Mono caused errors in multiple characters. They have changed the implementation: it now also considers U+FFFF invalid and returns only a single U+FFFD character if the input is invalid, and LSO behaves the same as Mono (no UTF-8-1993 anymore). We've also detected problems with Windows (who else would it be) for the Unicode "astral" planes (planes beyond the Basic Multilingual Plane), so now there are new tests that include characters > U+FFFF. And since some builds of Python 2 use UTF-16 internally, we also check llGetSubString and friends with positions after an astral plane character. These tests are currently failing under Windows, as there are numerous encoding and line-ending problems happening on that OS, especially with Python 3.
2025-07-14 08:46:24 -07:00 · 2022-12-08 13:22:54 +01:00 · 2022-12-08 13:22:54 +01:00 · 0b7d04e5ff
commit 0b7d04e5ff
parent 947dcd9383
5 changed files with 32 additions and 56 deletions
--- a/lslopt/lslbasefuncs.py
+++ b/lslopt/lslbasefuncs.py
@ -1153,50 +1153,10 @@ def llCeil(f):

 def llChar(code):
    code = fi(code)
-    # The result is consistent with a conversion of the codepoint to
-    # UTF-8-1993, then using InternalUTF8toString on the result.
-    # A thorough test shows that llChar(n) equals llUnescapeURL(utf8_1993)
-    # up to codepoint 0x13FFFF. Furthermore llChar(0x200000) returns "?????",
-    # and llChar(0x7FFFFFFF) returns "??????", which are also consistent with
-    # that. LSO also returns UTF-8-1993 for codepoints > 0x10FFFF. So, the
-    # internal implementation is likely to form a UTF8-1993 string from the
-    # codepoint and then convert that to string, like this:
-#    if code < 0:
-#       return u'?'
-#    if code < 0x80:
-#        s = (code,)
-#    elif code < 0x800:
-#        s = (0xC0+(code >> 6), 0x80+(code&0x3F))
-#    elif code < 0x10000:
-#        s = (0xE0+(code >> 12), 0x80+((code >> 6)&0x3F), 0x80+(code&0x3F))
-#    elif code < 0x200000:
-#        s = (0xF0+(code >> 18), 0x80+((code >> 12)&0x3F),
-#            0x80+((code >> 6)&0x3F), 0x80+(code&0x3F))
-#    elif code < 0x4000000:
-#        s = (0xF8+(code >> 24),
-#            0x80+((code >> 18)&0x3F), 0x80+((code >> 12)&0x3F),
-#            0x80+((code >> 6)&0x3F), 0x80+(code&0x3F))
-#    else:
-#        s = (0xFC+(code >> 30), 0x80+((code >> 24)&0x3F),
-#            0x80+((code >> 18)&0x3F), 0x80+((code >> 12)&0x3F),
-#            0x80+((code >> 6)&0x3F), 0x80+(code&0x3F))
-#    return zstr(InternalUTF8toString(bytearray(s)))

-    # Here's an alternative, simpler implementation that only works for Mono:
-    if lslcommon.LSO:
-        raise ELSLCantCompute
-    if code <= 0 or code > 0x10FFFF:
-        if code == 0:
-            return u''
-        if code < 0:
-            return u'?'
-        if code >= 0x4000000:
-            return u'??????'
-        if code >= 0x200000:
-            return u'?????'
-        return u'????'
-    if (0xD800 <= code <= 0xDFFF) or code == 0xFFFE:
-        return u'???'
+    if (not 1 <= code <= 0x10FFFF or 0xD800 <= code <= 0xDFFF
+            or code == 0xFFFE or code == 0xFFFF):
+        return u'' if code == 0 else u'\uFFFD'
    return unichr(code)

 def llCos(f):
--- a/unit_tests/expr.suite/llord-char-hash.lsl
+++ b/unit_tests/expr.suite/llord-char-hash.lsl
@ -5,12 +5,15 @@
 , llOrd(".", 1)
 , llOrd(".", 2)
 , llOrd("ð", 0)
+, llOrd("𝄞𝐀", -1)
 , llOrd("𝄞𝐀", 0)
 , llOrd("𝄞𝐀", 1)
 , llOrd("𝄞𝐀", 2)
 , llOrd("𝄞𝐀", 3)
+, llOrd("𝄞𝐀", 4)
 , llOrd(JSON_TRUE, 0)
 , llOrd(llUnescapeURL("%EF%BF%BF"), 0)
+, llEscapeURL(llChar(-123456789))
 , llEscapeURL(llChar(-123))
 , llEscapeURL(llChar(-1))
 , llEscapeURL(llChar(0))
--- a/unit_tests/expr.suite/llord-char-hash.out
+++ b/unit_tests/expr.suite/llord-char-hash.out
@ -5,30 +5,33 @@
 , 0
 , 0
 , 240
+, 119808
 , 119070
 , 119808
 , 0
 , 0
+, 0
 , 64982
 , 65535
-, "%3F"
-, "%3F"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
 , ""
 , "%01"
 , "%C2%A9"
 , "%C5%8D"
 , "%E2%80%90"
-, "%3F%3F%3F"
-, "%3F%3F%3F"
-, "%3F%3F%3F"
-, "%EF%BF%BF"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
 , "%F0%9F%98%80"
 , "%F4%8F%BF%BF"
-, "%3F%3F%3F%3F"
-, "%3F%3F%3F%3F%3F"
-, "%3F%3F%3F%3F%3F%3F"
-, "%3F%3F%3F%3F%3F%3F"
-, "%3F"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
 , 1203819346
 , 0
 , 1172851538
--- a/unit_tests/expr.suite/string-funcs.lsl
+++ b/unit_tests/expr.suite/string-funcs.lsl
@ -182,9 +182,14 @@
 , llGetSubString("abcd",  9,  3)
 , llGetSubString("abcd",  9,  4)
 , llGetSubString("abcd",  9,  5)
+, llGetSubString("😀bcd",  0,  0)
+, llGetSubString("😀bcd",  1,  1)
+, llGetSubString("😀bcd",  2,  2)
+, llGetSubString("😀bcd",  3,  3)
+, llGetSubString("😀bcd",  4,  4)
 , llGetSubString("", 0, -1)
 , llStringLength("")
-, llStringLength("÷½¬⅛⅜⅝⅞±°z")
+, llStringLength("÷½¬⅛⅜⅝⅞😀±°z")
 , llSubStringIndex("x", "blah")
 , llSubStringIndex("", "")
 , llSubStringIndex("", "x")
--- a/unit_tests/expr.suite/string-funcs.out
+++ b/unit_tests/expr.suite/string-funcs.out
@ -182,9 +182,14 @@
 , "abcd"
 , "abcd"
 , "abcd"
+, "😀"
+, "b"
+, "c"
+, "d"
+, ""
 , ""
 , 0
-, 10
+, 11
 , -1
 , 0
 , -1