LSL-PyOptimizer/strutil.py

#    (C) Copyright 2015-2024 Sei Lisa. All rights reserved.
#
#    This file is part of LSL PyOptimizer.
#
#    LSL PyOptimizer is free software: you can redistribute it and/or
#    modify it under the terms of the GNU General Public License as
#    published by the Free Software Foundation, either version 3 of the
#    License, or (at your option) any later version.
#
#    LSL PyOptimizer is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with LSL PyOptimizer. If not, see <http://www.gnu.org/licenses/>.

# String <-> Bytes conversion and output utilities

# Microsoft Windows may report the non-standard codec name 'cp65001';
# answer lookups for that name with the regular UTF-8 codec.
import codecs


def _cp65001_search(name):
    """Codec search hook: resolve 'cp65001' to UTF-8, decline anything else."""
    if name == 'cp65001':
        return codecs.lookup('utf8')
    return None


codecs.register(_cp65001_search)

import sys

python2Narrow = False
if sys.version_info[0] >= 3:
    # Python 3: bind the Python 2 spellings to their Python 3 equivalents.
    python2, python3 = False, True
    unicode = uniwrap = str
    unichr = chr
    xrange = range
    bytewrap = bytes

    def str2u(s, enc=None):
        """Return s unchanged: a native Python 3 str is already Unicode.

        enc is accepted for signature parity with the other converters
        and is ignored.
        """
        return s

    def str2b(s, enc=None):
        """Encode a native Python 3 str to bytes.

        enc may be an encoding name, an object carrying an 'encoding'
        attribute (such as a text stream), or None. A missing or empty
        encoding means UTF-8. Characters that cannot be encoded are
        emitted as backslash escapes.
        """
        codec = getattr(enc, 'encoding', enc)
        if not codec:
            codec = 'utf8'
        return s.encode(codec, 'backslashreplace')

    def u2str(s, enc=None):
        """Return s unchanged: Unicode is the native str type in Python 3.

        enc is accepted for signature parity with the other converters
        and is ignored.
        """
        return s

    def b2str(s, enc=None):
        """Decode a bytes string to a native Python 3 str.

        enc may be an encoding name, an object carrying an 'encoding'
        attribute (such as a text stream), or None. A missing or empty
        encoding means UTF-8. Undecodable input is substituted using the
        'replace' error handler.
        """
        codec = getattr(enc, 'encoding', enc) or 'utf8'
        return s.decode(codec, 'replace')

    def any2str(s, enc=None):
        """Convert Bytes or Unicode to native Python 3 str.

        s:   a str, returned unchanged (instances of str subclasses
             included), or a bytes object, which is decoded by b2str().
        enc: encoding name, object with an 'encoding' attribute, or None;
             only used when s has to be decoded.
        """
        # isinstance() rather than an exact type comparison: a str subclass
        # is already text and has no decode() method for b2str() to call.
        return s if isinstance(s, str) else b2str(s, enc)

else:
    # Python 2 branch: bind the builtins to module-level names, giving
    # this module the same set of identifiers as the Python 3 branch.
    unicode = unicode
    unichr = unichr
    xrange = xrange
    python2 = True
    python3 = False
    # Default wrapper is the plain unicode type; the narrow-build section
    # below rebinds uniwrap to a subclass when it applies.
    uniwrap = unicode
    bytewrap = bytearray

    def str2u(s, enc=None):
        """Decode a native Python 2 str (a byte string) to Unicode.

        enc may be an encoding name, an object carrying an 'encoding'
        attribute (such as a stream), or None. A missing or empty
        encoding means UTF-8. Undecodable input is substituted using the
        'replace' error handler.
        """
        codec = getattr(enc, 'encoding', enc)
        if not codec:
            codec = 'utf8'
        return s.decode(codec, 'replace')

    def str2b(s, enc=None):
        """Return s unchanged: a native Python 2 str is already bytes.

        enc is accepted for signature parity with the other converters
        and is ignored.
        """
        return s

    def u2str(s, enc=None):
        """Encode a Unicode string to a native Python 2 str (byte string).

        enc may be an encoding name, an object carrying an 'encoding'
        attribute (such as a stream), or None. A missing or empty
        encoding means UTF-8. Characters that cannot be encoded are
        emitted as backslash escapes.
        """
        codec = getattr(enc, 'encoding', enc) or 'utf8'
        return s.encode(codec, 'backslashreplace')

    def b2str(s, enc=None):
        """Return s unchanged: bytes are the native str type in Python 2.

        enc is accepted for signature parity with the other converters
        and is ignored.
        """
        return s

    def any2str(s, enc=None):
        """Convert Bytes or Unicode to native Python 2 str.

        s:   a str, returned unchanged (instances of str subclasses
             included), or a Unicode string, which is encoded by u2str().
        enc: encoding name, object with an 'encoding' attribute, or None;
             only used when s has to be encoded.
        """
        # isinstance() rather than an exact type comparison: a str subclass
        # is already a byte string and must not go through u2str()'s
        # encode step.
        return s if isinstance(s, str) else u2str(s, enc)

    if len(u'\U00010001') == 2:
        # Narrow character build (UTF-16 strings)
        # Monkey-patch the relevant functions
        # The test above holds when one code point above U+FFFF occupies
        # two string units. The builtins are saved under underscore names
        # before the module-level replacements below shadow them.
        python2Narrow = True
        _unichr = unichr
        _ord = ord
        _len = len

        def unichr(n):
            """Return the unicode string for code point n.

            Values in the range 0x10000 to 0x10FFFF are produced by
            decoding an 8-digit 'unicode-escape' sequence; every other
            value is handed to the builtin unichr.
            """
            if not (65536 <= n < 0x110000):
                return _unichr(n)
            return ('\\U%08X' % n).decode('unicode-escape')

        def ord(x):
            """Return the code point of x, joining a surrogate pair.

            A unicode string of two units whose first unit lies in
            0xD800-0xDBFF is combined into a single code point; anything
            else is handed to the builtin ord.
            """
            if isinstance(x, unicode) and _len(x) == 2:
                # Convert to a plain unicode object; presumably so that the
                # indexing below works on string units and not through
                # uniwrap.__getitem__ -- TODO confirm.
                x = unicode(x)
                if 0xD800 <= _ord(x[0]) < 0xDC00:
                    # 10 low bits of each unit, offset by 0x10000.
                    return 65536 + ((_ord(x[0]) & 0x3FF) << 10
                        | (_ord(x[1]) & 0x3FF))
            return _ord(x)

        def len(x):
            """Return the length of x, counting code points for unicode.

            Unicode input is encoded as UTF-32LE and the byte count is
            divided by four; other objects go to the builtin len.
            """
            if isinstance(x, unicode):
                return _len(x.encode('utf-32le')) >> 2
            return _len(x)

        # Alas, we can't monkey-patch the unicode class' __getitem__ and
        # __getslice__ methods; we need a workaround.
        class uniwrap(unicode):
            """unicode subclass whose indexing and slicing go through a
            UTF-32LE encoding of the string, four bytes per position."""
            def __getslice__(self, start, stop):
                # Largest index that can be multiplied by four without
                # exceeding sys.maxint.
                lim = sys.maxint >> 2
                # Negative bounds are clamped to zero.
                if start < 0: start = 0
                if stop < 0: stop = 0
                # Scale both bounds to byte offsets, saturating at
                # sys.maxint when the scaled value would be too large.
                if start < lim:
                    start <<= 2
                else:
                    start = sys.maxint
                if stop < lim:
                    stop <<= 2
                else:
                    stop = sys.maxint
                return self.encode('utf-32le')[start:stop].decode(
                    'utf-32le')
            def __getitem__(self, item):
                if type(item) == slice:
                    start = item.start
                    stop = item.stop
                    step = item.step
                    # Scale whichever slice fields are present to byte
                    # units.
                    if start is not None:
                        start <<= 2
                    if stop is not None:
                        stop <<= 2
                    # NOTE(review): with a step, the byte-string slice below
                    # picks single bytes spaced 4*step apart, not whole
                    # four-byte groups, so extended slices look unsupported
                    # -- confirm.
                    if step is not None:
                        step <<= 2
                    return self.encode('utf-32le')[start:stop:step].decode(
                        'utf-32le')
                u = self.encode('utf-32le')
                # Integer index: scale to a byte offset.
                item <<= 2
                if item >= _len(u):
                    return u[item]  # raise IndexError, as slicing doesn't
                # For the last position (offset -4) the end must be None:
                # item+4 would be 0 and give an empty slice.
                # NOTE(review): an index below -len is not caught by the
                # check above and appears to produce a short or empty slice
                # and no IndexError -- confirm.
                return u[item:(item+4 if item != -4 else None)].decode(
                    'utf-32le')

def b2u(s, enc=None):
    """Convert Bytes to Unicode by going through the native str type.

    enc is passed to both conversion steps, b2str() and then str2u().
    """
    native = b2str(s, enc)
    return str2u(native, enc)

def u2b(s, enc=None):
    """Convert Unicode to Bytes by going through the native str type.

    enc is passed to both conversion steps, u2str() and then str2b().
    """
    native = u2str(s, enc)
    return str2b(native, enc)

def any2b(s, enc=None):
    """Convert Bytes or Unicode to Bytes.

    s:   a bytes object, returned unchanged (instances of bytes
         subclasses included), or a Unicode string, which is converted
         by u2b().
    enc: passed on to u2b(); only used when s has to be converted.
    """
    # isinstance() rather than an exact type comparison: a bytes subclass
    # is already binary data and must not be sent down the text-encoding
    # path.
    return s if isinstance(s, bytes) else u2b(s, enc)

def any2u(s, enc=None):
    """Convert Bytes or Unicode to Unicode.

    s:   a Unicode string, returned unchanged (instances of unicode
         subclasses such as uniwrap included), or a bytes string, which
         is converted by b2u().
    enc: passed on to b2u(); only used when s has to be converted.
    """
    # isinstance() rather than an exact type comparison: a unicode subclass
    # is already text and must not be sent down the byte-decoding path.
    return s if isinstance(s, unicode) else b2u(s, enc)

def werr(s):
    """Write s to standard error.

    s is first converted to the native str type by any2str(), which is
    given the stderr stream object as its encoding argument.
    """
    stream = sys.stderr
    stream.write(any2str(s, stream))

def wout(s):
    """Write s to standard output.

    s is first converted to the native str type by any2str(), which is
    given the stdout stream object as its encoding argument.
    """
    stream = sys.stdout
    stream.write(any2str(s, stream))

# Marker constant; presumably gives importers a name to reference so that
# linters such as pyflakes do not flag the import as unused -- TODO confirm.
strutil_used = True
New copyright year 2024-04-14 02:40:21 -07:00			`# (C) Copyright 2015-2024 Sei Lisa. All rights reserved.`
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00			`#`
			`# This file is part of LSL PyOptimizer.`
			`#`
			`# LSL PyOptimizer is free software: you can redistribute it and/or`
			`# modify it under the terms of the GNU General Public License as`
			`# published by the Free Software Foundation, either version 3 of the`
			`# License, or (at your option) any later version.`
			`#`
			`# LSL PyOptimizer is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with LSL PyOptimizer. If not, see <http://www.gnu.org/licenses/>.`

			`# String <-> Bytes conversion and output utilities`

Fix codec problem on MSW 2019-01-31 03:49:31 -07:00			`# Microsoft again not following standards. Sigh.`
			`import codecs`
			`codecs.register(lambda x: codecs.lookup('utf8') if x == 'cp65001' else None)`

First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00			`import sys`
Monkey-patch some string functions in narrow string builds The official distribution of Python 2.7 on Windows is built with "narrow strings" (UTF-16 strings with no proper indexing or length). This makes some tests fail. "Fix" this by monkey-patching a few functions and using a wrapping unicode class, as we can't monkey-patch the actual unicode type. This is very fragile code, but it's the best we could do given the limitations. 2022-12-11 12:39:44 -07:00
llHMAC implementation (without importing hmac) 2023-02-03 13:48:33 -07:00			`python2Narrow = False`
Style fixes Calm pyflakes by using identifiers, and change hexversion to version.major. 2019-01-18 15:41:45 -07:00			`if sys.version_info.major >= 3:`
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00			`unicode = str`
			`unichr = chr`
Additions for strutil: xrange, python2, python3, any2str Also output to stderr with str instead of unicode. 2020-11-08 18:12:29 -07:00			`xrange = range`
			`python3 = True`
			`python2 = False`
Monkey-patch some string functions in narrow string builds The official distribution of Python 2.7 on Windows is built with "narrow strings" (UTF-16 strings with no proper indexing or length). This makes some tests fail. "Fix" this by monkey-patching a few functions and using a wrapping unicode class, as we can't monkey-patch the actual unicode type. This is very fragile code, but it's the best we could do given the limitations. 2022-12-11 12:39:44 -07:00			`uniwrap = unicode`
llHMAC implementation (without importing hmac) 2023-02-03 13:48:33 -07:00			`bytewrap = bytes`
Additions for strutil: xrange, python2, python3, any2str Also output to stderr with str instead of unicode. 2020-11-08 18:12:29 -07:00
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00			`def str2u(s, enc=None):`
			`"""Convert a native Python3 str to Unicode. This is a NOP."""`
			`return s`

			`def str2b(s, enc=None):`
			`"""Convert a native Python3 str to bytes, with the given encoding."""`
Misc small non-user-visible fixes Some are Unicode vs Str stuff, others are style adjustments, others in unused functions, and so on. 2020-11-08 17:51:24 -07:00			`return s.encode(getattr(enc, 'encoding', enc) or 'utf8',`
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00			`'backslashreplace')`

			`def u2str(s, enc=None):`
			`"""Convert a Unicode string to native Python 3 str. This is a NOP."""`
			`return s`

			`def b2str(s, enc=None):`
			`"""Convert a Bytes string to native Python 3 str."""`
			`return s.decode(getattr(enc, 'encoding', enc) or 'utf8',`
'backslashreplace' makes no sense for str.decode() 2019-01-18 12:33:02 -07:00			`'replace')`
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00
Additions for strutil: xrange, python2, python3, any2str Also output to stderr with str instead of unicode. 2020-11-08 18:12:29 -07:00			`def any2str(s, enc=None):`
			`"""Convert Bytes or Unicode to native Python 3 str."""`
			`return s if type(s) == str else b2str(s, enc)`

First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00			`else:`
Additions for strutil: xrange, python2, python3, any2str Also output to stderr with str instead of unicode. 2020-11-08 18:12:29 -07:00			`unicode = unicode`
			`unichr = unichr`
			`xrange = xrange`
			`python2 = True`
			`python3 = False`
Monkey-patch some string functions in narrow string builds The official distribution of Python 2.7 on Windows is built with "narrow strings" (UTF-16 strings with no proper indexing or length). This makes some tests fail. "Fix" this by monkey-patching a few functions and using a wrapping unicode class, as we can't monkey-patch the actual unicode type. This is very fragile code, but it's the best we could do given the limitations. 2022-12-11 12:39:44 -07:00			`uniwrap = unicode`
llHMAC implementation (without importing hmac) 2023-02-03 13:48:33 -07:00			`bytewrap = bytearray`
Additions for strutil: xrange, python2, python3, any2str Also output to stderr with str instead of unicode. 2020-11-08 18:12:29 -07:00
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00			`def str2u(s, enc=None):`
			`"""Convert a native Python2 str to Unicode."""`
			`return s.decode(getattr(enc, 'encoding', enc) or 'utf8',`
'backslashreplace' makes no sense for str.decode() 2019-01-18 12:33:02 -07:00			`'replace')`
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00
			`def str2b(s, enc=None):`
			`"""Convert a native Python2 str to bytes. This is a NOP."""`
			`return s`

			`def u2str(s, enc=None):`
			`"""Convert a Unicode string to native Python 2 str."""`
Misc small non-user-visible fixes Some are Unicode vs Str stuff, others are style adjustments, others in unused functions, and so on. 2020-11-08 17:51:24 -07:00			`return s.encode(getattr(enc, 'encoding', enc) or 'utf8',`
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00			`'backslashreplace')`

			`def b2str(s, enc=None):`
			`"""Convert a Bytes string to native Python 2 str. This is a NOP."""`
			`return s`

Additions for strutil: xrange, python2, python3, any2str Also output to stderr with str instead of unicode. 2020-11-08 18:12:29 -07:00			`def any2str(s, enc=None):`
			`"""Convert Bytes or Unicode to native Python 2 str."""`
			`return s if type(s) == str else u2str(s, enc)`

Monkey-patch some string functions in narrow string builds The official distribution of Python 2.7 on Windows is built with "narrow strings" (UTF-16 strings with no proper indexing or length). This makes some tests fail. "Fix" this by monkey-patching a few functions and using a wrapping unicode class, as we can't monkey-patch the actual unicode type. This is very fragile code, but it's the best we could do given the limitations. 2022-12-11 12:39:44 -07:00			`if len(u'\U00010001') == 2:`
			`# Narrow character build (UTF-16 strings)`
			`# Monkey-patch the relevant functions`
			`python2Narrow = True`
			`_unichr = unichr`
			`_ord = ord`
			`_len = len`

			`def unichr(n):`
			`if not (65536 <= n < 0x110000):`
			`return _unichr(n)`
			`return ('\\U%08X' % n).decode('unicode-escape')`

			`def ord(x):`
			`if isinstance(x, unicode) and _len(x) == 2:`
			`x = unicode(x)`
			`if 0xD800 <= _ord(x[0]) < 0xDC00:`
			`return 65536 + ((_ord(x[0]) & 0x3FF) << 10`
			`\| (_ord(x[1]) & 0x3FF))`
			`return _ord(x)`

			`def len(x):`
			`if isinstance(x, unicode):`
			`return _len(x.encode('utf-32le')) >> 2`
			`return _len(x)`

			`# Alas, we can't monkey-patch the unicode class' __getitem__ and`
			`# __getslice__ methods; we need a workaround.`
			`class uniwrap(unicode):`
			`def __getslice__(self, start, stop):`
			`lim = sys.maxint >> 2`
			`if start < 0: start = 0`
			`if stop < 0: stop = 0`
			`if start < lim:`
			`start <<= 2`
			`else:`
			`start = sys.maxint`
			`if stop < lim:`
			`stop <<= 2`
			`else:`
			`stop = sys.maxint`
			`return self.encode('utf-32le')[start:stop].decode(`
			`'utf-32le')`
			`def __getitem__(self, item):`
			`if type(item) == slice:`
			`start = item.start`
			`stop = item.stop`
			`step = item.step`
			`if start is not None:`
			`start <<= 2`
			`if stop is not None:`
			`stop <<= 2`
			`if step is not None:`
			`step <<= 2`
			`return self.encode('utf-32le')[start:stop:step].decode(`
			`'utf-32le')`
			`u = self.encode('utf-32le')`
			`item <<= 2`
			`if item >= _len(u):`
			`return u[item] # raise IndexError, as slicing doesn't`
			`return u[item:(item+4 if item != -4 else None)].decode(`
			`'utf-32le')`
Additions for strutil: xrange, python2, python3, any2str Also output to stderr with str instead of unicode. 2020-11-08 18:12:29 -07:00
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00			`def b2u(s, enc=None):`
			`"""Bytes to Unicode"""`
			`return str2u(b2str(s, enc), enc)`

			`def u2b(s, enc=None):`
			`"""Unicode to Bytes"""`
Misc small non-user-visible fixes Some are Unicode vs Str stuff, others are style adjustments, others in unused functions, and so on. 2020-11-08 17:51:24 -07:00			`return str2b(u2str(s, enc), enc)`
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00
			`def any2b(s, enc=None):`
			`"""Bytes or Unicode to Bytes"""`
			`return s if type(s) == bytes else u2b(s, enc)`

			`def any2u(s, enc=None):`
			`"""Bytes or Unicode to Unicode"""`
			`return s if type(s) == unicode else b2u(s, enc)`

			`def werr(s):`
			`"""Write any string to stderr"""`
Additions for strutil: xrange, python2, python3, any2str Also output to stderr with str instead of unicode. 2020-11-08 18:12:29 -07:00			`sys.stderr.write(any2str(s, sys.stderr))`
First baby steps towards dual Python2+3 compatibility 2019-01-15 12:27:02 -07:00
			`def wout(s):`
			`"""Write any string to stdout"""`
Additions for strutil: xrange, python2, python3, any2str Also output to stderr with str instead of unicode. 2020-11-08 18:12:29 -07:00			`sys.stdout.write(any2str(s, sys.stdout))`
Style fixes Calm pyflakes by using identifiers, and change hexversion to version.major. 2019-01-18 15:41:45 -07:00
			`strutil_used = True`