Implement the shrinknames option.

Also fixes some bugs in the handling of the 'scope' attribute, and some others in the output of renamed identifiers.
2025-07-01 23:58:20 +00:00 · 2014-08-01 05:07:50 +02:00 · 2014-08-01 05:07:50 +02:00 · 6c248c46e3
commit 6c248c46e3
parent 847d7b1e20
5 changed files with 174 additions and 15 deletions
--- a/lslopt/lsloptimizer.py
+++ b/lslopt/lsloptimizer.py
@ -2,7 +2,9 @@
 import lslfuncs
 from lslparse import warning

-class optimizer(object):
+from lslrenamer import renamer
+
+class optimizer(renamer):

    # Default values per type when declaring variables
    DefaultValues = {'integer': 0, 'float': 0.0, 'string': u'',
@ -606,6 +608,9 @@ class optimizer(object):
            else:
                self.FoldTree(tree, idx)

+        if 'shrinknames' in options:
+            self.AssignNewNames()
+
        treesymtab = (self.tree, self.symtab)
        del self.tree
        del self.symtab
--- a/lslopt/lsloutput.py
+++ b/lslopt/lsloutput.py
@ -135,11 +135,16 @@ class outscript(object):
    def dent(self):
        return self.indent * self.indentlevel

-    def FindName(self, node):
-        try:
+    def FindName(self, node, scope = None):
+        if scope is None:
+            # node is a node
+            if 'scope' in node and'NewName' in self.symtab[node['scope']][node['name']]:
                return self.symtab[node['scope']][node['name']]['NewName']
-        except KeyError:
            return node['name']
+        # node is a name
+        if 'NewName' in self.symtab[scope][node]:
+            return self.symtab[scope][node]['NewName']
+        return node

    def OutIndented(self, node):
        if node['nt'] != '{}':
@ -300,7 +305,9 @@ class outscript(object):
            if node['t'] is not None:
                ret += node['t'] + ' '
            ret += self.FindName(node) + '('
-            ret += ', '.join(typ + ' ' + name for typ, name in zip(node['ptypes'], node['pnames']))
+            scope = node['pscope']
+            ret += ', '.join(typ + ' ' + self.FindName(name, scope)
+                             for typ, name in zip(node['ptypes'], node['pnames']))
            return ret + ')\n' + self.OutCode(child[0])

        return self.dent() + self.OutExpr(node) + ';\n'
--- a/lslopt/lslparse.py
+++ b/lslopt/lslparse.py
@ -638,8 +638,10 @@ class parser(object):
            args = self.Parse_optional_expression_list(sym['ParamTypes'])
            self.expect(')')
            self.NextToken()
-            return {'nt':'FNCALL', 't':sym['Type'], 'name':name,
-                'scope':self.scopeindex, 'ch':args}
+            ret = {'nt':'FNCALL', 't':sym['Type'], 'name':name, 'ch':args}
+            if 'Scope' in sym:
+                ret['scope'] = sym['Scope']
+            return ret
        if sym['Kind'] != 'v':
            raise EParseTypeMismatch(self)
        typ = sym['Type']
@ -1170,7 +1172,7 @@ class parser(object):
                        x = random.randint(0, 16777215)
                        unique += b64encode(chr(x>>16) + chr((x>>8)&255)
                            + chr(x&255)).replace('+', '_')
-                        if '/' not in unique and unique not in self.locallabels:
+                        if '/' not in unique not in self.locallabels:
                            break
                else:
                    # Use the existing name. Faster and more readable.
@ -1200,11 +1202,11 @@ class parser(object):
                # It might still be a forward reference, so we add it to the
                # list of things to look up when done
                self.jump_lookups.append((name, self.scopeindex, self.errorpos, jumpnode))
+            else:
+                jumpnode['scope'] = sym['Scope']
            self.NextToken()
            self.expect(';')
            self.NextToken()
-            if sym is not None:
-                jumpnode['scope'] = sym['Scope']
            return jumpnode
        if tok0 == 'STATE':
            self.NextToken()
@ -1217,7 +1219,7 @@ class parser(object):
            self.NextToken()
            self.expect(';')
            self.NextToken()
-            return {'nt':'STSW', 't':None, 'name':name}
+            return {'nt':'STSW', 't':None, 'name':name, 'scope':0}
        if tok0 == 'RETURN':
            self.NextToken()
            if self.tok[0] == ';':
@ -1609,7 +1611,7 @@ class parser(object):
            events = self.Parse_events()

            self.expect('}')
-            self.tree.append({'nt':'STDEF', 't':None, 'name':name, 'ch':events})
+            self.tree.append({'nt':'STDEF', 't':None, 'name':name, 'scope':0, 'ch':events})
            self.NextToken()

    def Parse_script(self):
@ -1633,10 +1635,11 @@ class parser(object):
        # Check the pending jump targets
        for tgt in self.jump_lookups:
            self.scopeindex = tgt[1]
-            if self.FindSymbolPartial(tgt[0], MustBeLabel = True) is None:
+            sym = self.FindSymbolPartial(tgt[0], MustBeLabel = True)
+            if sym is None:
                self.errorpos = tgt[2]
                raise EParseUndefined(self)
-            tgt[3]['scope'] = tgt[1]
+            tgt[3]['scope'] = sym['Scope']

        del self.jump_lookups # Finished with it.

--- a/lslopt/lslrenamer.py
+++ b/lslopt/lslrenamer.py
@ -0,0 +1,140 @@
+import random
+from base64 import b64encode
+
+class renamer(object):
+    """Base class that assigns short replacement names to script symbols.
+
+    The inheriting class is expected to provide self.symtab (a sequence of
+    symbol tables where index 0 holds the global symbols) and self.tree
+    before AssignNewNames() is called. The chosen names are stored under
+    the 'NewName' key of each symbol table entry.
+    """
+
+    # Characters allowed as the first character of an identifier
+    CharSet1 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_'
+    # Characters allowed in the rest of the identifier
+    CharSet2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_0123456789'
+    # Names that must not be generated, indexed by their length.
+    # TODO: Derive these from builtins.txt somehow.
+    KwByLen = ((), (), ('do', 'if', 'PI'), ('for', 'key', 'EOF'),
+        ('jump', 'else', 'list', 'TRUE', 'LOOP'))
+
+    def GetNextShortest(self):
+        """Generate the next shortest possible identifier.
+
+        Reads and advances self.WordCntr, which AssignNewNames initializes
+        to len(CharSet1). Every counter value maps to a distinct identifier
+        and identifiers never get shorter as the counter grows. Candidates
+        listed in KwByLen or present in self.UsedNames are skipped.
+        """
+        base1 = len(self.CharSet1)
+        base2 = len(self.CharSet2)
+        while True:
+            n = self.WordCntr
+            self.WordCntr += 1
+            ret = self.CharSet1[n % base1]
+
+            # Zero-based index of the tail (the characters after the first
+            # one). With the counter starting at base1, index 0 is the
+            # empty tail, the next base2 indices are the one-character
+            # tails, the next base2**2 the two-character tails, and so on.
+            tail = n // base1 - 1
+            taillen = 0
+            span = 1
+            while tail >= span:
+                tail -= span
+                span *= base2
+                taillen += 1
+            # Emit exactly taillen digits, so that tails with leading
+            # zero digits are produced and no two indices collide.
+            for _ in range(taillen):
+                ret += self.CharSet2[tail % base2]
+                tail //= base2
+
+            # KwByLen only covers short lengths; longer identifiers have
+            # no listed names to avoid.
+            if len(ret) < len(self.KwByLen):
+                reserved = self.KwByLen[len(ret)]
+            else:
+                reserved = ()
+            if ret not in reserved and ret not in self.UsedNames:
+                return ret
+
+    def PickShortName(self, ReusableNames):
+        """Return a new name for a variable or parameter.
+
+        Takes a name out of the ReusableNames set while any is left, and
+        records it in self.UsedNames so that GetNextShortest skips it from
+        then on. Otherwise returns the next shortest identifier.
+        """
+        if ReusableNames:
+            short = ReusableNames.pop()
+            self.UsedNames.add(short)
+            return short
+        return self.GetNextShortest()
+
+    def AssignNewNames(self):
+        """Choose a new name for every renamable symbol in self.symtab.
+
+        A 'NewName' entry is added to: states other than 'default', global
+        functions that have a 'Loc' entry, global variables, parameters
+        (local symbols with a 'Param' entry) and the remaining local
+        variables. Labels (kind 'l') are left untouched.
+
+        Sets self.WordCntr and self.UsedNames as a side effect. The names
+        given to non-parameter locals are random, so they differ from run
+        to run.
+        """
+        # Start at the first single-character identifier
+        self.WordCntr = len(self.CharSet1)
+
+        # Names that can be reused without penalty. The initial set is there
+        # since the beginning. Others (e.g. Key) are created when some kinds
+        # of stuff are present, but we don't take so many risks.
+        ReusableNames = set(['LslLibrary', 'LslUserScript', 'System'])
+
+        # Names from ReusableNames that have already been used
+        self.UsedNames = set()
+
+        # Random names already handed out to pure locals
+        UsedLocals = set()
+
+        # Make a first pass to separate the symbols into three categories.
+        globalvars = []
+        states = []
+        functions = []
+        globaldefs = self.symtab[0]
+        for name in globaldefs:
+            if name == -1: continue
+            kind = globaldefs[name]['Kind']
+            if kind == 's':
+                states.append(name)
+            elif kind == 'f':
+                # Only functions that have a 'Loc' entry are renamed
+                if 'Loc' in globaldefs[name]:
+                    functions.append(name)
+            elif kind == 'v':
+                globalvars.append(name)
+            else:
+                assert False, 'Invalid kind at this scope: ' + kind
+
+        # We make three passes, one for states, then functions, then globals,
+        # in that order.
+
+        for name in states:
+            # States have top priority. Here's why. An internal event function
+            # name is made by concatenating an 'e', then the state name, then
+            # the event name, e.g. edefaultstate_entry. Since a new identifier
+            # containing the state name is created for every event in that
+            # state, the shorter the state name, the fewer bytes it will use.
+            # Furthermore, a state switch instruction additionally adds a
+            # Unicode string (all other identifier names use one-byte
+            # strings), which is all the more reason to shorten it as much as
+            # possible.
+            #
+            # Unfortunately, there isn't much that can be done about 'default'.
+            #
+            # The good side is that we get to reuse these names for variables
+            # without using extra space and without wasting single or double
+            # letter identifiers.
+
+            entry = globaldefs[name]
+            if name != 'default':
+                name = entry['NewName'] = self.GetNextShortest()
+            # Find also the event names it uses, to add them for reuse.
+            for node in self.tree[entry['Loc']]['ch']:
+                assert node['nt'] == 'FNDEF'
+                ReusableNames.add('e' + name + node['name'])
+        del states
+
+        for name in functions:
+            # Assign a new name. Internal function names get a 'g' prepended
+            # to them, so these are candidates for reuse too.
+
+            # Unfortunately, we won't find any reusable name starting with 'g'
+            # this early, so no point in searching.
+
+            short = globaldefs[name]['NewName'] = self.GetNextShortest()
+            ReusableNames.add('g' + short)
+        del functions
+
+        for name in globalvars:
+            # Reusable names are consumed first; see PickShortName.
+            globaldefs[name]['NewName'] = self.PickShortName(ReusableNames)
+        del globalvars
+
+        # Do the same for function and event parameter names. Pure locals get
+        # long distinct names.
+        for scope, table in enumerate(self.symtab):
+            if scope == 0:
+                # Globals were handled above
+                continue
+            for name, sym in table.iteritems():
+                if name == -1: continue
+                if sym['Kind'] != 'v':
+                    # The only other kind accepted in a local table is a label
+                    assert sym['Kind'] == 'l'
+                    continue
+                if 'Param' in sym:
+                    # Same procedure as for global vars
+                    # Not the best strategy (using locally unique names would
+                    # work optimally) but hey. At the time of writing there's
+                    # no reference analysis. TODO: Implement.
+                    sym['NewName'] = self.PickShortName(ReusableNames)
+                    continue
+
+                # Generate a new random identifier: 'L_' followed by the
+                # base64 encoding of two random 24-bit numbers, with '+'
+                # replaced by '_'. Retry if it contains '/' or was already
+                # handed out.
+                while True:
+                    x = random.randint(0, 16777215)
+                    unique = 'L_' + b64encode(chr(x>>16) + chr((x>>8)&255)
+                        + chr(x&255)).replace('+', '_')
+                    x = random.randint(0, 16777215)
+                    unique += b64encode(chr(x>>16) + chr((x>>8)&255)
+                        + chr(x&255)).replace('+', '_')
+                    if '/' not in unique and unique not in UsedLocals:
+                        break
+                UsedLocals.add(unique)
+                sym['NewName'] = unique
--- a/main.py
+++ b/main.py
@ -50,6 +50,10 @@ Options (+ means active by default, - means inactive by default):
                       will go to the last label with that name). This flag
                       works around that limitation by replacing the names of
                       the labels in the output with unique ones.
+  shrinknames        - Reduces script memory by shrinking identifiers. In the
+                       process, it turns the script into unreadable gibberish,
+                       hard to debug, but this gets big savings for complex
+                       scripts.

 Note that the optimizer doesn't reorder expressions to fold constants. This
 means that e.g. a + 3 + 5 is not optimized to a + 8; however a + (3 + 5) is.