Update to latest pcpp, remove expression evaluator

PCPP now includes a decent expression evaluator of its own instead of relying on Python's eval(), so our evaluator is removed. The corresponding license text no longer applies, because the PCPP excerpt it covered has been removed. This patch also corrects the handling of include files that cannot be found: the on_include_not_found handler was supposed to raise an exception or return something, and it now takes an extra parameter (is_malformed). We just report an error and raise the special exception that pcpp uses to ignore and remove the directive. This patch also handles #error and invalid directives, which were passed through in previous versions. #warning, #pragma and #line are still passed through.
2025-07-01 07:38:21 +00:00 · 2020-10-29 20:24:00 +01:00 · 2020-10-29 20:24:00 +01:00 · 5e88adcff3
commit 5e88adcff3
parent 8db8872dbe
2 changed files with 23 additions and 645 deletions
--- a/cpreproc.py
+++ b/cpreproc.py
@ -14,567 +14,18 @@
 #
 #    You should have received a copy of the GNU General Public License
 #    along with LSL PyOptimizer. If not, see <http://www.gnu.org/licenses/>.
 #
 #    This file includes an excerpt from PCPP, by Niall Douglas and David
 #    Beazley. PCPP is available here: https://github.com/ned14/pcpp and
 #    the fragment used here was distributed under the following conditions:
 #
 #    (C) Copyright 2018-2019 Niall Douglas http://www.nedproductions.biz/
 #    (C) Copyright 2007-2019 David Beazley http://www.dabeaz.com/
 #
 #    All rights reserved.
 #
 #    Redistribution and use in source and binary forms, with or without
 #    modification, are permitted provided that the following conditions are
 #    met:
 #
 #  * Redistributions of source code must retain the above copyright notice,
 #    this list of conditions and the following disclaimer.
 #  * Redistributions in binary form must reproduce the above copyright notice,
 #    this list of conditions and the following disclaimer in the documentation
 #    and/or other materials provided with the distribution.
 #  * Neither the name of the David Beazley or Dabeaz LLC may be used to
 #    endorse or promote products derived from this software without
 #    specific prior written permission.
 #
 #    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 #    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 #    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 #    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 #    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 #    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 #    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 #    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 #    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 #    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 #    (End of terms and conditions for the PCPP excerpt)
 #
 #    The particular excerpt used is this one:
 # https://github.com/ned14/pcpp/blob/e1219ce157b4dfcfee3181faa6ec5129c3a41e78/pcpp/preprocessor.py#L873-L935
 #    The license that applies, reproduced above, is this one:
 # https://raw.githubusercontent.com/ned14/pcpp/e1219ce157b4dfcfee3181faa6ec5129c3a41e78/LICENSE.txt
 #
 #    The following fragments of code are hereby irrevokably donated to the
 #    public domain:
 #    - The Evaluator class in its entirety.
 #    - The evalexpr method in its entirety except for the excerpt mentioned
 #      above, which remains copyright of its authors.
 #    - Every line between this one and the Evaluator class.
-import sys, os, re, copy
+# Interface for Niall Douglas' and David M. Beazley's PCPP (a C preprocessor)
 import sys, os
 oldsyspath = sys.path
 sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                'pcpp'))
-from pcpp import preprocessor
+from pcpp import preprocessor, OutputDirective, Action
 path = oldsyspath
-# Define the number of bits to work with in expression evaluation
+DIRECTIVES_PASSED_THROUGH = {'warning', 'pragma', 'line'}
 # (per the standard, this should be the bits in uintmax_t).
 INTMAXBITS = 64
 UINTMAX_MAX = (1 << INTMAXBITS) - 1
 INTMAX_MIN = -(1 << (INTMAXBITS - 1))
 DSYMBOLS = {'->', '-=', '--', '==', '<<', '<=', '>>', '>=', '||', '|=',
            '&&', '&=', '!=', '^=', '*=', '/=', '%=', '+=', '++'}
 DIGRAPHS = {'<:':'[', ':>':']', '<%':'{', '%>':'}', '%:':'#'}
 ESCAPES = {'a':7,'b':8,'f':12,'n':10,'r':13,'t':9,'v':11,
           '"':34, '\\':92, '\'':39, '?':63}
 # Exception to report an evaluation error
 class EvalError(Exception): pass
 class uint(long): pass
 class sint(long): pass
 class Evaluator(object):
    """Recursive descendent parser to evaluate C preprocessor expressions."""
    # Int parser
    resolve_int_regex = re.compile(
        # Group 1: Hex
        # Group 2: Oct
        # Group 3: Dec
        # Group 4: Unsigned
        r'^(?:(0x[0-9a-f]+)|(0[0-7]*)|([1-9][0-9]*))'
        r'(?:(u(?:ll?)?|(?:ll?)?u)|(?:ll?)?)$', re.I | re.S)
    # Char parser (without the quotes)
    ctoken_regex = re.compile(
        r'\\(?:'
            r'[\?' r"'" r'"\\abfnrtv]|[Xx][0-9a-fA-F]+|[0-7]{1,3}'
            r'|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8}'
        r')'
        r'|.', re.S)
    def __init__(self, tokens):
        assert tokens, "Empty tokens list???"
        self.tokens = tokens
        self.ptr = 0
        self.evaluating = True
        self.conv = {uint: self.to_uint, sint: self.to_sint}
        self.nextToken()
    def to_uint(self, i):
        return uint(i & UINTMAX_MAX)
    def to_sint(self, i):
        return sint(((i - INTMAX_MIN) & UINTMAX_MAX) + INTMAX_MIN)
    def nextToken(self):
        """Sets self.token to the next token and advances the token pointer.
        Skips whitespace tokens. Returns a CPP_WS token with value '\n' if
        there's no next token. Returns synthesized tokens for multichar tokens
        not currently handled by PCPP.
        """
        try:
            while True:
                tok = self.token = self.tokens[self.ptr]
                self.ptr += 1
                # Eat whitespace except newlines, and /* */ comments
                if (tok.type == 'CPP_WS' and '\n' not in tok.value
                    or tok.type == 'CPP_COMMENT1'
                   ):
                    continue
                break
        except IndexError:
            # Synthesize a new CPP_WS token with a newline, to signal
            # end-of-text (we copy it from the last one in the token stream).
            tok = self.token = copy.copy(self.tokens[-1])
            tok.type = 'CPP_WS'
            tok.value = '\n'
            return
        # Single-line comments are line terminators; convert them
        if tok.type == 'CPP_COMMENT2':
            tok = self.token = copy.copy(tok)
            tok.type = 'CPP_WS'
            tok.value = '\n'
            return
        # Work around a lexing problem in PCPP
        #
        # PCPP doesn't tokenize multichar tokens except ##, so we do that job
        # here, to ease processing and report more errors (e.g. 5--3 should be
        # reported as an error because it uses the post-decrement operator,
        # instead of evaluating to 8, which is the correct result for 5- -3).
        # The tokens processed here are those in the C standard missed by PCPP:
        #   -> -= -- << <= >> >= || |= && &= == != ^= *= /= += ++ %=
        #   >>= <<=
        #   ...
        #   <: :> <% %> %:
        #   %:%:
        #
        # This is already a single token, therefore it's not processed here:
        #   ##
        try:
            next = self.tokens[self.ptr]
        except IndexError:
            return
        s = tok.type + next.type
        if s in DSYMBOLS:
            tok = self.token = copy.copy(tok)
            tok.type = s
            tok.value += next.value
            self.ptr += 1
            if s in ('<<', '>>'):
                # check for <<= >>=
                try:
                    next2 = self.tokens[self.ptr]
                    if next2.type == '=':
                        tok.type += next2.type
                        tok.value += next2.value
                        self.ptr += 1
                except IndexError:
                    pass
            return
        if s in DIGRAPHS:
            # digraph or DPOUND
            tok = self.token = copy.copy(tok)
            tok.type = DIGRAPHS[s]
            tok.value += next.value
            self.ptr += 1
            try:
                next2 = self.tokens[self.ptr]
                next3 = self.tokens[self.ptr + 1]
                if next2.type == '%' and next3.type == ':':
                    tok.type = '##'
                    tok.value += next2.value + next3.value
                    self.ptr += 2
            except IndexError:
                pass
            return
        if s == '..':
            try:
                next2 = self.tokens[self.ptr + 1]
                if next2.type == '.':
                    tok = self.token = copy.copy(tok)
                    tok.type = '...'
                    tok.value += next.value + next2.value
                    self.ptr += 2
            except IndexError:
                pass
            return
    def eat(self, *toktypes):
        """Return True and advance pointer if the current token matches. """
        if self.token.type in toktypes:
            self.nextToken()
            return True
        return False
    def expect(self, toktype):
        """Checks an expected token and eats it"""
        expect = toktype
        if toktype == 'END' and '\n' in self.token.value:
            expect = 'CPP_WS'
        if not self.eat(expect):
            raise EvalError(
                "Unexpected token %s (%s) in expression, expected %s"
                % (repr(self.token.value), self.token.type, toktype))
    def conversions(self, op1, op2):
        """Perform usual arithmetic conversions on two operands."""
        assert type(op1) in (sint, uint) and type(op2) in (sint, uint)
        if type(op1) != type(op2):
            return self.to_uint(op1), self.to_uint(op2)
        return op1, op2
    def primary_expression(self, evaluating):
        """Non-terminal: primary_expression.
        primary_expression:
            IDENTIFIER | STRING_LITERAL | CHAR_LITERAL | INTEGER
            | '(' expression ')'
        """
        tok = self.token
        if self.eat('('):
            ret = self.expression(evaluating)
            self.expect(')')
            return ret
        #if self.eat('CPP_STRING'):
        #    return tok.value
        if self.eat('CPP_CHAR'):
            charstr = tok.value
            unicode = False
            if tok.value.startswith('L'):
                unicode = True
                charstr = charstr[2:-1]
            else:
                charstr = charstr[1:-1]
            onechar = False
            for ctok in self.ctoken_regex.finditer(charstr):
                if onechar:
                    raise EvalError("Multiple characters in char literal")
                onechar = True
                c = ctok.group(0)
                if c == '\\':
                    raise EvalError("Invalid escape sequence in char literal")
                if c.startswith('\\'):
                    if c.startswith('\\u') or c.startswith('\\U'):
                        result = int(c[2:], 16)
                        if ((result < 0xA0 and result not in (0x24,0x40,0x60))
                            or 0xD800 <= result <= 0xDFFF
                           ):
                            raise EvalError("Invalid universal character %s"
                                % c)
                        if result > 0xFF and not unicode:
                            raise EvalError("Char literal out of range")
                    elif c.startswith('\\x') or c.startswith('\\X'):
                        result = int(c[2:], 16)
                        if result > 0xFF:
                            raise EvalError("Hex literal out of range")
                    elif c[1] in ESCAPES:
                        result = ESCAPES[c[1]]
                    else:
                        result = int(c[1:], 8)
                else:
                    assert len(c) == 1 and c != '\''
                    return ord(c)
            # This may need reconsideration if INTMAXBITS is < 22 (the bits
            # necessary to fit a Unicode codepoint in a signed integer).
            return sint(result)  # our char is unsigned
        if tok.type == 'CPP_ID':
            tok = self.token = copy.copy(tok)
            tok.type = 'CPP_INTEGER'
            tok.value = '0'
            # fall through to process it as CPP_INTEGER
        if self.eat('CPP_INTEGER'):
            m = self.resolve_int_regex.search(tok.value)
            if not m:
                raise EvalError("Invalid integer literal")
            val = (int(m.group(2), 8) if m.group(2)
                   else int(m.group(1) or m.group(3), 0))
            val = (self.to_uint(val)
                   if m.group(4)
                      or val >= -INTMAX_MIN and m.group(3) is None
                   else self.to_sint(val))
            return val
        if tok.type == 'CPP_STRING':
            raise EvalError("Strings are not allowed in expressions")
        if tok.type == 'CPP_WS' and '\n' in tok.value:
            raise EvalError('Unexpected end of expression')
        self.expect('CPP_INTEGER')
    def factor_expression(self, evaluating):
        """Non-terminal: factor_expression
        factor_expression:
            primary_expression
            | unary_operator factor_expression
        """
        # Avoid recursing for unary operators. Apply them post-evaluation.
        k = None
        while True:
            toktype = self.token.type
            if self.eat('-', '+', '~', '!') and toktype != '+':
                k = k or []
                k.append(toktype)
            else:
                break
        result = self.primary_expression(evaluating)
        while k:
            operation = k.pop()
            if operation == '!':
                result = sint(0 if result else 1)
            else:
                result = self.conv[type(result)](-result if operation == '-'
                else ~result)
        return result
    def term_expression(self, evaluating):
        """Non-terminal: term_expression
        term_expression:
            factor_expression
            | term_expression '*' factor_expression
            | term_expression '/' factor_expression
            | term_expression '%' factor_expression
        """
        result = self.factor_expression(evaluating)
        while True:
            toktype = self.token.type
            if not self.eat('*', '/', '%'):
                return result
            operand = self.factor_expression(evaluating)
            if evaluating and operand == 0 and toktype != '*':
                raise EvalError("Division by zero")
            result, operand = self.conversions(result, operand)
            result = self.conv[type(result)](result if not evaluating
                else result * operand if toktype == '*'
                else result // operand if toktype == '/'
                else result % operand)
    def arithmetic_expression(self, evaluating):
        """Non-terminal: arithmetic_expression
        arithmetic_expression:
            term_expression
            | arithmetic_expression '+' term_expression
            | arithmetic_expression '-' term_expression
        """
        result = self.term_expression(evaluating)
        while True:
            toktype = self.token.type
            if not self.eat('+', '-'):
                return result
            operand = self.term_expression(evaluating)
            result, operand = self.conversions(result, operand)
            result = self.conv[type(result)](result + operand if toktype == '+'
                else result - operand)
    def shift_expression(self, evaluating):
        """Non-terminal: shift_expression
        shift_expression:
            arithmetic_expression
            | shift_expression '<<' arithmetic_expression
            | shift_expression '>>' arithmetic_expression
        """
        result = self.arithmetic_expression(evaluating)
        while True:
            tok = self.token
            if not self.eat('<<', '>>'):
                return result
            operand = self.arithmetic_expression(evaluating)
            # We don't want a too large intermediate result, to prevent DoS
            result = self.conv[type(result)](result << min(operand, INTMAXBITS)
                if tok.type == '<<' else result >> max(operand, 0))
    def relational_expression(self, evaluating):
        """Non-terminal: relational_expression
        relational_expression:
            shift_expression
            | relational_expression '>' shift_expression
            | relational_expression '<' shift_expression
            | relational_expression '>=' shift_expression
            | relational_expression '<=' shift_expression
        """
        result = self.shift_expression(evaluating)
        while True:
            tok = self.token
            if not self.eat('<', '>', '<=', '>='):
                return result
            operand = self.shift_expression(evaluating)
            result, operand = self.conversions(result, operand)
            # Use the fact that a < b  <->  b > a
            # Use the fact that a < b  <-> !(a >= b)
            if tok.type == '>' or tok.type == '<=':
                result, operand = operand, result
            result = sint(1 if (result < operand) == (tok.type in ('<', '>'))
                          else 0)
    def equality_expression(self, evaluating):
        """Non-terminal: equality_expression
        equality_expression:
            relational_expression
            | equality_expression '==' relational_expression
            | equality_expression '!=' relational_expression
        """
        result = self.relational_expression(evaluating)
        while True:
            tok = self.token
            if not self.eat('==', '!='):
                return result
            operand = self.relational_expression(evaluating)
            result, operand = self.conversions(result, operand)
            result = sint(1 if (result == operand) == (tok.type == '==')
                          else 0)
    def bitwise_and_expression(self, evaluating):
        """Non-terminal: bitwise_and_expression
        bitwise_and_expression:
            equality_expression
            | bitwise_and_expression '&' equality_expression
        """
        result = self.equality_expression(evaluating)
        while True:
            if not self.eat('&'):
                return result
            operand = self.equality_expression(evaluating)
            result, operand = self.conversions(result, operand)
            result = self.conv[type(result)](result & operand)
    def bitwise_xor_expression(self, evaluating):
        """Non-terminal: bitwise_xor_expression
        bitwise_xor_expression:
            bitwise_and_expression
            | bitwise_xor_expression '^' bitwise_and_expression
        """
        result = self.bitwise_and_expression(evaluating)
        while True:
            if not self.eat('^'):
                return result
            operand = self.bitwise_and_expression(evaluating)
            result, operand = self.conversions(result, operand)
            result = self.conv[type(result)](result ^ operand)
    def bitwise_or_expression(self, evaluating):
        """Non-terminal: bitwise_or_expression
        bitwise_or_expression:
            bitwise_xor_expression
            | bitwise_or_expression '|' bitwise_xor_expression
        """
        result = self.bitwise_xor_expression(evaluating)
        while True:
            if not self.eat('|'):
                return result
            operand = self.bitwise_xor_expression(evaluating)
            result, operand = self.conversions(result, operand)
            result = self.conv[type(result)](result | operand)
    def logical_and_expression(self, evaluating):
        """Non-terminal: logical_and_expression
        logical_and_expression:
            bitwise_or_expression
            | logical_and_expression '&&' bitwise_or_expression
        """
        result = self.bitwise_or_expression(evaluating)
        while True:
            if not self.eat('&&'):
                return result
            evaluating = evaluating and not not result
            operand = self.bitwise_or_expression(evaluating)
            result = sint(1 if result and (not evaluating or operand) else 0)
    def logical_or_expression(self, evaluating):
        """Non-terminal: logical_or_expression
        logical_or_expression:
            logical_and_expression
            | logical_or_expression '||' logical_and_expression
        """
        result = self.logical_and_expression(evaluating)
        while True:
            if not self.eat('||'):
                return result
            evaluating = evaluating and not result
            operand = self.logical_and_expression(evaluating)
            result = sint(1 if result or (evaluating and operand) else 0)
    def conditional_expression(self, evaluating):
        """Non-terminal: conditional_expression.
        conditional_expression:
            logical_or_expression
            | logical_or_expression '?' expression ':' conditional_expression
        """
        result = self.logical_or_expression(evaluating)
        if self.eat('?'):
            if result:
                result = self.expression(evaluating)
                self.expect(':')
                operand = self.conditional_expression(False)
            else:
                operand = self.expression(False)
                self.expect(':')
                result = self.conditional_expression(evaluating)
            result, operand = self.conversions(result, operand)
        return result
    def expression(self, evaluating = True):
        """Non-terminal: expression.
        expression:
            conditional_expression                (always)
            | expression conditional_expression   (if not evaluating)
        """
        if evaluating:
            return self.conditional_expression(evaluating)
        while True:
            result = self.conditional_expression(evaluating)
            if not self.eat(','):
                return result
    def evaluate(self):
        result = self.expression(True)
        # Did we eat all tokens?
        self.expect('END')
        return result
 class Preproc(preprocessor.Preprocessor):
    def __init__(self, input, params=()):
@ -622,95 +73,22 @@ class Preproc(preprocessor.Preprocessor):
        self.errors_present = True
        return super(Preproc, self).on_error(*args, **kwargs)
-    def on_include_not_found(self, is_system_include, curdir, includepath):
+    def on_include_not_found(self, is_malformed, is_system_include, curdir,
                             includepath):
        """Don't pass through the #include line if the file does not exist."""
-        self.on_error(self.lastdirective.source, self.lastdirective.lineno,
+        if is_malformed:
-            "Include file not found: %s" % includepath)
+            self.on_error(self.lastdirective.source, self.lastdirective.lineno,
                "Malformed include file directive")
        else:
            self.on_error(self.lastdirective.source, self.lastdirective.lineno,
                "Include file not found: %s" % includepath)
        raise OutputDirective(Action.IgnoreAndRemove)
-    def evalexpr(self, tokens):
+    def on_directive_unknown(self, directive, toks, ifpassthru, precedingtoks):
-        """Evaluate a sequence of tokens as an expression.
+        """pcpp does not process #error/#warning/#pragma/#line; do it here."""
-
+        if directive.value == 'error':
-        The original uses eval(), which is unsafe for web usage. This one uses
+            self.on_error(directive.source, directive.lineno,
-        our own recursive-descendent parser.
+                "Error directive: \"%s\"" % ''.join(i.value for i in toks))
-        """
+        elif directive.value not in DIRECTIVES_PASSED_THROUGH:
-
+            self.on_error(directive.source, directive.lineno,
-        # ****************************************************
+                "Unknown directive: \"%s\"" % directive.value)
        # Start of fragment copied from PCPP's preprocessor.py
        """Evaluate an expression token sequence for the purposes of evaluating
        integral expressions."""
        if not tokens:
            self.on_error('unknown', 0, "Empty expression")
            return (0, None)
        # tokens = tokenize(line)
        # Search for defined macros
        evalfuncts = {'defined' : lambda x: True}
        evalvars = {}
        def replace_defined(tokens):
            i = 0
            while i < len(tokens):
                if tokens[i].type == self.t_ID and tokens[i].value == 'defined':
                    j = i + 1
                    needparen = False
                    result = "0L"
                    while j < len(tokens):
                        if tokens[j].type in self.t_WS:
                            j += 1
                            continue
                        elif tokens[j].type == self.t_ID:
                            if tokens[j].value in self.macros:
                                result = "1L"
                            else:
                                repl = self.on_unknown_macro_in_defined_expr(tokens[j])
                                if repl is None:
                                    # Add this identifier to a dictionary of variables
                                    evalvars[tokens[j].value] = 0
                                    result = 'defined('+tokens[j].value+')'
                                else:
                                    result = "1L" if repl else "0L"
                            if not needparen: break
                        elif tokens[j].value == '(':
                            needparen = True
                        elif tokens[j].value == ')':
                            break
                        else:
                            self.on_error(tokens[i].source,tokens[i].lineno,"Malformed defined()")
                        j += 1
                    if result.startswith('defined'):
                        tokens[i].type = self.t_ID
                        tokens[i].value = result
                    else:
                        tokens[i].type = self.t_INTEGER
                        tokens[i].value = self.t_INTEGER_TYPE(result)
                    del tokens[i+1:j+1]
                i += 1
            return tokens
        # Replace any defined(macro) before macro expansion
        tokens = replace_defined(tokens)
        tokens = self.expand_macros(tokens)
        # Replace any defined(macro) after macro expansion
        tokens = replace_defined(tokens)
        if not tokens:
            return (0, None)
        for i,t in enumerate(tokens):
            if t.type == self.t_ID:
                repl = self.on_unknown_macro_in_expr(copy.copy(t))
                if repl is None:
                    # Add this identifier to a dictionary of variables
                    evalvars[t.value] = 0
                else:
                    tokens[i] = t = repl
        # End of fragment copied from PCPP's preprocessor.py
        # **************************************************
        del evalfuncts  # we don't use this
        evaluator = Evaluator(tokens)
        try:
            result = int(evaluator.evaluate())
        except EvalError as e:
            self.on_error(evaluator.token.source, evaluator.token.lineno,
                e.message)
            return (0, None)
        del evaluator
        return (result, tokens) if evalvars else (result, None)
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit ed3b3f02e8f97c9112e2f6cd82115864ee056e21
+Subproject commit 18d5bc4cdb594c6d76a67f4e76fd5250015f6700
		`@ -1 +1 @@`
			`Subproject commit ed3b3f02e8f97c9112e2f6cd82115864ee056e21`				`Subproject commit 18d5bc4cdb594c6d76a67f4e76fd5250015f6700`