Revamp and fix the REs used for pre-preprocessing.

The REs are easier to follow and to maintain this way. This fixes known and potential bugs, plus a potential lack of greediness in the REs.
2025-07-01 23:58:20 +00:00 · 2015-03-20 17:45:38 +01:00 · 2015-03-20 17:45:38 +01:00 · 47eee0312b
commit 47eee0312b
parent cd1826e9e0
1 changed files with 31 additions and 22 deletions
--- a/main.py
+++ b/main.py
@@ -51,30 +51,39 @@ def PreparePreproc(script):
    col = 0
    # Trigraphs make our life really difficult.
-    # We join lines with \<return> or ??/<return> inside strings,
+    # We join lines that have \<return> or ??/<return> inside strings,
-    # and count <return>s to add them back at the end of the string,
+    # and we also replace regular <return> inside strings with \n, counting how
-    # as well as spaces.
+    # many lines we join, to add them back at the end of the string in order to
-    # We skip as much as possible in one go every time, only stopping to
+    # keep the line count exact prior to preprocessing. We also preserve the
-    # analyze critical substrings.
+    # original column after the string, by adding as many spaces as necessary.
-    tok = re.compile(r'[^"/]+|"|/(?:\?\?\/\n)*\*.*?\*(?:\?\?\/\n)*/'
+    # We could let the preprocessor do the line joining on backslash-newline,
-        r'|/(?:\?\?\/\n)*/(?:\?\?\/.|\\.|.)*?\n'
+    # but by eliminating all newlines, we have control over the output column
    # of the text that follows the string and can report an accurate column
    # position in case of error.
    # The REs skip as much as possible in one go every time, only stopping to
    # analyze critical tokens.
    tok = re.compile(
        r'(?:'
            r'/(?:\?\?/\n|\\\n)*\*.*?\*(?:\?\?/\n|\\\n)*/'
            r'|/(?:\?\?/\n|\\\n)*/(?:\?\?/\n|\\\n|[^\n])*\n'
            r'|[^"]'
        r')+'
        r'|"'
        , re.S)
-    #tok2 = re.compile(r'(?:(?!\?\?/.|\\.|"|\n).)+|\\.|\?\?/.|.', re.S)
+    # RE used inside strings.
    tok2 = re.compile(
-        r"\\\n|\?\?/\n|" '"' r"|\n|"
+        r'(?:'
-        r"(?:"
+            r"\?\?[='()!<>-]"   # valid trigraph except ??/ (backslash)
-            # negative match for the above - tough
+            r"|(?:\?\?/|\\)(?:\?\?[/='()!<>-]|[^\n])"
-            # eat as a unit:
+                                # backslash trigraph or actual backslash,
-            # - a backslash or corresponding trigraph followed by any trigraph
+                                # followed by any trigraph or non-newline
-            #   or by any non-newline character
+            r'|(?!\?\?/\n|\\\n|"|\n).'
-            # - any trigraph other than ??/
+                                # any character that doesn't start a trigraph/
-            # - any character that is not a newline, double quote, backslash
+                                # backslash escape followed by a newline
-            #   or the start of a trigraph
+                                # or is a newline or double quote, as we're
-            # - any trigraph-like sequence that is not a trigraph
+                                # interested in all those individually.
-            r"(?:\\|\?\?/)(?:\?\?[=/'()!<>\-]|[^\n])"
+        r')+'                   # as many of those as possible
-            r"|\?\?[='()!<>\-]"
+        r'|\?\?/\n|\\\n|\n|"'   # or any of those individually
            r"|[^\n" '"' r"\\?]|\?(?!\?[=/'()!<>\-])"
        r")+"
        )
    pos = 0