Revamp and fix the regular expressions (REs) used for pre-preprocessing.

The REs are easier to follow and to maintain this way. This fixes known and potential bugs, plus a potential lack of greediness in the matching.
This commit is contained in:
Sei Lisa 2015-03-20 17:45:38 +01:00
parent cd1826e9e0
commit 47eee0312b

53
main.py
View file

@ -51,30 +51,39 @@ def PreparePreproc(script):
col = 0 col = 0
# Trigraphs make our life really difficult. # Trigraphs make our life really difficult.
# We join lines with \<return> or ??/<return> inside strings, # We join lines that have \<return> or ??/<return> inside strings,
# and count <return>s to add them back at the end of the string, # and we also replace regular <return> inside strings with \n, counting how
# as well as spaces. # many lines we join, to add them back at the end of the string in order to
# We skip as much as possible in one go every time, only stopping to # keep the line count exact prior to preprocessing. We also preserve the
# analyze critical substrings. # original column after the string, by adding as many spaces as necessary.
tok = re.compile(r'[^"/]+|"|/(?:\?\?\/\n)*\*.*?\*(?:\?\?\/\n)*/' # We could let the preprocessor do the line joining on backslash-newline,
r'|/(?:\?\?\/\n)*/(?:\?\?\/.|\\.|.)*?\n' # but by eliminating all newlines, we have control over the output column
# of the text that follows the string and can report an accurate column
# position in case of error.
# The REs skip as much as possible in one go every time, only stopping to
# analyze critical tokens.
tok = re.compile(
r'(?:'
r'/(?:\?\?/\n|\\\n)*\*.*?\*(?:\?\?/\n|\\\n)*/'
r'|/(?:\?\?/\n|\\\n)*/(?:\?\?/\n|\\\n|[^\n])*\n'
r'|[^"]'
r')+'
r'|"'
, re.S) , re.S)
#tok2 = re.compile(r'(?:(?!\?\?/.|\\.|"|\n).)+|\\.|\?\?/.|.', re.S) # RE used inside strings.
tok2 = re.compile( tok2 = re.compile(
r"\\\n|\?\?/\n|" '"' r"|\n|" r'(?:'
r"(?:" r"\?\?[='()!<>-]" # valid trigraph except ??/ (backslash)
# negative match for the above - tough r"|(?:\?\?/|\\)(?:\?\?[/='()!<>-]|[^\n])"
# eat as a unit: # backslash trigraph or actual backslash,
# - a backslash or corresponding trigraph followed by any trigraph # followed by any trigraph or non-newline
# or by any non-newline character r'|(?!\?\?/\n|\\\n|"|\n).'
# - any trigraph other than ??/ # any character that doesn't start a trigraph/
# - any character that is not a newline, double quote, backslash # backslash escape followed by a newline
# or the start of a trigraph # or is a newline or double quote, as we're
# - any trigraph-like sequence that is not a trigraph # interested in all those individually.
r"(?:\\|\?\?/)(?:\?\?[=/'()!<>\-]|[^\n])" r')+' # as many of those as possible
r"|\?\?[='()!<>\-]" r'|\?\?/\n|\\\n|\n|"' # or any of those individually
r"|[^\n" '"' r"\\?]|\?(?!\?[=/'()!<>\-])"
r")+"
) )
pos = 0 pos = 0