Make PreparePreproc Unicode-aware.

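Fixes mismatches in column number output after a multiline string, if the last line of the string contains non-ASCII characters. PreparePreproc previously operated on the UTF-8 encoded byte string, so the column padding it emitted was computed from byte lengths instead of character counts; the script is now decoded to unicode before processing and re-encoded to UTF-8 afterwards.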
This commit is contained in:
Sei Lisa 2017-11-26 14:10:33 +01:00
parent eba4df6903
commit ed05a2e022

42
main.py
View file

@ -106,27 +106,27 @@ def PreparePreproc(script):
# instead of reproducing that C quirk. This also matches what FS is doing # instead of reproducing that C quirk. This also matches what FS is doing
# currently, so it's good for compatibility. # currently, so it's good for compatibility.
tok = re.compile( tok = re.compile(
r'(?:' ur'(?:'
r'/(?:\?\?/\n|\\\n)*\*.*?\*(?:\?\?/\n|\\\n)*/' ur'/(?:\?\?/\n|\\\n)*\*.*?\*(?:\?\?/\n|\\\n)*/'
r'|/(?:\?\?/\n|\\\n)*/(?:\?\?/\n|\\\n|[^\n])*\n' ur'|/(?:\?\?/\n|\\\n)*/(?:\?\?/\n|\\\n|[^\n])*\n'
r'|[^"]' ur'|[^"]'
r')+' ur')+'
r'|"' ur'|"'
, re.S) , re.S)
# RE used inside strings. # RE used inside strings.
tok2 = re.compile( tok2 = re.compile(
r'(?:' ur'(?:'
r"\?\?[='()!<>-]" # valid trigraph except ??/ (backslash) ur"\?\?[='()!<>-]" # valid trigraph except ??/ (backslash)
r"|(?:\?\?/|\\)(?:\?\?[/='()!<>-]|[^\n])" ur"|(?:\?\?/|\\)(?:\?\?[/='()!<>-]|[^\n])"
# backslash trigraph or actual backslash, # backslash trigraph or actual backslash,
# followed by any trigraph or non-newline # followed by any trigraph or non-newline
r'|(?!\?\?/\n|\\\n|"|\n).' ur'|(?!\?\?/\n|\\\n|"|\n).'
# any character that doesn't start a trigraph/ # any character that doesn't start a trigraph/
# backslash escape followed by a newline # backslash escape followed by a newline
# or is a newline or double quote, as we're # or is a newline or double quote, as we're
# interested in all those individually. # interested in all those individually.
r')+' # as many of those as possible ur')+' # as many of those as possible
r'|\?\?/\n|\\\n|\n|"' # or any of those individually ur'|\?\?/\n|\\\n|\n|"' # or any of those individually
) )
pos = 0 pos = 0
@ -134,7 +134,7 @@ def PreparePreproc(script):
while match: while match:
matched = match.group(0) matched = match.group(0)
pos += len(matched) pos += len(matched)
if matched == '"': if matched == u'"':
s += matched s += matched
nlines = col = 0 nlines = col = 0
match2 = tok2.search(script, pos) match2 = tok2.search(script, pos)
@ -142,24 +142,24 @@ def PreparePreproc(script):
matched2 = match2.group(0) matched2 = match2.group(0)
pos += len(matched2) pos += len(matched2)
if matched2 == '\\\n' or matched2 == '??/\n': if matched2 == u'\\\n' or matched2 == u'??/\n':
nlines += 1 nlines += 1
col = 0 col = 0
match2 = tok2.search(script, pos) match2 = tok2.search(script, pos)
continue continue
if matched2 == '"': if matched2 == u'"':
if nlines: if nlines:
if script[pos:pos+1] == '\n': if script[pos:pos+1] == u'\n':
col = -1 # don't add spaces if not necessary col = -1 # don't add spaces if not necessary
# col misses the quote added here, so add 1 # col misses the quote added here, so add 1
s += '"' + '\n'*nlines + ' '*(col+1) s += u'"' + u'\n'*nlines + u' '*(col+1)
else: else:
s += '"' s += u'"'
break break
if matched2 == '\n': if matched2 == u'\n':
nlines += 1 nlines += 1
col = 0 col = 0
s += '\\n' s += u'\\n'
else: else:
col += len(matched2) col += len(matched2)
s += matched2 s += matched2
@ -628,7 +628,7 @@ def main(argv):
if preproc != 'none': if preproc != 'none':
# At this point, for the external preprocessor to work we need the # At this point, for the external preprocessor to work we need the
# script as a byte array, not as unicode, but it should be UTF-8. # script as a byte array, not as unicode, but it should be UTF-8.
script = PreparePreproc(script) script = PreparePreproc(script.decode('utf8')).encode('utf8')
if preproc == 'mcpp': if preproc == 'mcpp':
# As a special treatment for mcpp, we force it to output its # As a special treatment for mcpp, we force it to output its
# macros so we can read if USE_xxx are defined. With GCC that # macros so we can read if USE_xxx are defined. With GCC that