From c544b51e37ca4b6fcef7f41dadc49f6a5d257725 Mon Sep 17 00:00:00 2001
From: Sei Lisa <sei-lisa@email.fake>
Date: Mon, 2 Oct 2017 00:40:59 +0200
Subject: [PATCH] Rewrite ReportError() and change EParse to report columns in
 chars.

ReportError() needed to account for terminal encodings that don't support the characters being printed. It was also reporting an inaccurate column number, and therefore a misplaced marker, because the count was in bytes rather than in characters; both problems have been fixed.

Now EParse.__init__() calls a new function GetErrLineCol() that calculates the line and column corresponding to an error position.

The algorithm for finding the start of the line has also been changed in both ReportError() and EParse.__init__(); as a result, function fieldpos() has been removed.

The exception's lno and cno fields have been changed to be 1-based, rather than 0-based.

Thanks to @Jomik for the report. Fixes #5.
---
 lslopt/lslparse.py | 24 +++++++++---------------
 main.py            | 30 ++++++++++++++++++++++++------
 testparser.py      |  5 ++---
 3 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/lslopt/lslparse.py b/lslopt/lslparse.py
index 6461298..ff43533 100644
--- a/lslopt/lslparse.py
+++ b/lslopt/lslparse.py
@@ -44,26 +44,20 @@ def isalphanum_(c):
 def ishex(c):
     return '0' <= c <= '9' or 'A' <= c <= 'F' or 'a' <= c <= 'f'
 
-def fieldpos(inp, sep, n):
-    """Return the starting position of field n in a string inp that has zero or
-    more fields separated by sep
-    """
-    i = -1
-    for n in xrange(n):
-        i = inp.find(sep, i + 1)
-        if i < 0:
-            return i
-    return i + 1
+def GetErrLineCol(parser):
+    errorpos = parser.errorpos
+    lno = parser.script.count('\n', 0, errorpos)
+    lstart = parser.script.rfind('\n', 0, errorpos) + 1
+    # Find column number in characters
+    cno = len(parser.script[lstart:errorpos].decode('utf8'))
+    return (lno + 1, cno + 1)
 
 class EParse(Exception):
-
     def __init__(self, parser, msg):
         self.errorpos = parser.errorpos
-        self.lno = parser.script.count('\n', 0, self.errorpos)
-        self.cno = self.errorpos - fieldpos(parser.script, '\n', self.lno)
-        # Note the column number reported is in bytes.
+        self.lno, self.cno = GetErrLineCol(parser)
 
-        msg = u"(Line %d char %d): ERROR: %s" % (self.lno + 1, self.cno + 1, msg)
+        msg = u"(Line %d char %d): ERROR: %s" % (self.lno, self.cno, msg)
         super(EParse, self).__init__(msg)
 
 class EParseUEOF(EParse):
diff --git a/main.py b/main.py
index 8648af8..65b22f7 100755
--- a/main.py
+++ b/main.py
@@ -19,7 +19,7 @@
 
 # This is the main executable program that imports the libraries.
 
-from lslopt.lslparse import parser,EParse,fieldpos
+from lslopt.lslparse import parser,EParse
 from lslopt.lsloutput import outscript
 from lslopt.lsloptimizer import optimizer
 import sys, os, getopt, re
@@ -30,11 +30,29 @@ VERSION = '0.2.1beta'
 
 
 def ReportError(script, e):
-    lastpos = fieldpos(script, '\n', e.lno+1)-1
-    assert lastpos != -1
-    if lastpos < -1: lastpos = len(script) # may hit EOF
-    sys.stderr.write(script[fieldpos(script, '\n', e.lno):lastpos].decode('utf8') + u"\n")
-    sys.stderr.write(u" " * e.cno + u"^\n")
+    linestart = script.rfind(b'\n', 0, e.errorpos) + 1
+    lineend = script.find(b'\n', e.errorpos)
+    if lineend == -1: lineend = len(script) # may hit EOF
+
+    # When the encoding of stderr is unknown (e.g. when redirected to a file),
+    # output will be encoded in UTF-8; otherwise the terminal's encoding will
+    # be used.
+    enc = sys.stderr.encoding if sys.stderr.encoding is not None else 'utf8'
+
+    # Synchronize the UTF-8 encoded line with the output line in the
+    # terminal's encoding. We need to compensate for the fact that the
+    # reported column applies to the UTF-8 version of the script.
+    # 1. Trim the UTF-8 line.
+    err_frag = script[linestart:e.errorpos]
+    # 2. Convert to Unicode; encode in the target encoding with replacing.
+    err_frag = err_frag.decode('utf8').encode(enc, 'backslashreplace')
+    # 3. Collect our prize: the length of that in characters.
+    cno = len(err_frag.decode(enc))
+
+    # Write the whole line in the target encoding.
+    err_line = script[linestart:lineend] + b'\n'
+    sys.stderr.write(err_line.decode('utf8').encode(enc, 'backslashreplace'))
+    sys.stderr.write(u" " * cno + u"^\n")
     sys.stderr.write(e.args[0] + u"\n")
 
 class UniConvScript(object):
diff --git a/testparser.py b/testparser.py
index de55560..f6e922f 100644
--- a/testparser.py
+++ b/testparser.py
@@ -21,7 +21,7 @@
 from lslopt.lslparse import parser,EParseSyntax,EParseUEOF,EParseAlreadyDefined,\
     EParseUndefined,EParseTypeMismatch,EParseReturnShouldBeEmpty,EParseReturnIsEmpty,\
     EParseInvalidField,EParseFunctionMismatch,EParseDeclarationScope,\
-    EParseDuplicateLabel,EParseCantChangeState,EParseCodePathWithoutRet,fieldpos
+    EParseDuplicateLabel,EParseCantChangeState,EParseCodePathWithoutRet
 from lslopt.lsloutput import outscript
 from lslopt.lsloptimizer import optimizer
 from lslopt import lslfuncs
@@ -217,7 +217,6 @@ class Test02_Parser(UnitTestCase):
             ))
         print self.parser.scopeindex
 
-        self.assertEqual(fieldpos("a,b", ",", 3), -1)
         self.assertEqual(self.outscript.Value2LSL(lslfuncs.Key(u'')), '((key)"")')
         self.assertRaises(AssertionError, self.outscript.Value2LSL, '')
 
@@ -528,7 +527,7 @@ class Test03_Optimizer(UnitTestCase):
             self.assertFalse(True)
         except EParseSyntax as e:
             # should err before first closing brace
-            self.assertEqual(e.cno, 27)
+            self.assertEqual(e.cno, 28)
         except:
             # should raise no other exception
             self.assertFalse(True)