Browse code

Parse xdot with bytes, not unicode strings.

This is because several fields in xdot give their length as a number of
bytes, not characters.

Jose Fonseca authored on 16/10/2015 23:41:29
Showing 2 changed files

  • test.py index 235158a..480b0b5 100755
  • xdot.py index ec9c608..970b707 100755
... ...
@@ -80,7 +80,7 @@ def main():
80 80
         sys.stdout.write(arg + '\n')
81 81
         sys.stdout.flush()
82 82
         name, ext = os.path.splitext(os.path.basename(arg))
83
-        dotcode = open(arg).read()
83
+        dotcode = open(arg, 'rb').read()
84 84
         widget = TestDotWidget(name)
85 85
         window = DotWindow(widget)
86 86
         window.connect('delete-event', Gtk.main_quit)
... ...
@@ -564,13 +564,17 @@ class XDotAttrParser:
564 564
         return self.pos < len(self.buf)
565 565
 
566 566
     def read_code(self):
567
-        pos = self.buf.find(" ", self.pos)
567
+        pos = self.buf.find(b" ", self.pos)
568 568
         res = self.buf[self.pos:pos]
569 569
         self.pos = pos + 1
570
-        while self.pos < len(self.buf) and self.buf[self.pos].isspace():
571
-            self.pos += 1
570
+        self.skip_space()
571
+        res = res.decode('utf-8')
572 572
         return res
573 573
 
574
+    def skip_space(self):
575
+        while self.pos < len(self.buf) and self.buf[self.pos : self.pos + 1].isspace():
576
+            self.pos += 1
577
+
574 578
     def read_int(self):
575 579
         return int(self.read_code())
576 580
 
... ...
@@ -584,11 +588,11 @@ class XDotAttrParser:
584 588
 
585 589
     def read_text(self):
586 590
         num = self.read_int()
587
-        pos = self.buf.find("-", self.pos) + 1
591
+        pos = self.buf.find(b"-", self.pos) + 1
588 592
         self.pos = pos + num
589 593
         res = self.buf[pos:self.pos]
590
-        while self.pos < len(self.buf) and self.buf[self.pos].isspace():
591
-            self.pos += 1
594
+        self.skip_space()
595
+        res = res.decode('utf-8')
592 596
         return res
593 597
 
594 598
     def read_polygon(self):
... ...
@@ -819,13 +823,13 @@ class Scanner:
819 823
         if self.ignorecase:
820 824
             flags |= re.IGNORECASE
821 825
         self.tokens_re = re.compile(
822
-            '|'.join(['(' + regexp + ')' for type, regexp, test_lit in self.tokens]),
826
+            b'|'.join([b'(' + regexp + b')' for type, regexp, test_lit in self.tokens]),
823 827
              flags
824 828
         )
825 829
 
826 830
     def next(self, buf, pos):
827 831
         if pos >= len(buf):
828
-            return EOF, '', pos
832
+            return EOF, b'', pos
829 833
         mo = self.tokens_re.match(buf, pos)
830 834
         if mo:
831 835
             text = mo.group()
... ...
@@ -835,7 +839,7 @@ class Scanner:
835 839
                 type = self.literals.get(text, type)
836 840
             return type, text, pos
837 841
         else:
838
-            c = buf[pos]
842
+            c = buf[pos : pos + 1]
839 843
             return self.symbols.get(c, None), c, pos + 1
840 844
 
841 845
 
... ...
@@ -854,7 +858,7 @@ class Lexer:
854 858
     scanner = None
855 859
     tabsize = 8
856 860
 
857
-    newline_re = re.compile(r'\r\n?|\n')
861
+    newline_re = re.compile(br'\r\n?|\n')
858 862
 
859 863
     def __init__(self, buf = None, pos = 0, filename = None, fp = None):
860 864
         if fp is not None:
... ...
@@ -873,7 +877,7 @@ class Lexer:
873 877
                     buf = mmap.mmap(fileno, length, access = mmap.ACCESS_READ)
874 878
                     pos = os.lseek(fileno, 0, 1)
875 879
                 else:
876
-                    buf = ''
880
+                    buf = b''
877 881
                     pos = 0
878 882
 
879 883
             if filename is None:
... ...
@@ -896,6 +900,7 @@ class Lexer:
896 900
             col = self.col
897 901
 
898 902
             type, text, endpos = self.scanner.next(self.buf, pos)
903
+            assert isinstance(text, bytes)
899 904
             assert pos + len(text) == endpos
900 905
             self.consume(text)
901 906
             type, text = self.filter(type, text)
... ...
@@ -904,11 +909,7 @@ class Lexer:
904 909
             if type == SKIP:
905 910
                 continue
906 911
             elif type is None:
907
-                msg = 'unexpected char '
908
-                if text >= ' ' and text <= '~':
909
-                    msg += "'%s'" % text
910
-                else:
911
-                    msg += "0x%X" % ord(text)
912
+                msg = 'unexpected char %r' % (text,)
912 913
                 raise ParseError(msg, self.filename, line, col)
913 914
             else:
914 915
                 break
... ...
@@ -924,7 +925,7 @@ class Lexer:
924 925
 
925 926
         # update column number
926 927
         while True:
927
-            tabpos = text.find('\t', pos)
928
+            tabpos = text.find(b'\t', pos)
928 929
             if tabpos == -1:
929 930
                 break
930 931
             self.col += tabpos - pos
... ...
@@ -986,49 +987,49 @@ class DotScanner(Scanner):
986 987
     tokens = [
987 988
         # whitespace and comments
988 989
         (SKIP,
989
-            r'[ \t\f\r\n\v]+|'
990
-            r'//[^\r\n]*|'
991
-            r'/\*.*?\*/|'
992
-            r'#[^\r\n]*',
990
+            br'[ \t\f\r\n\v]+|'
991
+            br'//[^\r\n]*|'
992
+            br'/\*.*?\*/|'
993
+            br'#[^\r\n]*',
993 994
         False),
994 995
 
995 996
         # Alphanumeric IDs
996
-        (ID, r'[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*', True),
997
+        (ID, br'[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*', True),
997 998
 
998 999
         # Numeric IDs
999
-        (ID, r'-?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)', False),
1000
+        (ID, br'-?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)', False),
1000 1001
 
1001 1002
         # String IDs
1002
-        (STR_ID, r'"[^"\\]*(?:\\.[^"\\]*)*"', False),
1003
+        (STR_ID, br'"[^"\\]*(?:\\.[^"\\]*)*"', False),
1003 1004
 
1004 1005
         # HTML IDs
1005
-        (HTML_ID, r'<[^<>]*(?:<[^<>]*>[^<>]*)*>', False),
1006
+        (HTML_ID, br'<[^<>]*(?:<[^<>]*>[^<>]*)*>', False),
1006 1007
 
1007 1008
         # Edge operators
1008
-        (EDGE_OP, r'-[>-]', False),
1009
+        (EDGE_OP, br'-[>-]', False),
1009 1010
     ]
1010 1011
 
1011 1012
     # symbol table
1012 1013
     symbols = {
1013
-        '[': LSQUARE,
1014
-        ']': RSQUARE,
1015
-        '{': LCURLY,
1016
-        '}': RCURLY,
1017
-        ',': COMMA,
1018
-        ':': COLON,
1019
-        ';': SEMI,
1020
-        '=': EQUAL,
1021
-        '+': PLUS,
1014
+        b'[': LSQUARE,
1015
+        b']': RSQUARE,
1016
+        b'{': LCURLY,
1017
+        b'}': RCURLY,
1018
+        b',': COMMA,
1019
+        b':': COLON,
1020
+        b';': SEMI,
1021
+        b'=': EQUAL,
1022
+        b'+': PLUS,
1022 1023
     }
1023 1024
 
1024 1025
     # literal table
1025 1026
     literals = {
1026
-        'strict': STRICT,
1027
-        'graph': GRAPH,
1028
-        'digraph': DIGRAPH,
1029
-        'node': NODE,
1030
-        'edge': EDGE,
1031
-        'subgraph': SUBGRAPH,
1027
+        b'strict': STRICT,
1028
+        b'graph': GRAPH,
1029
+        b'digraph': DIGRAPH,
1030
+        b'node': NODE,
1031
+        b'edge': EDGE,
1032
+        b'subgraph': SUBGRAPH,
1032 1033
     }
1033 1034
 
1034 1035
     ignorecase = True
... ...
@@ -1044,12 +1045,12 @@ class DotLexer(Lexer):
1044 1045
             text = text[1:-1]
1045 1046
 
1046 1047
             # line continuations
1047
-            text = text.replace('\\\r\n', '')
1048
-            text = text.replace('\\\r', '')
1049
-            text = text.replace('\\\n', '')
1048
+            text = text.replace(b'\\\r\n', b'')
1049
+            text = text.replace(b'\\\r', b'')
1050
+            text = text.replace(b'\\\n', b'')
1050 1051
             
1051 1052
             # quotes
1052
-            text = text.replace('\\"', '"')
1053
+            text = text.replace(b'\\"', b'"')
1053 1054
 
1054 1055
             # layout engines recognize other escape codes (many non-standard)
1055 1056
             # but we don't translate them here
... ...
@@ -1137,6 +1138,7 @@ class DotParser(Parser):
1137 1138
             self.consume()
1138 1139
             while self.lookahead.type != RSQUARE:
1139 1140
                 name, value = self.parse_attr()
1141
+                name = name.decode('utf-8')
1140 1142
                 attrs[name] = value
1141 1143
                 if self.lookahead.type == COMMA:
1142 1144
                     self.consume()
... ...
@@ -1149,7 +1151,7 @@ class DotParser(Parser):
1149 1151
             self.consume()
1150 1152
             value = self.parse_id()
1151 1153
         else:
1152
-            value = 'true'
1154
+            value = b'true'
1153 1155
         return name, value
1154 1156
 
1155 1157
     def parse_node_id(self):
... ...
@@ -1218,7 +1220,7 @@ class XDotParser(DotParser):
1218 1220
                 return
1219 1221
 
1220 1222
             if bb:
1221
-                xmin, ymin, xmax, ymax = map(float, bb.split(","))
1223
+                xmin, ymin, xmax, ymax = map(float, bb.split(b","))
1222 1224
 
1223 1225
                 self.xoffset = -xmin
1224 1226
                 self.yoffset = -ymax
... ...
@@ -1275,17 +1277,16 @@ class XDotParser(DotParser):
1275 1277
 
1276 1278
     def parse(self):
1277 1279
         DotParser.parse(self)
1278
-
1279 1280
         return Graph(self.width, self.height, self.shapes, self.nodes, self.edges)
1280 1281
 
1281 1282
     def parse_node_pos(self, pos):
1282
-        x, y = pos.split(",")
1283
+        x, y = pos.split(b",")
1283 1284
         return self.transform(float(x), float(y))
1284 1285
 
1285 1286
     def parse_edge_pos(self, pos):
1286 1287
         points = []
1287
-        for entry in pos.split(' '):
1288
-            fields = entry.split(',')
1288
+        for entry in pos.split(b' '):
1289
+            fields = entry.split(b',')
1289 1290
             try:
1290 1291
                 x, y = fields
1291 1292
             except ValueError:
... ...
@@ -1567,7 +1568,7 @@ class DotWidget(Gtk.DrawingArea):
1567 1568
                 stdout=subprocess.PIPE,
1568 1569
                 stderr=subprocess.PIPE,
1569 1570
                 shell=False,
1570
-                universal_newlines=True
1571
+                universal_newlines=False
1571 1572
             )
1572 1573
         except OSError as exc:
1573 1574
             error = '%s: %s' % (self.filter, exc.strerror)
... ...
@@ -1576,6 +1577,7 @@ class DotWidget(Gtk.DrawingArea):
1576 1577
             xdotcode, error = p.communicate(dotcode)
1577 1578
         error = error.rstrip()
1578 1579
         if error:
1580
+            error = error.decode()
1579 1581
             sys.stderr.write(error + '\n')
1580 1582
         if p.returncode != 0:
1581 1583
             dialog = Gtk.MessageDialog(type=Gtk.MessageType.ERROR,
... ...
@@ -1589,6 +1591,8 @@ class DotWidget(Gtk.DrawingArea):
1589 1591
 
1590 1592
     def set_dotcode(self, dotcode, filename=None):
1591 1593
         self.openfilename = None
1594
+        if isinstance(dotcode, str):
1595
+            dotcode = dotcode.encode('utf-8')
1592 1596
         xdotcode = self.run_filter(dotcode)
1593 1597
         if xdotcode is None:
1594 1598
             return False
... ...
@@ -1611,6 +1615,7 @@ class DotWidget(Gtk.DrawingArea):
1611 1615
             return True
1612 1616
 
1613 1617
     def set_xdotcode(self, xdotcode):
1618
+        assert isinstance(xdotcode, bytes)
1614 1619
         parser = XDotParser(xdotcode)
1615 1620
         self.graph = parser.parse()
1616 1621
         self.zoom_image(self.zoom_ratio, center=True)