| ... | ... |
@@ -44,7 +44,9 @@ class ParseError(Exception): |
| 44 | 44 |
self.col = col |
| 45 | 45 |
|
| 46 | 46 |
def __str__(self): |
| 47 |
- return ':'.join([str(part) for part in (self.filename, self.line, self.col, self.msg) if part != None]) |
|
| 47 |
+ return ':'.join([str(part) for part in |
|
| 48 |
+ (self.filename, self.line, self.col, self.msg) |
|
| 49 |
+ if part is not None]) |
|
| 48 | 50 |
|
| 49 | 51 |
|
| 50 | 52 |
class Lexer: |
| ... | ... |
@@ -55,7 +57,7 @@ class Lexer: |
| 55 | 57 |
|
| 56 | 58 |
newline_re = re.compile(br'\r\n?|\n') |
| 57 | 59 |
|
| 58 |
- def __init__(self, buf = None, pos = 0, filename = None, fp = None): |
|
| 60 |
+ def __init__(self, buf=None, pos=0, filename=None, fp=None): |
|
| 59 | 61 |
if fp is not None: |
| 60 | 62 |
try: |
| 61 | 63 |
fileno = fp.fileno() |
| ... | ... |
@@ -69,7 +71,7 @@ class Lexer: |
| 69 | 71 |
# map the whole file into memory |
| 70 | 72 |
if length: |
| 71 | 73 |
# length must not be zero |
| 72 |
- buf = mmap.mmap(fileno, length, access = mmap.ACCESS_READ) |
|
| 74 |
+ buf = mmap.mmap(fileno, length, access=mmap.ACCESS_READ) |
|
| 73 | 75 |
pos = os.lseek(fileno, 0, 1) |
| 74 | 76 |
else: |
| 75 | 77 |
buf = b'' |
| ... | ... |
@@ -108,7 +110,7 @@ class Lexer: |
| 108 | 110 |
raise ParseError(msg, self.filename, line, col) |
| 109 | 111 |
else: |
| 110 | 112 |
break |
| 111 |
- return Token(type = type, text = text, line = line, col = col) |
|
| 113 |
+ return Token(type=type, text=text, line=line, col=col) |
|
| 112 | 114 |
|
| 113 | 115 |
def consume(self, text): |
| 114 | 116 |
# update line number |
| ... | ... |
@@ -124,7 +126,7 @@ class Lexer: |
| 124 | 126 |
if tabpos == -1: |
| 125 | 127 |
break |
| 126 | 128 |
self.col += tabpos - pos |
| 127 |
- self.col = ((self.col - 1)//self.tabsize + 1)*self.tabsize + 1 |
|
| 129 |
+ self.col = ((self.col - 1) // self.tabsize + 1) * self.tabsize + 1 |
|
| 128 | 130 |
pos = tabpos + 1 |
| 129 | 131 |
self.col += len(text) - pos |
| 130 | 132 |
|
| 1 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,158 @@ |
| 1 |
+# Copyright 2008-2015 Jose Fonseca |
|
| 2 |
+# |
|
| 3 |
+# This program is free software: you can redistribute it and/or modify it |
|
| 4 |
+# under the terms of the GNU Lesser General Public License as published |
|
| 5 |
+# by the Free Software Foundation, either version 3 of the License, or |
|
| 6 |
+# (at your option) any later version. |
|
| 7 |
+# |
|
| 8 |
+# This program is distributed in the hope that it will be useful, |
|
| 9 |
+# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 10 |
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 11 |
+# GNU Lesser General Public License for more details. |
|
| 12 |
+# |
|
| 13 |
+# You should have received a copy of the GNU Lesser General Public License |
|
| 14 |
+# along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
| 15 |
+# |
|
| 16 |
+import os |
|
| 17 |
+import re |
|
| 18 |
+ |
|
| 19 |
+from .scanner import DotScanner |
|
| 20 |
+ |
|
| 21 |
# Token types with special meaning to the lexer.
# NOTE(review): EOF is not referenced in the visible code; presumably the
# scanner/parser use it to mark end of input -- confirm.
EOF = -1
# Tokens of type SKIP are discarded by Lexer.__next__.
SKIP = -2

# Identifier token types.  DotLexer.filter strips the first and last byte
# from STR_ID and HTML_ID tokens and reports both as plain ID.
ID = 0
STR_ID = 1
HTML_ID = 2
|
| 27 |
+ |
|
| 28 |
+ |
|
| 29 |
class Token:
    """A lexed token: its type, its text and the position where it starts."""

    def __init__(self, type, text, line, col):
        """Store the token type, its text and its line/column position."""
        self.type, self.text = type, text
        self.line, self.col = line, col
|
| 36 |
+ |
|
| 37 |
+ |
|
| 38 |
class ParseError(Exception):
    """Error raised while lexing/parsing, optionally carrying a location.

    Attributes:
        msg: description of the problem, or None.
        filename: name of the input being parsed, or None.
        line: line number of the offending input, or None.
        col: column number of the offending input, or None.
    """

    def __init__(self, msg=None, filename=None, line=None, col=None):
        self.msg = msg
        self.filename = filename
        self.line = line
        self.col = col

    def __str__(self):
        """Return ``filename:line:col:msg``, leaving out any part that is None."""
        # Identity test rather than ``!= None``: the latter dispatches to a
        # user-defined ``__ne__`` and could wrongly drop a non-None part.
        parts = (self.filename, self.line, self.col, self.msg)
        return ':'.join([str(part) for part in parts if part is not None])
|
| 48 |
+ |
|
| 49 |
+ |
|
| 50 |
class Lexer:
    """Base lexer that turns a bytes buffer into a stream of tokens.

    Subclasses supply ``scanner`` (an object whose
    ``next(buf, pos)`` returns ``(type, text, endpos)``) and a
    ``filter(type, text)`` method returning the possibly rewritten
    ``(type, text)`` pair.  The lexer tracks the current line and column
    (1-based, with tabs advancing to the next ``tabsize`` stop).
    """

    # should be overridden by derived classes
    scanner = None
    tabsize = 8

    newline_re = re.compile(br'\r\n?|\n')

    def __init__(self, buf=None, pos=0, filename=None, fp=None):
        """Initialise the lexer from a buffer or from a file object.

        Args:
            buf: bytes-like buffer to scan; ignored when ``fp`` is given.
            pos: offset in ``buf`` at which scanning starts; ignored when
                ``fp`` is given.
            filename: name used in error messages; defaults to ``fp.name``
                when available.
            fp: optional file object.  It is memory-mapped when possible,
                otherwise read fully into memory.
        """
        if fp is not None:
            try:
                fileno = fp.fileno()
                length = os.path.getsize(fp.name)
                import mmap
            except Exception:
                # Best-effort fallback: anything that prevents mapping (no
                # real file descriptor, no usable name, no mmap module)
                # means we read the whole file into memory instead.
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt/SystemExit still propagate.
                buf = fp.read()
                pos = 0
            else:
                # map the whole file into memory
                if length:
                    # length must not be zero
                    buf = mmap.mmap(fileno, length, access=mmap.ACCESS_READ)
                    # whence=1: query the current offset of the descriptor
                    pos = os.lseek(fileno, 0, 1)
                else:
                    buf = b''
                    pos = 0

            if filename is None:
                try:
                    filename = fp.name
                except AttributeError:
                    filename = None

        self.buf = buf
        self.pos = pos
        self.line = 1
        self.col = 1
        self.filename = filename

    def __next__(self):
        """Return the next Token, skipping tokens of type SKIP.

        Raises:
            ParseError: when the filtered token type is None, i.e. the
                text at the current position was not recognised.
        """
        while True:
            # save state
            pos = self.pos
            line = self.line
            col = self.col

            token_type, text, endpos = self.scanner.next(self.buf, pos)
            assert isinstance(text, bytes)
            assert pos + len(text) == endpos
            # Position bookkeeping uses the raw text, before filtering.
            self.consume(text)
            token_type, text = self.filter(token_type, text)
            self.pos = endpos

            if token_type == SKIP:
                continue
            elif token_type is None:
                msg = 'unexpected char %r' % (text,)
                raise ParseError(msg, self.filename, line, col)
            else:
                break
        return Token(type=token_type, text=text, line=line, col=col)

    def consume(self, text):
        """Advance ``self.line`` and ``self.col`` past the bytes in ``text``."""
        # update line number
        pos = 0
        for mo in self.newline_re.finditer(text, pos):
            self.line += 1
            self.col = 1
            pos = mo.end()

        # update column number
        while True:
            tabpos = text.find(b'\t', pos)
            if tabpos == -1:
                break
            self.col += tabpos - pos
            # a tab moves the column to the next tab stop
            self.col = ((self.col - 1) // self.tabsize + 1) * self.tabsize + 1
            pos = tabpos + 1
        self.col += len(text) - pos
|
| 130 |
+ |
|
| 131 |
+ |
|
| 132 |
class DotLexer(Lexer):
    """Lexer for the DOT language, built on DotScanner."""

    scanner = DotScanner()

    def filter(self, type, text):
        """Normalise quoted and HTML identifiers to plain ID tokens.

        STR_ID tokens lose their surrounding delimiters, line continuations
        and escaped double quotes; HTML_ID tokens lose only their
        surrounding delimiters.  Any other token passes through unchanged.
        """
        # TODO: handle charset
        if type == HTML_ID:
            return ID, text[1:-1]
        if type != STR_ID:
            return type, text

        unquoted = text[1:-1]

        # Applied strictly in this order: the three line-continuation
        # forms first, then escaped quotes.
        substitutions = (
            (b'\\\r\n', b''),
            (b'\\\r', b''),
            (b'\\\n', b''),
            (b'\\"', b'"'),
        )
        for escaped, plain in substitutions:
            unquoted = unquoted.replace(escaped, plain)

        # layout engines recognize other escape codes (many non-standard)
        # but we don't translate them here
        return ID, unquoted