# Copyright 2008-2015 Jose Fonseca
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
import os
import re

from .scanner import DotScanner

# Special token types.  SKIP tokens are discarded by Lexer.__next__; EOF is
# not referenced in this module (presumably produced by the scanner at end
# of input -- confirm against the scanner implementation).
EOF = -1
SKIP = -2

# Identifier token types.  DotLexer.filter folds STR_ID and HTML_ID into ID
# after stripping their delimiters.
ID = 0
STR_ID = 1
HTML_ID = 2


class Token:
    """A lexical token: its type, its text and where it started."""

    def __init__(self, type, text, line, col):
        self.type = type
        self.text = text
        self.line = line
        self.col = col


class ParseError(Exception):
    """Error raised when the input cannot be tokenized or parsed.

    Carries an optional message plus the filename, line and column at which
    the problem was detected.
    """

    def __init__(self, msg=None, filename=None, line=None, col=None):
        self.msg = msg
        self.filename = filename
        self.line = line
        self.col = col

    def __str__(self):
        # Join only the parts that are known, as "filename:line:col:msg".
        return ':'.join([str(part) for part in (self.filename, self.line, self.col, self.msg) if part is not None])


class Lexer:
    """Base lexer that turns a bytes-like buffer into a stream of Tokens.

    Derived classes supply ``scanner`` and a ``filter(type, text)`` method
    (see DotLexer); this class tracks the buffer position and the current
    line/column.
    """

    # should be overridden by derived classes
    scanner = None
    # distance between tab stops, used for column tracking
    tabsize = 8

    # matches one line break: "\r\n", a lone "\r", or "\n"
    newline_re = re.compile(br'\r\n?|\n')

    def __init__(self, buf=None, pos=0, filename=None, fp=None):
        """Initialize the lexer from a buffer or from a file object.

        buf      -- buffer to tokenize; replaced when ``fp`` is given.
        pos      -- starting offset into ``buf``; replaced when ``fp`` is given.
        filename -- name stored for use in ParseError; defaults to
                    ``fp.name`` when ``fp`` is given and has that attribute.
        fp       -- optional file object.  Its contents are memory-mapped
                    when possible, otherwise read entirely into memory.
        """
        if fp is not None:
            try:
                fileno = fp.fileno()
                length = os.path.getsize(fp.name)
                import mmap
            except Exception:
                # Memory mapping is not possible (no file descriptor, no
                # usable name, or no mmap module): read whole file into
                # memory.  Only Exception is caught so that
                # KeyboardInterrupt/SystemExit still propagate.
                buf = fp.read()
                pos = 0
            else:
                # map the whole file into memory
                if length:
                    # length must not be zero
                    buf = mmap.mmap(fileno, length, access=mmap.ACCESS_READ)
                    # start at the descriptor's current offset
                    pos = os.lseek(fileno, 0, os.SEEK_CUR)
                else:
                    buf = b''
                    pos = 0

            if filename is None:
                try:
                    filename = fp.name
                except AttributeError:
                    filename = None

        self.buf = buf
        self.pos = pos
        self.line = 1
        self.col = 1
        self.filename = filename

    def __next__(self):
        """Return the next Token, skipping tokens whose type is SKIP.

        Raises ParseError, located at the start of the offending text, when
        the token type is None after filtering.
        """
        while True:
            # save state
            pos = self.pos
            line = self.line
            col = self.col

            type, text, endpos = self.scanner.next(self.buf, pos)
            assert isinstance(text, bytes)
            assert pos + len(text) == endpos
            # Advance line/column on the raw text, before filter() rewrites it.
            self.consume(text)
            type, text = self.filter(type, text)
            self.pos = endpos

            if type == SKIP:
                continue
            elif type is None:
                msg = 'unexpected char %r' % (text,)
                raise ParseError(msg, self.filename, line, col)
            else:
                break
        return Token(type=type, text=text, line=line, col=col)

    def consume(self, text):
        """Advance ``self.line`` and ``self.col`` past ``text`` (bytes)."""
        # update line number
        pos = 0
        for mo in self.newline_re.finditer(text, pos):
            self.line += 1
            self.col = 1
            # remember where the last line of text begins
            pos = mo.end()

        # update column number
        while True:
            tabpos = text.find(b'\t', pos)
            if tabpos == -1:
                break
            self.col += tabpos - pos
            # a tab moves the (1-based) column to the next tab stop
            self.col = ((self.col - 1) // self.tabsize + 1) * self.tabsize + 1
            pos = tabpos + 1
        self.col += len(text) - pos


class DotLexer(Lexer):
    """Lexer that uses DotScanner and normalizes quoted/HTML identifiers."""

    scanner = DotScanner()

    def filter(self, type, text):
        """Convert STR_ID and HTML_ID tokens into plain ID tokens.

        Both kinds lose their first and last byte (the delimiters).  STR_ID
        text additionally has backslash-newline continuations removed and
        ``\\"`` unescaped.  Other token types pass through unchanged.

        Returns the ``(type, text)`` pair.
        """
        # TODO: handle charset
        if type == STR_ID:
            text = text[1:-1]

            # line continuations
            text = text.replace(b'\\\r\n', b'')
            text = text.replace(b'\\\r', b'')
            text = text.replace(b'\\\n', b'')

            # quotes
            text = text.replace(b'\\"', b'"')

            # layout engines recognize other escape codes (many non-standard)
            # but we don't translate them here

            type = ID

        elif type == HTML_ID:
            text = text[1:-1]
            type = ID

        return type, text