# Copyright 2008-2015 Jose Fonseca
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
import re

# Sentinel token types.  They are negative, so they cannot collide with the
# token types below.  EOF is what Scanner.next() reports at the end of the
# buffer; SKIP tags the whitespace/comment entry of the DotScanner token table.
EOF, SKIP = -1, -2

# Identifier flavours and the edge operator (matched by regular expression).
ID, STR_ID, HTML_ID, EDGE_OP = range(4)

# Single-character punctuation symbols.
(LSQUARE, RSQUARE, LCURLY, RCURLY,
 COMMA, COLON, SEMI, EQUAL, PLUS) = range(4, 13)

# Keywords.
STRICT, GRAPH, DIGRAPH, NODE, EDGE, SUBGRAPH = range(13, 19)
SUBGRAPH = 18


class Scanner:
    """Stateless regular-expression scanner over a bytes buffer.

    Derived classes configure the scanner through class attributes:

    tokens
        List of ``(type, regexp, test_lit)`` tuples.  The bytes patterns are
        joined, in order, into a single alternation with one capturing group
        per entry.  The entry that matched is identified from the index of
        the matching group, so the individual patterns must not contain
        capturing groups of their own (use ``(?:...)``).
    symbols
        Maps a one-byte ``bytes`` object to its token type; consulted when
        no token pattern matches at the current position.
    literals
        Maps token text to a token type; consulted only for token entries
        whose ``test_lit`` flag is true.
    ignorecase
        When true, the token pattern is compiled with ``re.IGNORECASE`` and
        text that is not found verbatim in ``literals`` is looked up again
        in lower case.
    """

    # should be overridden by derived classes
    tokens = []
    symbols = {}
    literals = {}
    ignorecase = False

    def __init__(self):
        """Compile the combined token pattern from the ``tokens`` table."""
        flags = re.DOTALL
        if self.ignorecase:
            flags |= re.IGNORECASE
        # The pattern must be compiled whether or not ignorecase is set;
        # next() relies on self.tokens_re unconditionally.
        self.tokens_re = re.compile(
            b'|'.join(b'(' + regexp + b')' for _, regexp, _ in self.tokens),
            flags
        )

    def next(self, buf, pos):
        """Scan a single token from ``buf`` starting at offset ``pos``.

        Returns a ``(type, text, pos)`` tuple where ``pos`` is the offset
        just past the scanned text:

        * at or beyond the end of ``buf``: ``(EOF, b'', pos)``;
        * when a token pattern matches: the type of the matching ``tokens``
          entry (replaced by the ``literals`` entry for the text, if the
          entry requests a literal test and one exists) and the matched text;
        * otherwise: the single byte at ``pos`` with its type from
          ``symbols``, or ``None`` if the byte is not a known symbol.

        No instance state is modified.
        """
        if pos >= len(buf):
            return EOF, b'', pos

        mo = self.tokens_re.match(buf, pos)
        if mo is None:
            # Slice instead of indexing so that the result is bytes, not int.
            char = buf[pos:pos + 1]
            return self.symbols.get(char), char, pos + 1

        text = mo.group()
        # Group N of the combined pattern corresponds to tokens[N - 1].
        token_type, _, test_lit = self.tokens[mo.lastindex - 1]
        if test_lit:
            key = text
            # An exact entry always wins; fall back to the lower-cased text
            # only for case-insensitive scanners.
            if self.ignorecase and key not in self.literals:
                key = key.lower()
            token_type = self.literals.get(key, token_type)
        return token_type, text, mo.end()


class DotScanner(Scanner):
    """Scanner whose lexical tables describe the DOT graph language.

    This class only supplies data; all scanning behaviour is inherited from
    Scanner.
    """

    # Ask the base class to compile the token pattern with re.IGNORECASE.
    ignorecase = True

    # Keyword table, consulted for token entries whose test_lit flag is set
    # (here: alphanumeric IDs).
    literals = {
        b'strict': STRICT,
        b'graph': GRAPH,
        b'digraph': DIGRAPH,
        b'node': NODE,
        b'edge': EDGE,
        b'subgraph': SUBGRAPH,
    }

    # Single-byte punctuation, looked up when no token pattern matches.
    symbols = {
        b'[': LSQUARE,
        b']': RSQUARE,
        b'{': LCURLY,
        b'}': RCURLY,
        b',': COMMA,
        b':': COLON,
        b';': SEMI,
        b'=': EQUAL,
        b'+': PLUS,
    }

    # Token patterns as (type, regexp, test_lit) entries.  The base class
    # joins them, in this order, into a single alternation, so an earlier
    # entry is preferred whenever more than one could match.
    tokens = [
        (
            # Whitespace, '//' line comments, '/* */' block comments and
            # '#' to end of line.
            SKIP,
            br'[ \t\f\r\n\v]+|'
            br'//[^\r\n]*|'
            br'/\*.*?\*/|'
            br'#[^\r\n]*',
            False,
        ),
        (
            # Alphanumeric identifiers (checked against the keyword table).
            ID,
            br'[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*',
            True,
        ),
        (
            # Numerals, optionally signed, reported as identifiers too.
            ID,
            br'-?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)',
            False,
        ),
        (
            # Double-quoted strings with backslash escapes.
            STR_ID,
            br'"[^"\\]*(?:\\.[^"\\]*)*"',
            False,
        ),
        (
            # Angle-bracketed HTML strings (one level of nested brackets).
            HTML_ID,
            br'<[^<>]*(?:<[^<>]*>[^<>]*)*>',
            False,
        ),
        (
            # Edge operators '->' and '--'.
            EDGE_OP,
            br'-[>-]',
            False,
        ),
    ]