163 lines
4.9 KiB
Python
163 lines
4.9 KiB
Python
from .exceptions import LexerError
|
|
from .tokens import Token, TokenType
|
|
|
|
# Reserved words mapped to their token types. The lexer upper-cases a word
# before looking it up here, so the keys are stored in upper case.
_KEYWORDS: dict[str, TokenType] = {
    "AND": TokenType.AND,
    "OR": TokenType.OR,
    "NOT": TokenType.NOT,
    "IN": TokenType.IN,
}
|
|
|
|
# Punctuation that always forms a complete token on its own: grouping
# parentheses, list brackets and the comma separator.
_SINGLE_CHAR_TOKENS: dict[str, TokenType] = {
    "(": TokenType.LPAREN,
    ")": TokenType.RPAREN,
    "[": TokenType.LBRACKET,
    "]": TokenType.RBRACKET,
    ",": TokenType.COMMA,
}
|
|
|
|
|
|
class Lexer:
    """Split an expression string into a flat list of tokens.

    Recognised input:

    * quoted strings (single or double quotes, backslash escapes),
    * integers and decimals with an optional leading minus sign,
    * the keywords AND / OR / NOT / IN and the booleans TRUE / FALSE,
      matched case-insensitively,
    * identifiers made of letters, digits, underscores and dots,
    * parentheses, square brackets and commas,
    * the comparison operators ``==``, ``!=``, ``>=``, ``<=``, ``>``, ``<``.

    Anything else raises ``LexerError``.
    """

    # Operator lookup tables live on the class so they are built once rather
    # than on every call to ``_read_operator``.
    _TWO_CHAR_OPS: dict[str, TokenType] = {
        "==": TokenType.EQ,
        "!=": TokenType.NEQ,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
    }
    _ONE_CHAR_OPS: dict[str, TokenType] = {
        ">": TokenType.GT,
        "<": TokenType.LT,
    }

    def __init__(self, expression: str) -> None:
        """Store *expression* and initialise the scanning state."""
        self._expression = expression
        self._pos = 0
        self._tokens: list[Token] = []

    def tokenize(self) -> list[Token]:
        """Scan the whole expression and return its tokens.

        The returned list always ends with an ``EOF`` token whose position
        is the offset just past the last consumed character.

        Raises:
            LexerError: on an unterminated string, a malformed number or an
                unexpected character.
        """
        # Always start from a clean state. Without this, a second call would
        # append another EOF to the previous result, and a call made after a
        # failed run would resume mid-input and return a truncated list.
        self._pos = 0
        self._tokens = []

        length = len(self._expression)
        while self._pos < length:
            self._skip_whitespace()
            if self._pos >= length:
                # Only trailing whitespace was left.
                break

            char = self._expression[self._pos]

            if char in {'"', "'"}:
                self._read_string()
            elif char.isdigit() or (char == "-" and self._next_is_digit()):
                # A minus sign counts as part of a number only when a digit
                # follows it directly.
                self._read_number()
            elif char.isalpha() or char == "_":
                self._read_word()
            elif char in _SINGLE_CHAR_TOKENS:
                self._tokens.append(
                    Token(_SINGLE_CHAR_TOKENS[char], char, self._pos)
                )
                self._pos += 1
            else:
                self._read_operator()

        self._tokens.append(Token(TokenType.EOF, None, self._pos))
        return self._tokens

    def _skip_whitespace(self) -> None:
        """Advance the cursor past any run of whitespace characters."""
        while (
            self._pos < len(self._expression)
            and self._expression[self._pos].isspace()
        ):
            self._pos += 1

    def _peek(self, offset: int = 1) -> str | None:
        """Return the character *offset* places ahead, or None past the end."""
        target = self._pos + offset
        if target < len(self._expression):
            return self._expression[target]
        return None

    def _next_is_digit(self) -> bool:
        """Return True when the character right after the cursor is a digit."""
        nxt = self._peek()
        return nxt is not None and nxt.isdigit()

    def _read_string(self) -> None:
        """Consume a quoted string and append a ``STRING`` token.

        The string ends at the first unescaped occurrence of its opening
        quote. A backslash makes the following character literal; the
        backslash itself is dropped and no escape sequences are translated.

        Raises:
            LexerError: if the input ends before the closing quote, or
                immediately after a backslash.
        """
        quote = self._expression[self._pos]
        start = self._pos
        self._pos += 1  # step over the opening quote
        parts: list[str] = []

        while self._pos < len(self._expression):
            char = self._expression[self._pos]
            if char == "\\":
                self._pos += 1
                if self._pos >= len(self._expression):
                    raise LexerError("Unexpected end of string", start)
                parts.append(self._expression[self._pos])
                self._pos += 1
            elif char == quote:
                self._pos += 1  # step over the closing quote
                self._tokens.append(
                    Token(TokenType.STRING, "".join(parts), start)
                )
                return
            else:
                parts.append(char)
                self._pos += 1

        raise LexerError("Unterminated string", start)

    def _read_number(self) -> None:
        """Consume a numeric literal and append a ``NUMBER`` token.

        Accepts an optional leading ``-``, digits, and at most one ``.``.
        The token value is a ``float`` when a dot was seen, otherwise an
        ``int``.

        Raises:
            LexerError: if the scanned text cannot be converted to a number.
        """
        start = self._pos
        has_dot = False

        if self._expression[self._pos] == "-":
            self._pos += 1

        while self._pos < len(self._expression):
            char = self._expression[self._pos]
            if char.isdigit():
                self._pos += 1
            elif char == "." and not has_dot:
                has_dot = True
                self._pos += 1
            else:
                break

        raw = self._expression[start : self._pos]
        try:
            value: int | float = float(raw) if has_dot else int(raw)
        except ValueError as err:
            # str.isdigit() accepts characters such as superscript digits
            # that int()/float() reject, and int() may refuse very long
            # digit runs. Report these as lexing errors instead of leaking
            # ValueError to the caller.
            raise LexerError(f"Invalid number: {raw!r}", start) from err
        self._tokens.append(Token(TokenType.NUMBER, value, start))

    def _read_word(self) -> None:
        """Consume a word and append a BOOL, keyword or IDENTIFIER token.

        A word is a run of alphanumeric characters, underscores and dots.
        TRUE / FALSE (any case) become ``BOOL`` tokens carrying a Python
        bool; entries of ``_KEYWORDS`` (any case) become keyword tokens that
        keep the original spelling; everything else is an ``IDENTIFIER``.
        """
        start = self._pos
        while self._pos < len(self._expression) and (
            self._expression[self._pos].isalnum()
            or self._expression[self._pos] in {"_", "."}
        ):
            self._pos += 1

        word = self._expression[start : self._pos]
        upper = word.upper()

        if upper in {"TRUE", "FALSE"}:
            self._tokens.append(Token(TokenType.BOOL, upper == "TRUE", start))
        elif upper in _KEYWORDS:
            self._tokens.append(Token(_KEYWORDS[upper], word, start))
        else:
            self._tokens.append(Token(TokenType.IDENTIFIER, word, start))

    def _read_operator(self) -> None:
        """Consume a comparison operator and append its token.

        Two-character operators are tried first so that ``>=`` is not split
        into ``>`` followed by a stray ``=``.

        Raises:
            LexerError: if the character at the cursor starts no known
                operator.
        """
        start = self._pos
        char = self._expression[self._pos]
        nxt = self._peek()

        if nxt is not None:
            pair = char + nxt
            if pair in self._TWO_CHAR_OPS:
                self._tokens.append(
                    Token(self._TWO_CHAR_OPS[pair], pair, start)
                )
                self._pos += 2
                return

        if char in self._ONE_CHAR_OPS:
            self._tokens.append(Token(self._ONE_CHAR_OPS[char], char, start))
            self._pos += 1
            return

        raise LexerError(f"Unexpected character: {char!r}", start)
|