//! GQL Lexer/Tokenizer
//!
//! Converts GQL query string into tokens for parser.
//! ISO/IEC 39075:2024 lexical structure.
|
const std = @import("std");
|
|
|
|
/// Kind tag for every lexeme the lexer can produce.
/// Keyword variants that collide with Zig keywords carry a `_keyword` suffix.
pub const TokenType = enum {
    // Keywords (matched by keywordFromString; everything else lexes as .identifier)
    match,
    create,
    delete,
    return_keyword,
    where,
    as_keyword,
    and_keyword,
    or_keyword,
    not_keyword,
    null_keyword,
    true_keyword,
    false_keyword,

    // Punctuation
    left_paren, // (
    right_paren, // )
    left_bracket, // [
    right_bracket, // ]
    left_brace, // {
    right_brace, // }
    colon, // :
    comma, // ,
    dot, // .
    minus, // -
    arrow_right, // ->
    arrow_left, // <-
    star, // *
    slash, // /
    percent, // %
    plus, // +

    // Comparison operators
    eq, // =
    neq, // <>
    lt, // <
    lte, // <=
    gt, // >
    gte, // >=

    // Literals
    identifier,
    string_literal, // text includes the surrounding quotes
    integer_literal,
    float_literal,

    // Special
    eof, // end of input; always the last token tokenize() emits
    invalid, // byte that starts no known lexeme
};
|
|
|
|
/// A single lexeme. `text` borrows from the source string passed to
/// Lexer.init, so the source must outlive every Token.
pub const Token = struct {
    type: TokenType,
    text: []const u8, // Slice into original source
    line: u32, // 1-based line where the token starts
    column: u32, // 1-based column where the token starts
};
|
|
|
|
/// Hand-written scanner over a GQL query string. Tokens are slices into
/// `source`; no token text is heap-allocated. `allocator` is only used by
/// tokenize() for the output array.
pub const Lexer = struct {
    source: []const u8, // full query text; must outlive produced tokens
    pos: usize, // byte offset of the next unread character
    line: u32, // 1-based line of the next unread character
    column: u32, // 1-based column of the next unread character
    allocator: std.mem.Allocator,

    const Self = @This();

    pub fn init(source: []const u8, allocator: std.mem.Allocator) Self {
        return Self{
            .source = source,
            .pos = 0,
            .line = 1,
            .column = 1,
            .allocator = allocator,
        };
    }

    /// Get next token. Skips whitespace and comments first. Returns an
    /// .eof token at end of input and an .invalid token for any byte that
    /// starts no known lexeme (the lexer never errors on bad input).
    pub fn nextToken(self: *Self) !Token {
        self.skipWhitespace();

        if (self.pos >= self.source.len) {
            return self.makeToken(.eof, 0);
        }

        const c = self.source[self.pos];

        // Identifiers and keywords
        if (isAlpha(c) or c == '_') {
            return self.readIdentifier();
        }

        // Numbers
        if (isDigit(c)) {
            return self.readNumber();
        }

        // Strings (single- or double-quoted)
        if (c == '"' or c == '\'') {
            return self.readString();
        }

        // Single-char tokens and operators
        switch (c) {
            '(' => { self.advance(); return self.makeToken(.left_paren, 1); },
            ')' => { self.advance(); return self.makeToken(.right_paren, 1); },
            '[' => { self.advance(); return self.makeToken(.left_bracket, 1); },
            ']' => { self.advance(); return self.makeToken(.right_bracket, 1); },
            '{' => { self.advance(); return self.makeToken(.left_brace, 1); },
            '}' => { self.advance(); return self.makeToken(.right_brace, 1); },
            ':' => { self.advance(); return self.makeToken(.colon, 1); },
            ',' => { self.advance(); return self.makeToken(.comma, 1); },
            '.' => { self.advance(); return self.makeToken(.dot, 1); },
            '+' => { self.advance(); return self.makeToken(.plus, 1); },
            '%' => { self.advance(); return self.makeToken(.percent, 1); },
            '*' => { self.advance(); return self.makeToken(.star, 1); },

            // BUG FIX: '/' previously fell through to .invalid even though
            // TokenType.slash exists. A '/' reaching this switch cannot start
            // a comment -- skipWhitespace already consumed "//" and "/*" --
            // so it is the division operator.
            '/' => { self.advance(); return self.makeToken(.slash, 1); },

            '-' => {
                self.advance();
                if (self.peek() == '>') {
                    self.advance();
                    return self.makeToken(.arrow_right, 2);
                }
                return self.makeToken(.minus, 1);
            },

            '<' => {
                self.advance();
                if (self.peek() == '-') {
                    self.advance();
                    return self.makeToken(.arrow_left, 2);
                } else if (self.peek() == '>') {
                    self.advance();
                    return self.makeToken(.neq, 2);
                } else if (self.peek() == '=') {
                    self.advance();
                    return self.makeToken(.lte, 2);
                }
                return self.makeToken(.lt, 1);
            },

            '>' => {
                self.advance();
                if (self.peek() == '=') {
                    self.advance();
                    return self.makeToken(.gte, 2);
                }
                return self.makeToken(.gt, 1);
            },

            '=' => { self.advance(); return self.makeToken(.eq, 1); },

            else => {
                self.advance();
                return self.makeToken(.invalid, 1);
            },
        }
    }

    /// Read all tokens into array. The returned slice always ends with an
    /// .eof token. Caller owns the slice and frees it with the same
    /// allocator passed to init().
    pub fn tokenize(self: *Self) ![]Token {
        var tokens: std.ArrayList(Token) = .{};
        errdefer tokens.deinit(self.allocator);

        while (true) {
            const tok = try self.nextToken();
            try tokens.append(self.allocator, tok);
            if (tok.type == .eof) break;
        }

        return tokens.toOwnedSlice(self.allocator);
    }

    // =========================================================================
    // Internal helpers
    // =========================================================================

    /// Consume one byte, keeping line/column in sync. No-op at end of input.
    fn advance(self: *Self) void {
        if (self.pos >= self.source.len) return;

        if (self.source[self.pos] == '\n') {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        self.pos += 1;
    }

    /// Current unread byte without consuming it; 0 at end of input.
    fn peek(self: *Self) u8 {
        if (self.pos >= self.source.len) return 0;
        return self.source[self.pos];
    }

    /// Skip spaces, tabs, CR/LF, `//` line comments, and `/* ... */` block
    /// comments. An unterminated block comment is consumed through end of
    /// input.
    fn skipWhitespace(self: *Self) void {
        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (c == ' ' or c == '\t' or c == '\n' or c == '\r') {
                self.advance();
            } else if (c == '/' and self.pos + 1 < self.source.len and self.source[self.pos + 1] == '/') {
                // Single-line comment: stop before the newline so advance()
                // handles the line bump on the next iteration.
                while (self.pos < self.source.len and self.source[self.pos] != '\n') {
                    self.advance();
                }
            } else if (c == '/' and self.pos + 1 < self.source.len and self.source[self.pos + 1] == '*') {
                // Multi-line comment
                self.advance(); // /
                self.advance(); // *
                // BUG FIX: loop bound was `pos + 1 < len`, which left the
                // final byte of an unterminated comment unconsumed and let it
                // leak out as a spurious token. Scan to end of input instead.
                while (self.pos < self.source.len) {
                    if (self.source[self.pos] == '*' and
                        self.pos + 1 < self.source.len and
                        self.source[self.pos + 1] == '/')
                    {
                        self.advance(); // *
                        self.advance(); // /
                        break;
                    }
                    self.advance();
                }
            } else {
                break;
            }
        }
    }

    /// Scan [A-Za-z_][A-Za-z0-9_]* and classify it via keywordFromString.
    fn readIdentifier(self: *Self) Token {
        const start = self.pos;
        const start_line = self.line;
        const start_col = self.column;

        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (isAlphaNum(c) or c == '_') {
                self.advance();
            } else {
                break;
            }
        }

        const text = self.source[start..self.pos];
        const tok_type = keywordFromString(text);

        return Token{
            .type = tok_type,
            .text = text,
            .line = start_line,
            .column = start_col,
        };
    }

    /// Scan an integer or float literal. A single '.' flips the token to
    /// .float_literal; a second '.' (or "..", the range operator, e.g. 1..3)
    /// ends the number.
    fn readNumber(self: *Self) !Token {
        const start = self.pos;
        const start_line = self.line;
        const start_col = self.column;
        var is_float = false;

        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (isDigit(c)) {
                self.advance();
            } else if (c == '.' and !is_float) {
                // Check for range operator (e.g., 1..3)
                if (self.pos + 1 < self.source.len and self.source[self.pos + 1] == '.') {
                    break; // Stop before range operator
                }
                is_float = true;
                self.advance();
            } else {
                break;
            }
        }

        const text = self.source[start..self.pos];
        const tok_type: TokenType = if (is_float) .float_literal else .integer_literal;

        return Token{
            .type = tok_type,
            .text = text,
            .line = start_line,
            .column = start_col,
        };
    }

    /// Scan a quoted string. The token text keeps both quotes; backslash
    /// escapes are skipped, not decoded. An unterminated string runs to end
    /// of input without error.
    fn readString(self: *Self) !Token {
        const start = self.pos;
        const start_line = self.line;
        const start_col = self.column;
        const quote = self.source[self.pos]; // closing quote must match opener
        self.advance(); // opening quote

        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (c == quote) {
                self.advance(); // closing quote
                break;
            } else if (c == '\\' and self.pos + 1 < self.source.len) {
                self.advance(); // backslash
                self.advance(); // escaped char
            } else {
                self.advance();
            }
        }

        const text = self.source[start..self.pos];
        return Token{
            .type = .string_literal,
            .text = text,
            .line = start_line,
            .column = start_col,
        };
    }

    /// Build a token for the `len` bytes just consumed. Assumes those bytes
    /// contain no newline, so position can be recovered by subtracting `len`
    /// from the current column.
    fn makeToken(self: *Self, tok_type: TokenType, len: usize) Token {
        const tok = Token{
            .type = tok_type,
            .text = self.source[self.pos - len .. self.pos],
            .line = self.line,
            .column = self.column - @as(u32, @intCast(len)),
        };
        return tok;
    }
};
|
|
|
|
// ============================================================================
|
|
// Helper functions
|
|
// ============================================================================
|
|
|
|
/// True for an ASCII letter (either case).
fn isAlpha(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z' => true,
        else => false,
    };
}
|
|
|
|
/// True for an ASCII decimal digit.
fn isDigit(c: u8) bool {
    return switch (c) {
        '0'...'9' => true,
        else => false,
    };
}
|
|
|
|
/// True for an ASCII letter or decimal digit.
fn isAlphaNum(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z', '0'...'9' => true,
        else => false,
    };
}
|
|
|
|
/// Map an identifier's text to its keyword TokenType, or .identifier when it
/// is not a reserved word.
///
/// GQL (ISO/IEC 39075) keywords are case-insensitive, so match without
/// regard to case: "MATCH", "match", and "Match" all yield .match. (The
/// previous version only recognized all-upper or all-lower spellings.)
fn keywordFromString(text: []const u8) TokenType {
    // Zig 0.15.2 compatible: use a comparison chain instead of ComptimeStringMap.
    const eq = std.ascii.eqlIgnoreCase;
    if (eq(text, "MATCH")) return .match;
    if (eq(text, "CREATE")) return .create;
    if (eq(text, "DELETE")) return .delete;
    if (eq(text, "RETURN")) return .return_keyword;
    if (eq(text, "WHERE")) return .where;
    if (eq(text, "AS")) return .as_keyword;
    if (eq(text, "AND")) return .and_keyword;
    if (eq(text, "OR")) return .or_keyword;
    if (eq(text, "NOT")) return .not_keyword;
    if (eq(text, "NULL")) return .null_keyword;
    if (eq(text, "TRUE")) return .true_keyword;
    if (eq(text, "FALSE")) return .false_keyword;
    return .identifier;
}
|
|
|
|
// ============================================================================
|
|
// TESTS
|
|
// ============================================================================
|
|
|
|
test "Lexer: simple keywords" {
    const allocator = std.testing.allocator;

    var lexer = Lexer.init("MATCH (n) RETURN n", allocator);
    const tokens = try lexer.tokenize();
    defer allocator.free(tokens);

    // Table-driven check over the full expected token stream.
    const expected = [_]TokenType{
        .match,
        .left_paren,
        .identifier,
        .right_paren,
        .return_keyword,
        .identifier,
        .eof,
    };
    for (expected, 0..) |want, i| {
        try std.testing.expectEqual(want, tokens[i].type);
    }
}
|
|
|
|
test "Lexer: arrow operators" {
    const gpa = std.testing.allocator;

    var lexer = Lexer.init("-> <-", gpa);
    const tokens = try lexer.tokenize();
    defer gpa.free(tokens);

    const expected = [_]TokenType{ .arrow_right, .arrow_left };
    for (expected, 0..) |want, i| {
        try std.testing.expectEqual(want, tokens[i].type);
    }
}
|
|
|
|
test "Lexer: string literal" {
    const gpa = std.testing.allocator;

    var lexer = Lexer.init("\"hello world\"", gpa);
    const tokens = try lexer.tokenize();
    defer gpa.free(tokens);

    // Token text keeps the surrounding quotes.
    const tok = tokens[0];
    try std.testing.expectEqual(TokenType.string_literal, tok.type);
    try std.testing.expectEqualStrings("\"hello world\"", tok.text);
}
|
|
|
|
test "Lexer: numbers" {
    const gpa = std.testing.allocator;

    var lexer = Lexer.init("42 3.14", gpa);
    const tokens = try lexer.tokenize();
    defer gpa.free(tokens);

    const expected = [_]TokenType{ .integer_literal, .float_literal };
    for (expected, 0..) |want, i| {
        try std.testing.expectEqual(want, tokens[i].type);
    }
}
|