// File: libertaria-stack/l1-identity/qvl/gql/lexer.zig
//! GQL Lexer/Tokenizer
//!
//! Converts GQL query string into tokens for parser.
//! ISO/IEC 39075:2024 lexical structure.
const std = @import("std");
/// Kinds of lexical tokens produced by the GQL lexer.
/// NOTE: do not reorder variants — callers may rely on tag values.
pub const TokenType = enum {
    // Keywords (recognized case-insensitively via keywordFromString)
    match,
    create,
    delete,
    return_keyword, // RETURN ("return" suffixed to avoid the Zig keyword)
    where,
    as_keyword,
    and_keyword,
    or_keyword,
    not_keyword,
    null_keyword,
    true_keyword,
    false_keyword,
    // Punctuation
    left_paren, // (
    right_paren, // )
    left_bracket, // [
    right_bracket, // ]
    left_brace, // {
    right_brace, // }
    colon, // :
    comma, // ,
    dot, // .
    minus, // -
    arrow_right, // ->
    arrow_left, // <-
    star, // *
    slash, // /
    percent, // %
    plus, // +
    // Comparison operators
    eq, // =
    neq, // <>
    lt, // <
    lte, // <=
    gt, // >
    gte, // >=
    // Literals
    identifier, // any non-keyword name
    string_literal, // quoted text, quotes included in Token.text
    integer_literal,
    float_literal,
    // Special
    eof, // end of input; always the final token from tokenize()
    invalid, // unrecognized character
};
/// One lexical token. `text` borrows from the lexer's source string and is
/// only valid while that source is alive.
pub const Token = struct {
    type: TokenType,
    text: []const u8, // Slice into original source (not a copy)
    line: u32, // 1-based line where the token starts
    column: u32, // 1-based column where the token starts
};
/// Streaming tokenizer over a GQL query string.
/// Tokens borrow slices of `source`, so `source` must outlive them.
pub const Lexer = struct {
    source: []const u8, // full query text; Token.text slices point into this
    pos: usize, // byte offset of the next unread character
    line: u32, // 1-based line of `pos`
    column: u32, // 1-based column of `pos`
    allocator: std.mem.Allocator,

    const Self = @This();

    /// Create a lexer positioned at the start of `source`.
    pub fn init(source: []const u8, allocator: std.mem.Allocator) Self {
        return Self{
            .source = source,
            .pos = 0,
            .line = 1,
            .column = 1,
            .allocator = allocator,
        };
    }

    /// Scan and return the next token. Returns an `.eof` token at end of
    /// input. Unrecognized characters yield `.invalid` rather than an error,
    /// leaving diagnostics to the parser.
    pub fn nextToken(self: *Self) !Token {
        self.skipWhitespace();
        if (self.pos >= self.source.len) {
            return self.makeToken(.eof, 0);
        }
        const c = self.source[self.pos];
        // Identifiers and keywords
        if (isAlpha(c) or c == '_') {
            return self.readIdentifier();
        }
        // Numbers
        if (isDigit(c)) {
            return self.readNumber();
        }
        // Strings (either quote style)
        if (c == '"' or c == '\'') {
            return self.readString();
        }
        // Single-char tokens and operators
        switch (c) {
            '(' => { self.advance(); return self.makeToken(.left_paren, 1); },
            ')' => { self.advance(); return self.makeToken(.right_paren, 1); },
            '[' => { self.advance(); return self.makeToken(.left_bracket, 1); },
            ']' => { self.advance(); return self.makeToken(.right_bracket, 1); },
            '{' => { self.advance(); return self.makeToken(.left_brace, 1); },
            '}' => { self.advance(); return self.makeToken(.right_brace, 1); },
            ':' => { self.advance(); return self.makeToken(.colon, 1); },
            ',' => { self.advance(); return self.makeToken(.comma, 1); },
            '.' => { self.advance(); return self.makeToken(.dot, 1); },
            '+' => { self.advance(); return self.makeToken(.plus, 1); },
            '%' => { self.advance(); return self.makeToken(.percent, 1); },
            '*' => { self.advance(); return self.makeToken(.star, 1); },
            // FIX: a lone '/' previously fell through to `.invalid` even
            // though `.slash` is declared. Comments ("//" and "/*") are
            // consumed by skipWhitespace, so '/' here is the division slash.
            '/' => { self.advance(); return self.makeToken(.slash, 1); },
            '-' => {
                self.advance();
                if (self.peek() == '>') {
                    self.advance();
                    return self.makeToken(.arrow_right, 2);
                }
                return self.makeToken(.minus, 1);
            },
            '<' => {
                self.advance();
                if (self.peek() == '-') {
                    self.advance();
                    return self.makeToken(.arrow_left, 2);
                } else if (self.peek() == '>') {
                    self.advance();
                    return self.makeToken(.neq, 2);
                } else if (self.peek() == '=') {
                    self.advance();
                    return self.makeToken(.lte, 2);
                }
                return self.makeToken(.lt, 1);
            },
            '>' => {
                self.advance();
                if (self.peek() == '=') {
                    self.advance();
                    return self.makeToken(.gte, 2);
                }
                return self.makeToken(.gt, 1);
            },
            '=' => { self.advance(); return self.makeToken(.eq, 1); },
            else => {
                self.advance();
                return self.makeToken(.invalid, 1);
            },
        }
    }

    /// Lex the entire input into a caller-owned slice; the final element is
    /// always the `.eof` token. Caller frees with `allocator.free(tokens)`.
    pub fn tokenize(self: *Self) ![]Token {
        var tokens: std.ArrayList(Token) = .{};
        errdefer tokens.deinit(self.allocator);
        while (true) {
            const tok = try self.nextToken();
            try tokens.append(self.allocator, tok);
            if (tok.type == .eof) break;
        }
        return tokens.toOwnedSlice(self.allocator);
    }

    // =========================================================================
    // Internal helpers
    // =========================================================================

    /// Consume one byte, keeping the line/column counters in sync.
    fn advance(self: *Self) void {
        if (self.pos >= self.source.len) return;
        if (self.source[self.pos] == '\n') {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        self.pos += 1;
    }

    /// Current byte without consuming it; 0 at end of input.
    fn peek(self: *Self) u8 {
        if (self.pos >= self.source.len) return 0;
        return self.source[self.pos];
    }

    /// Skip whitespace, "//" line comments, and "/* ... */" block comments.
    fn skipWhitespace(self: *Self) void {
        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (c == ' ' or c == '\t' or c == '\n' or c == '\r') {
                self.advance();
            } else if (c == '/' and self.pos + 1 < self.source.len and self.source[self.pos + 1] == '/') {
                // Single-line comment: runs to end of line; the newline is
                // left for the next iteration to consume as whitespace.
                while (self.pos < self.source.len and self.source[self.pos] != '\n') {
                    self.advance();
                }
            } else if (c == '/' and self.pos + 1 < self.source.len and self.source[self.pos + 1] == '*') {
                // Multi-line comment
                self.advance(); // /
                self.advance(); // *
                // FIX: loop to the very end of input (was `pos + 1 < len`)
                // so an unterminated block comment swallows the rest of the
                // source instead of leaking its final byte as a stray token.
                while (self.pos < self.source.len) {
                    if (self.source[self.pos] == '*' and self.pos + 1 < self.source.len and self.source[self.pos + 1] == '/') {
                        self.advance(); // *
                        self.advance(); // /
                        break;
                    }
                    self.advance();
                }
            } else {
                break;
            }
        }
    }

    /// Read an identifier ([A-Za-z_][A-Za-z0-9_]*) and classify it as a
    /// keyword or plain identifier.
    fn readIdentifier(self: *Self) Token {
        const start = self.pos;
        const start_line = self.line;
        const start_col = self.column;
        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (isAlphaNum(c) or c == '_') {
                self.advance();
            } else {
                break;
            }
        }
        const text = self.source[start..self.pos];
        const tok_type = keywordFromString(text);
        return Token{
            .type = tok_type,
            .text = text,
            .line = start_line,
            .column = start_col,
        };
    }

    /// Read an integer or float literal. A '.' followed by a second '.' is
    /// left unconsumed so range syntax like `1..3` lexes as int, dot, dot, int.
    fn readNumber(self: *Self) !Token {
        const start = self.pos;
        const start_line = self.line;
        const start_col = self.column;
        var is_float = false;
        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (isDigit(c)) {
                self.advance();
            } else if (c == '.' and !is_float) {
                // Check for range operator (e.g., 1..3)
                if (self.pos + 1 < self.source.len and self.source[self.pos + 1] == '.') {
                    break; // Stop before range operator
                }
                is_float = true;
                self.advance();
            } else {
                break;
            }
        }
        const text = self.source[start..self.pos];
        const tok_type: TokenType = if (is_float) .float_literal else .integer_literal;
        return Token{
            .type = tok_type,
            .text = text,
            .line = start_line,
            .column = start_col,
        };
    }

    /// Read a quoted string literal ('...' or "..."). The token text keeps
    /// the surrounding quotes; backslash escapes are skipped, not decoded.
    /// NOTE(review): an unterminated string still yields `.string_literal`
    /// containing the rest of the input — parser must detect the missing
    /// closing quote if that should be an error.
    fn readString(self: *Self) !Token {
        const start = self.pos;
        const start_line = self.line;
        const start_col = self.column;
        const quote = self.source[self.pos];
        self.advance(); // opening quote
        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (c == quote) {
                self.advance(); // closing quote
                break;
            } else if (c == '\\' and self.pos + 1 < self.source.len) {
                self.advance(); // backslash
                self.advance(); // escaped char
            } else {
                self.advance();
            }
        }
        const text = self.source[start..self.pos];
        return Token{
            .type = .string_literal,
            .text = text,
            .line = start_line,
            .column = start_col,
        };
    }

    /// Build a token for the `len` bytes just consumed. Assumes those bytes
    /// contained no newline (holds for all fixed-width operator tokens).
    fn makeToken(self: *Self, tok_type: TokenType, len: usize) Token {
        const tok = Token{
            .type = tok_type,
            .text = self.source[self.pos - len .. self.pos],
            .line = self.line,
            .column = self.column - @as(u32, @intCast(len)),
        };
        return tok;
    }
};
// ============================================================================
// Helper functions
// ============================================================================
/// True for an ASCII letter (identifier start character, along with '_').
fn isAlpha(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z' => true,
        else => false,
    };
}
/// True for an ASCII decimal digit.
fn isDigit(c: u8) bool {
    return switch (c) {
        '0'...'9' => true,
        else => false,
    };
}
/// True for an ASCII letter or digit (identifier continuation, along with '_').
fn isAlphaNum(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z', '0'...'9' => true,
        else => false,
    };
}
/// Map identifier text to its keyword token type, or `.identifier` when the
/// text is not a reserved word.
/// FIX: the previous version only matched all-uppercase or all-lowercase
/// spellings, so mixed case like "Match" leaked through as an identifier.
/// GQL (ISO/IEC 39075:2024) keywords are case-insensitive, so compare with
/// `std.ascii.eqlIgnoreCase` (backward compatible: still matches both old
/// spellings).
fn keywordFromString(text: []const u8) TokenType {
    const Entry = struct { word: []const u8, tok: TokenType };
    const keywords = [_]Entry{
        .{ .word = "MATCH", .tok = .match },
        .{ .word = "CREATE", .tok = .create },
        .{ .word = "DELETE", .tok = .delete },
        .{ .word = "RETURN", .tok = .return_keyword },
        .{ .word = "WHERE", .tok = .where },
        .{ .word = "AS", .tok = .as_keyword },
        .{ .word = "AND", .tok = .and_keyword },
        .{ .word = "OR", .tok = .or_keyword },
        .{ .word = "NOT", .tok = .not_keyword },
        .{ .word = "NULL", .tok = .null_keyword },
        .{ .word = "TRUE", .tok = .true_keyword },
        .{ .word = "FALSE", .tok = .false_keyword },
    };
    for (keywords) |entry| {
        if (std.ascii.eqlIgnoreCase(text, entry.word)) return entry.tok;
    }
    return .identifier;
}
// ============================================================================
// TESTS
// ============================================================================
test "Lexer: simple keywords" {
    const gpa = std.testing.allocator;
    var lexer = Lexer.init("MATCH (n) RETURN n", gpa);
    const tokens = try lexer.tokenize();
    defer gpa.free(tokens);
    // Table-driven: compare each produced token type against the expectation.
    const expected = [_]TokenType{
        .match,
        .left_paren,
        .identifier,
        .right_paren,
        .return_keyword,
        .identifier,
        .eof,
    };
    for (expected, 0..) |want, i| {
        try std.testing.expectEqual(want, tokens[i].type);
    }
}
test "Lexer: arrow operators" {
    const gpa = std.testing.allocator;
    var lex = Lexer.init("-> <-", gpa);
    const toks = try lex.tokenize();
    defer gpa.free(toks);
    // Both two-character arrows must be recognized as single tokens.
    const expected = [_]TokenType{ .arrow_right, .arrow_left };
    for (expected, 0..) |want, i| {
        try std.testing.expectEqual(want, toks[i].type);
    }
}
test "Lexer: string literal" {
    const gpa = std.testing.allocator;
    var lex = Lexer.init("\"hello world\"", gpa);
    const toks = try lex.tokenize();
    defer gpa.free(toks);
    // Token text keeps the surrounding quotes.
    const first = toks[0];
    try std.testing.expectEqual(TokenType.string_literal, first.type);
    try std.testing.expectEqualStrings("\"hello world\"", first.text);
}
test "Lexer: numbers" {
    const gpa = std.testing.allocator;
    var lex = Lexer.init("42 3.14", gpa);
    const toks = try lex.tokenize();
    defer gpa.free(toks);
    // A bare integer and a decimal literal get distinct token types.
    try std.testing.expectEqual(TokenType.integer_literal, toks[0].type);
    try std.testing.expectEqual(TokenType.float_literal, toks[1].type);
}