// libertaria-stack/l1-identity/qvl/gql/lexer.zig

//! GQL Lexer/Tokenizer
//!
//! Converts GQL query string into tokens for parser.
//! ISO/IEC 39075:2024 lexical structure.
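//!
//! Typical usage (a sketch; `source` and `allocator` are caller-supplied):
//!
//!     var lexer = Lexer.init(source, allocator);
//!     const tokens = try lexer.tokenize();
//!     defer allocator.free(tokens);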
const std = @import("std");

pub const TokenType = enum {
    // Keywords
    match,
    create,
    delete,
    return_keyword,
    where,
    as_keyword,
    and_keyword,
    or_keyword,
    not_keyword,
    null_keyword,
    true_keyword,
    false_keyword,
    // Punctuation
    left_paren, // (
    right_paren, // )
    left_bracket, // [
    right_bracket, // ]
    left_brace, // {
    right_brace, // }
    colon, // :
    comma, // ,
    dot, // .
    minus, // -
    arrow_right, // ->
    arrow_left, // <-
    star, // *
    slash, // /
    percent, // %
    plus, // +
    // Comparison operators
    eq, // =
    neq, // <>
    lt, // <
    lte, // <=
    gt, // >
    gte, // >=
    // Literals
    identifier,
    string_literal,
    integer_literal,
    float_literal,
    // Special
    eof,
    invalid,
};

pub const Token = struct {
    type: TokenType,
    text: []const u8, // Slice into the original source
    line: u32,
    column: u32,
};

pub const Lexer = struct {
    source: []const u8,
    pos: usize,
    line: u32,
    column: u32,
    allocator: std.mem.Allocator,

    const Self = @This();

    pub fn init(source: []const u8, allocator: std.mem.Allocator) Self {
        return Self{
            .source = source,
            .pos = 0,
            .line = 1,
            .column = 1,
            .allocator = allocator,
        };
    }

    /// Get the next token, advancing past any leading whitespace and comments.
    pub fn nextToken(self: *Self) !Token {
        self.skipWhitespace();
        if (self.pos >= self.source.len) {
            return self.makeToken(.eof, 0);
        }
        const c = self.source[self.pos];
        // Identifiers and keywords
        if (isAlpha(c) or c == '_') {
            return self.readIdentifier();
        }
        // Numbers
        if (isDigit(c)) {
            return self.readNumber();
        }
        // Strings
        if (c == '"' or c == '\'') {
            return self.readString();
        }
        // Single-char tokens and operators
        switch (c) {
            '(' => { self.advance(); return self.makeToken(.left_paren, 1); },
            ')' => { self.advance(); return self.makeToken(.right_paren, 1); },
            '[' => { self.advance(); return self.makeToken(.left_bracket, 1); },
            ']' => { self.advance(); return self.makeToken(.right_bracket, 1); },
            '{' => { self.advance(); return self.makeToken(.left_brace, 1); },
            '}' => { self.advance(); return self.makeToken(.right_brace, 1); },
            ':' => { self.advance(); return self.makeToken(.colon, 1); },
            ',' => { self.advance(); return self.makeToken(.comma, 1); },
            '.' => { self.advance(); return self.makeToken(.dot, 1); },
            '+' => { self.advance(); return self.makeToken(.plus, 1); },
            '%' => { self.advance(); return self.makeToken(.percent, 1); },
            '*' => { self.advance(); return self.makeToken(.star, 1); },
            // A lone '/' is division; '//' and '/*' were already consumed
            // as comments by skipWhitespace.
            '/' => { self.advance(); return self.makeToken(.slash, 1); },
            '-' => {
                self.advance();
                if (self.peek() == '>') {
                    self.advance();
                    return self.makeToken(.arrow_right, 2);
                }
                return self.makeToken(.minus, 1);
            },
            '<' => {
                self.advance();
                if (self.peek() == '-') {
                    self.advance();
                    return self.makeToken(.arrow_left, 2);
                } else if (self.peek() == '>') {
                    self.advance();
                    return self.makeToken(.neq, 2);
                } else if (self.peek() == '=') {
                    self.advance();
                    return self.makeToken(.lte, 2);
                }
                return self.makeToken(.lt, 1);
            },
            '>' => {
                self.advance();
                if (self.peek() == '=') {
                    self.advance();
                    return self.makeToken(.gte, 2);
                }
                return self.makeToken(.gt, 1);
            },
            '=' => { self.advance(); return self.makeToken(.eq, 1); },
            else => {
                self.advance();
                return self.makeToken(.invalid, 1);
            },
        }
    }

    /// Read all tokens into a caller-owned slice (free with `allocator.free`).
    pub fn tokenize(self: *Self) ![]Token {
        var tokens: std.ArrayList(Token) = .empty;
        errdefer tokens.deinit(self.allocator);
        while (true) {
            const tok = try self.nextToken();
            try tokens.append(self.allocator, tok);
            if (tok.type == .eof) break;
        }
        return tokens.toOwnedSlice(self.allocator);
    }

    // =========================================================================
    // Internal helpers
    // =========================================================================

    fn advance(self: *Self) void {
        if (self.pos >= self.source.len) return;
        if (self.source[self.pos] == '\n') {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        self.pos += 1;
    }

    fn peek(self: *Self) u8 {
        if (self.pos >= self.source.len) return 0;
        return self.source[self.pos];
    }

    fn skipWhitespace(self: *Self) void {
        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (c == ' ' or c == '\t' or c == '\n' or c == '\r') {
                self.advance();
            } else if (c == '/' and self.pos + 1 < self.source.len and self.source[self.pos + 1] == '/') {
                // Single-line comment: skip to end of line
                while (self.pos < self.source.len and self.source[self.pos] != '\n') {
                    self.advance();
                }
            } else if (c == '/' and self.pos + 1 < self.source.len and self.source[self.pos + 1] == '*') {
                // Multi-line comment; an unterminated one consumes the rest of the input
                self.advance(); // /
                self.advance(); // *
                while (self.pos < self.source.len) {
                    if (self.source[self.pos] == '*' and self.pos + 1 < self.source.len and self.source[self.pos + 1] == '/') {
                        self.advance(); // *
                        self.advance(); // /
                        break;
                    }
                    self.advance();
                }
            } else {
                break;
            }
        }
    }

    fn readIdentifier(self: *Self) Token {
        const start = self.pos;
        const start_line = self.line;
        const start_col = self.column;
        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (isAlphaNum(c) or c == '_') {
                self.advance();
            } else {
                break;
            }
        }
        const text = self.source[start..self.pos];
        const tok_type = keywordFromString(text);
        return Token{
            .type = tok_type,
            .text = text,
            .line = start_line,
            .column = start_col,
        };
    }

    fn readNumber(self: *Self) Token {
        const start = self.pos;
        const start_line = self.line;
        const start_col = self.column;
        var is_float = false;
        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (isDigit(c)) {
                self.advance();
            } else if (c == '.' and !is_float) {
                // Check for range operator (e.g., 1..3)
                if (self.pos + 1 < self.source.len and self.source[self.pos + 1] == '.') {
                    break; // Stop before range operator
                }
                is_float = true;
                self.advance();
            } else {
                break;
            }
        }
        const text = self.source[start..self.pos];
        const tok_type: TokenType = if (is_float) .float_literal else .integer_literal;
        return Token{
            .type = tok_type,
            .text = text,
            .line = start_line,
            .column = start_col,
        };
    }

    fn readString(self: *Self) Token {
        const start = self.pos;
        const start_line = self.line;
        const start_col = self.column;
        const quote = self.source[self.pos];
        self.advance(); // opening quote
        while (self.pos < self.source.len) {
            const c = self.source[self.pos];
            if (c == quote) {
                self.advance(); // closing quote
                break;
            } else if (c == '\\' and self.pos + 1 < self.source.len) {
                self.advance(); // backslash
                self.advance(); // escaped char
            } else {
                self.advance();
            }
        }
        const text = self.source[start..self.pos];
        return Token{
            .type = .string_literal,
            .text = text,
            .line = start_line,
            .column = start_col,
        };
    }

    fn makeToken(self: *Self, tok_type: TokenType, len: usize) Token {
        return Token{
            .type = tok_type,
            .text = self.source[self.pos - len .. self.pos],
            .line = self.line,
            .column = self.column - @as(u32, @intCast(len)),
        };
    }
};

// ============================================================================
// Helper functions
// ============================================================================

fn isAlpha(c: u8) bool {
    return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');
}

fn isDigit(c: u8) bool {
    return c >= '0' and c <= '9';
}

fn isAlphaNum(c: u8) bool {
    return isAlpha(c) or isDigit(c);
}

// GQL keywords are case-insensitive; this table matches the all-uppercase and
// all-lowercase spellings (mixed case falls through to identifier).
const keyword_map = std.StaticStringMap(TokenType).initComptime(.{
    .{ "MATCH", .match },
    .{ "match", .match },
    .{ "CREATE", .create },
    .{ "create", .create },
    .{ "DELETE", .delete },
    .{ "delete", .delete },
    .{ "RETURN", .return_keyword },
    .{ "return", .return_keyword },
    .{ "WHERE", .where },
    .{ "where", .where },
    .{ "AS", .as_keyword },
    .{ "as", .as_keyword },
    .{ "AND", .and_keyword },
    .{ "and", .and_keyword },
    .{ "OR", .or_keyword },
    .{ "or", .or_keyword },
    .{ "NOT", .not_keyword },
    .{ "not", .not_keyword },
    .{ "NULL", .null_keyword },
    .{ "null", .null_keyword },
    .{ "TRUE", .true_keyword },
    .{ "true", .true_keyword },
    .{ "FALSE", .false_keyword },
    .{ "false", .false_keyword },
});

fn keywordFromString(text: []const u8) TokenType {
    return keyword_map.get(text) orelse .identifier;
}

// ============================================================================
// TESTS
// ============================================================================

test "Lexer: simple keywords" {
    const allocator = std.testing.allocator;
    const source = "MATCH (n) RETURN n";
    var lexer = Lexer.init(source, allocator);
    const tokens = try lexer.tokenize();
    defer allocator.free(tokens);

    try std.testing.expectEqual(TokenType.match, tokens[0].type);
    try std.testing.expectEqual(TokenType.left_paren, tokens[1].type);
    try std.testing.expectEqual(TokenType.identifier, tokens[2].type);
    try std.testing.expectEqual(TokenType.right_paren, tokens[3].type);
    try std.testing.expectEqual(TokenType.return_keyword, tokens[4].type);
    try std.testing.expectEqual(TokenType.identifier, tokens[5].type);
    try std.testing.expectEqual(TokenType.eof, tokens[6].type);
}
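
// Added coverage (not in the original suite): the two-character comparison
// operators share a first character with `<` and `>`, so check maximal munch.
test "Lexer: comparison operators" {
    const allocator = std.testing.allocator;
    const source = "< <= <> >= > =";
    var lexer = Lexer.init(source, allocator);
    const tokens = try lexer.tokenize();
    defer allocator.free(tokens);

    try std.testing.expectEqual(TokenType.lt, tokens[0].type);
    try std.testing.expectEqual(TokenType.lte, tokens[1].type);
    try std.testing.expectEqual(TokenType.neq, tokens[2].type);
    try std.testing.expectEqual(TokenType.gte, tokens[3].type);
    try std.testing.expectEqual(TokenType.gt, tokens[4].type);
    try std.testing.expectEqual(TokenType.eq, tokens[5].type);
}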
test "Lexer: arrow operators" {
const allocator = std.testing.allocator;
const source = "-> <-";
var lexer = Lexer.init(source, allocator);
const tokens = try lexer.tokenize();
defer allocator.free(tokens);
try std.testing.expectEqual(TokenType.arrow_right, tokens[0].type);
try std.testing.expectEqual(TokenType.arrow_left, tokens[1].type);
}
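
// Added coverage: both comment styles are consumed by skipWhitespace and never
// surface as tokens, while a lone '/' still lexes as the division operator.
test "Lexer: comments and slash" {
    const allocator = std.testing.allocator;
    const source = "a // line comment\n/* block */ / b";
    var lexer = Lexer.init(source, allocator);
    const tokens = try lexer.tokenize();
    defer allocator.free(tokens);

    try std.testing.expectEqual(TokenType.identifier, tokens[0].type);
    try std.testing.expectEqual(TokenType.slash, tokens[1].type);
    try std.testing.expectEqual(TokenType.identifier, tokens[2].type);
    try std.testing.expectEqual(TokenType.eof, tokens[3].type);
}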
test "Lexer: string literal" {
const allocator = std.testing.allocator;
const source = "\"hello world\"";
var lexer = Lexer.init(source, allocator);
const tokens = try lexer.tokenize();
defer allocator.free(tokens);
try std.testing.expectEqual(TokenType.string_literal, tokens[0].type);
try std.testing.expectEqualStrings("\"hello world\"", tokens[0].text);
}
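
// Added coverage: a backslash escape must not terminate the literal early.
test "Lexer: string with escaped quote" {
    const allocator = std.testing.allocator;
    const source = "\"say \\\"hi\\\"\"";
    var lexer = Lexer.init(source, allocator);
    const tokens = try lexer.tokenize();
    defer allocator.free(tokens);

    try std.testing.expectEqual(TokenType.string_literal, tokens[0].type);
    try std.testing.expectEqualStrings("\"say \\\"hi\\\"\"", tokens[0].text);
    try std.testing.expectEqual(TokenType.eof, tokens[1].type);
}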
test "Lexer: numbers" {
const allocator = std.testing.allocator;
const source = "42 3.14";
var lexer = Lexer.init(source, allocator);
const tokens = try lexer.tokenize();
defer allocator.free(tokens);
try std.testing.expectEqual(TokenType.integer_literal, tokens[0].type);
try std.testing.expectEqual(TokenType.float_literal, tokens[1].type);
}
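
// Added coverage: `1..3` must lex as integer, dot, dot, integer rather than a
// float followed by garbage (readNumber stops before a `..` range operator).
test "Lexer: integer before range operator" {
    const allocator = std.testing.allocator;
    const source = "1..3";
    var lexer = Lexer.init(source, allocator);
    const tokens = try lexer.tokenize();
    defer allocator.free(tokens);

    try std.testing.expectEqual(TokenType.integer_literal, tokens[0].type);
    try std.testing.expectEqualStrings("1", tokens[0].text);
    try std.testing.expectEqual(TokenType.dot, tokens[1].type);
    try std.testing.expectEqual(TokenType.dot, tokens[2].type);
    try std.testing.expectEqual(TokenType.integer_literal, tokens[3].type);
    try std.testing.expectEqualStrings("3", tokens[3].text);
}

// Added coverage: line/column bookkeeping across a newline (advance resets
// column to 1 on '\n' and increments line).
test "Lexer: line and column tracking" {
    const allocator = std.testing.allocator;
    const source = "a\n  b";
    var lexer = Lexer.init(source, allocator);
    const tokens = try lexer.tokenize();
    defer allocator.free(tokens);

    try std.testing.expectEqual(@as(u32, 1), tokens[0].line);
    try std.testing.expectEqual(@as(u32, 1), tokens[0].column);
    try std.testing.expectEqual(@as(u32, 2), tokens[1].line);
    try std.testing.expectEqual(@as(u32, 3), tokens[1].column);
}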