1
0
Fork 0

Improve lexer

This commit is contained in:
Alex Kotov 2023-05-06 22:49:54 +04:00
parent bb100af429
commit b87c4dfbd4
Signed by: kotovalexarian
GPG key ID: 553C0EBBEB5D5F08

View file

@ -13,8 +13,10 @@ static bool State_to_token_type(
enum TokenType *token_type
);
static void buffer_add(Lexer lexer, char chr);
static void token_add(Lexer lexer);
static void token_start(Lexer lexer, enum Lexer_State state);
static void token_start_putc(Lexer lexer, enum Lexer_State state, char chr);
static void token_putc(Lexer lexer, char chr);
static void token_finish(Lexer lexer);
Lexer Lexer_new(const Tokens tokens)
{
@ -60,7 +62,25 @@ State_to_token_type(const enum Lexer_State state, enum TokenType *const token_ty
return true;
}
void buffer_add(const Lexer lexer, char chr)
/*
 * Begin a new token in the given lexer state.
 *
 * Resets the lexer's character buffer to the empty string (index 0,
 * terminator in the first slot) and records `state` as the current state.
 * `lexer` must be non-NULL; this is checked with assert().
 */
void token_start(const Lexer lexer, const enum Lexer_State state)
{
    assert(lexer);

    /* Empty the buffer: terminator at the front, write position at 0. */
    lexer->buffer[0] = '\0';
    lexer->buffer_index = 0;

    lexer->state = state;
}
/*
 * Convenience wrapper: begin a new token in `state` and immediately pass
 * `chr` to token_putc() as that token's first character.
 */
void token_start_putc(const Lexer lexer, const enum Lexer_State state,
                      const char chr)
{
    /* token_start() must run first: it resets the buffer that
     * token_putc() then writes into. */
    token_start(lexer, state);
    token_putc(lexer, chr);
}
void token_putc(const Lexer lexer, char chr)
{
assert(lexer);
assert(lexer->buffer_index < LEXER_BUFFER_SLEN);
@ -70,7 +90,7 @@ void buffer_add(const Lexer lexer, char chr)
lexer->buffer[lexer->buffer_index] = '\0';
}
void token_add(const Lexer lexer)
void token_finish(const Lexer lexer)
{
assert(lexer);
@ -95,102 +115,83 @@ void Lexer_lex(const Lexer self, const char chr)
switch (self->state) {
case STATE_INIT:
if (chr == ';') {
self->state = STATE_COMMENT_LINE;
token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') {
self->state = STATE_ROUND_OPEN;
buffer_add(self, chr);
token_start_putc(self, STATE_ROUND_OPEN, chr);
} else if (chr == ')') {
self->state = STATE_ROUND_CLOSE;
buffer_add(self, chr);
token_start_putc(self, STATE_ROUND_CLOSE, chr);
} else if (chr == '[') {
self->state = STATE_SQUARE_OPEN;
buffer_add(self, chr);
token_start_putc(self, STATE_SQUARE_OPEN, chr);
} else if (chr == ']') {
self->state = STATE_SQUARE_CLOSE;
buffer_add(self, chr);
token_start_putc(self, STATE_SQUARE_CLOSE, chr);
} else if (chr == '{') {
self->state = STATE_CURLY_OPEN;
buffer_add(self, chr);
token_start_putc(self, STATE_CURLY_OPEN, chr);
} else if (chr == '}') {
self->state = STATE_CURLY_CLOSE;
buffer_add(self, chr);
token_start_putc(self, STATE_CURLY_CLOSE, chr);
} else if (chr == '\'') {
self->state = STATE_QUOTE;
buffer_add(self, chr);
token_start_putc(self, STATE_QUOTE, chr);
} else if (chr == '#') {
self->state = STATE_SHARP;
token_start(self, STATE_SHARP);
} else if (is_space(chr)) {
self->state = STATE_WHITESPACE;
buffer_add(self, chr);
token_start_putc(self, STATE_WHITESPACE, chr);
} else if (is_ident_head(chr)) {
self->state = STATE_IDENT;
buffer_add(self, chr);
token_start_putc(self, STATE_IDENT, chr);
} else if (is_number(chr)) {
self->state = STATE_NUM;
buffer_add(self, chr);
token_start_putc(self, STATE_NUM, chr);
} else if (chr == '"') {
self->state = STATE_STRING_START;
token_start(self, STATE_STRING_START);
} else {
assert(0);
}
break;
case STATE_WHITESPACE:
if (chr == ';') {
self->state = STATE_COMMENT_LINE;
token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') {
token_add(self);
self->state = STATE_ROUND_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_OPEN, chr);
} else if (chr == ')') {
token_add(self);
self->state = STATE_ROUND_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_CLOSE, chr);
} else if (chr == '[') {
token_add(self);
self->state = STATE_SQUARE_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_OPEN, chr);
} else if (chr == ']') {
token_add(self);
self->state = STATE_SQUARE_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_CLOSE, chr);
} else if (chr == '{') {
token_add(self);
self->state = STATE_CURLY_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_OPEN, chr);
} else if (chr == '}') {
token_add(self);
self->state = STATE_CURLY_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_CLOSE, chr);
} else if (chr == '\'') {
token_add(self);
self->state = STATE_QUOTE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_QUOTE, chr);
} else if (chr == '#') {
token_add(self);
self->state = STATE_SHARP;
token_finish(self);
token_start(self, STATE_SHARP);
} else if (is_space(chr)) {
buffer_add(self, chr);
token_putc(self, chr);
} else if (is_ident_head(chr)) {
token_add(self);
self->state = STATE_IDENT;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_IDENT, chr);
} else if (is_number(chr)) {
token_add(self);
self->state = STATE_NUM;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_NUM, chr);
} else if (chr == '"') {
token_add(self);
self->state = STATE_STRING_START;
token_finish(self);
token_start(self, STATE_STRING_START);
} else {
assert(0);
}
break;
case STATE_COMMENT_LINE:
if (chr == '\n') {
token_add(self);
self->state = STATE_WHITESPACE;
token_finish(self);
token_start(self, STATE_WHITESPACE);
} else {
buffer_add(self, chr);
token_putc(self, chr);
}
break;
case STATE_ROUND_OPEN:
@ -201,61 +202,50 @@ void Lexer_lex(const Lexer self, const char chr)
case STATE_CURLY_CLOSE:
case STATE_QUOTE:
if (chr == ';') {
self->state = STATE_COMMENT_LINE;
token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') {
token_add(self);
self->state = STATE_ROUND_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_OPEN, chr);
} else if (chr == ')') {
token_add(self);
self->state = STATE_ROUND_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_CLOSE, chr);
} else if (chr == '[') {
token_add(self);
self->state = STATE_SQUARE_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_OPEN, chr);
} else if (chr == ']') {
token_add(self);
self->state = STATE_SQUARE_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_CLOSE, chr);
} else if (chr == '{') {
token_add(self);
self->state = STATE_CURLY_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_OPEN, chr);
} else if (chr == '}') {
token_add(self);
self->state = STATE_CURLY_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_CLOSE, chr);
} else if (chr == '\'') {
token_add(self);
self->state = STATE_QUOTE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_QUOTE, chr);
} else if (chr == '#') {
token_add(self);
self->state = STATE_SHARP;
token_finish(self);
token_start(self, STATE_SHARP);
} else if (is_space(chr)) {
token_add(self);
self->state = STATE_WHITESPACE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_WHITESPACE, chr);
} else if (is_ident_head(chr)) {
token_add(self);
self->state = STATE_IDENT;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_IDENT, chr);
} else if (is_number(chr)) {
token_add(self);
self->state = STATE_NUM;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_NUM, chr);
} else if (chr == '"') {
token_add(self);
self->state = STATE_STRING_START;
token_finish(self);
token_start(self, STATE_STRING_START);
} else {
assert(0);
}
break;
case STATE_SHARP:
if (is_tag(chr)) {
self->state = STATE_TAG;
buffer_add(self, chr);
token_start_putc(self, STATE_TAG, chr);
} else {
assert(0);
}
@ -264,38 +254,31 @@ void Lexer_lex(const Lexer self, const char chr)
if (chr == ';') {
self->state = STATE_COMMENT_LINE;
} else if (chr == '(') {
token_add(self);
self->state = STATE_ROUND_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_OPEN, chr);
} else if (chr == ')') {
token_add(self);
self->state = STATE_ROUND_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_CLOSE, chr);
} else if (chr == '[') {
token_add(self);
self->state = STATE_SQUARE_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_OPEN, chr);
} else if (chr == ']') {
token_add(self);
self->state = STATE_SQUARE_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_CLOSE, chr);
} else if (chr == '{') {
token_add(self);
self->state = STATE_CURLY_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_OPEN, chr);
} else if (chr == '}') {
token_add(self);
self->state = STATE_CURLY_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_CLOSE, chr);
} else if (is_space(chr)) {
token_add(self);
self->state = STATE_WHITESPACE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_WHITESPACE, chr);
} else if (is_tag(chr)) {
buffer_add(self, chr);
token_putc(self, chr);
} else if (chr == '"') {
token_add(self);
self->state = STATE_STRING_START;
token_finish(self);
token_start(self, STATE_STRING_START);
} else {
assert(0);
}
@ -304,78 +287,64 @@ void Lexer_lex(const Lexer self, const char chr)
if (chr == ';') {
self->state = STATE_COMMENT_LINE;
} else if (chr == '(') {
token_add(self);
self->state = STATE_ROUND_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_OPEN, chr);
} else if (chr == ')') {
token_add(self);
self->state = STATE_ROUND_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_CLOSE, chr);
} else if (chr == '[') {
token_add(self);
self->state = STATE_SQUARE_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_OPEN, chr);
} else if (chr == ']') {
token_add(self);
self->state = STATE_SQUARE_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_CLOSE, chr);
} else if (chr == '{') {
token_add(self);
self->state = STATE_CURLY_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_OPEN, chr);
} else if (chr == '}') {
token_add(self);
self->state = STATE_CURLY_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_CLOSE, chr);
} else if (is_space(chr)) {
token_add(self);
self->state = STATE_WHITESPACE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_WHITESPACE, chr);
} else if (is_ident_tail(chr)) {
buffer_add(self, chr);
token_putc(self, chr);
} else if (chr == '"') {
token_add(self);
self->state = STATE_STRING_START;
token_finish(self);
token_start(self, STATE_STRING_START);
} else {
assert(0);
}
break;
case STATE_NUM:
if (chr == ';') {
self->state = STATE_COMMENT_LINE;
token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') {
token_add(self);
self->state = STATE_ROUND_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_OPEN, chr);
} else if (chr == ')') {
token_add(self);
self->state = STATE_ROUND_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_ROUND_CLOSE, chr);
} else if (chr == '[') {
token_add(self);
self->state = STATE_SQUARE_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_OPEN, chr);
} else if (chr == ']') {
token_add(self);
self->state = STATE_SQUARE_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_SQUARE_CLOSE, chr);
} else if (chr == '{') {
token_add(self);
self->state = STATE_CURLY_OPEN;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_OPEN, chr);
} else if (chr == '}') {
token_add(self);
self->state = STATE_CURLY_CLOSE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_CURLY_CLOSE, chr);
} else if (is_space(chr)) {
token_add(self);
self->state = STATE_WHITESPACE;
buffer_add(self, chr);
token_finish(self);
token_start_putc(self, STATE_WHITESPACE, chr);
} else if (is_number(chr)) {
buffer_add(self, chr);
token_putc(self, chr);
} else if (chr == '"') {
token_add(self);
self->state = STATE_STRING_START;
token_finish(self);
token_start(self, STATE_STRING_START);
} else {
assert(0);
}
@ -383,49 +352,39 @@ void Lexer_lex(const Lexer self, const char chr)
case STATE_STRING_START:
case STATE_STRING_INSIDE:
if (chr == '"') {
token_add(self);
self->state = STATE_STRING_END;
token_finish(self);
token_start(self, STATE_STRING_END);
} else {
buffer_add(self, chr);
token_putc(self, chr);
}
break;
case STATE_STRING_END:
if (chr == ';') {
self->state = STATE_COMMENT_LINE;
token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') {
self->state = STATE_ROUND_OPEN;
buffer_add(self, chr);
token_start_putc(self, STATE_ROUND_OPEN, chr);
} else if (chr == ')') {
self->state = STATE_ROUND_CLOSE;
buffer_add(self, chr);
token_start_putc(self, STATE_ROUND_CLOSE, chr);
} else if (chr == '[') {
self->state = STATE_SQUARE_OPEN;
buffer_add(self, chr);
token_start_putc(self, STATE_SQUARE_OPEN, chr);
} else if (chr == ']') {
self->state = STATE_SQUARE_CLOSE;
buffer_add(self, chr);
token_start_putc(self, STATE_SQUARE_CLOSE, chr);
} else if (chr == '{') {
self->state = STATE_CURLY_OPEN;
buffer_add(self, chr);
token_start_putc(self, STATE_CURLY_OPEN, chr);
} else if (chr == '}') {
self->state = STATE_CURLY_CLOSE;
buffer_add(self, chr);
token_start_putc(self, STATE_CURLY_CLOSE, chr);
} else if (chr == '\'') {
self->state = STATE_QUOTE;
buffer_add(self, chr);
token_start_putc(self, STATE_QUOTE, chr);
} else if (chr == '#') {
self->state = STATE_SHARP;
token_start(self, STATE_SHARP);
} else if (is_space(chr)) {
self->state = STATE_WHITESPACE;
buffer_add(self, chr);
token_start_putc(self, STATE_WHITESPACE, chr);
} else if (is_ident_head(chr)) {
self->state = STATE_IDENT;
buffer_add(self, chr);
token_start_putc(self, STATE_IDENT, chr);
} else if (is_number(chr)) {
self->state = STATE_NUM;
buffer_add(self, chr);
token_start_putc(self, STATE_NUM, chr);
} else if (chr == '"') {
self->state = STATE_STRING_START;
token_start(self, STATE_STRING_START);
} else {
assert(0);
}