1
0
Fork 0

Improve lexer

This commit is contained in:
Alex Kotov 2023-05-06 22:49:54 +04:00
parent bb100af429
commit b87c4dfbd4
Signed by: kotovalexarian
GPG key ID: 553C0EBBEB5D5F08

View file

@ -13,8 +13,10 @@ static bool State_to_token_type(
enum TokenType *token_type enum TokenType *token_type
); );
static void buffer_add(Lexer lexer, char chr); static void token_start(Lexer lexer, enum Lexer_State state);
static void token_add(Lexer lexer); static void token_start_putc(Lexer lexer, enum Lexer_State state, char chr);
static void token_putc(Lexer lexer, char chr);
static void token_finish(Lexer lexer);
Lexer Lexer_new(const Tokens tokens) Lexer Lexer_new(const Tokens tokens)
{ {
@ -60,7 +62,25 @@ State_to_token_type(const enum Lexer_State state, enum TokenType *const token_ty
return true; return true;
} }
void buffer_add(const Lexer lexer, char chr) void token_start(const Lexer lexer, const enum Lexer_State state)
{
assert(lexer);
lexer->state = state;
lexer->buffer_index = 0;
lexer->buffer[0] = '\0';
}
// Start a new token in `state` and hand `chr` to it as its first character.
//
// Convenience wrapper for the common "switch state, then store the current
// character" sequence: token_start() sets the lexer state and resets the
// token buffer, then token_putc() receives `chr`.
//
// lexer - lexer to update (token_start() asserts it is non-null)
// state - lexer state the new token begins in
// chr   - character passed to token_putc() after the reset
void token_start_putc(
const Lexer lexer,
const enum Lexer_State state,
const char chr
) {
token_start(lexer, state);
token_putc(lexer, chr);
}
void token_putc(const Lexer lexer, char chr)
{ {
assert(lexer); assert(lexer);
assert(lexer->buffer_index < LEXER_BUFFER_SLEN); assert(lexer->buffer_index < LEXER_BUFFER_SLEN);
@ -70,7 +90,7 @@ void buffer_add(const Lexer lexer, char chr)
lexer->buffer[lexer->buffer_index] = '\0'; lexer->buffer[lexer->buffer_index] = '\0';
} }
void token_add(const Lexer lexer) void token_finish(const Lexer lexer)
{ {
assert(lexer); assert(lexer);
@ -95,102 +115,83 @@ void Lexer_lex(const Lexer self, const char chr)
switch (self->state) { switch (self->state) {
case STATE_INIT: case STATE_INIT:
if (chr == ';') { if (chr == ';') {
self->state = STATE_COMMENT_LINE; token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') { } else if (chr == '(') {
self->state = STATE_ROUND_OPEN; token_start_putc(self, STATE_ROUND_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ')') { } else if (chr == ')') {
self->state = STATE_ROUND_CLOSE; token_start_putc(self, STATE_ROUND_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '[') { } else if (chr == '[') {
self->state = STATE_SQUARE_OPEN; token_start_putc(self, STATE_SQUARE_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ']') { } else if (chr == ']') {
self->state = STATE_SQUARE_CLOSE; token_start_putc(self, STATE_SQUARE_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '{') { } else if (chr == '{') {
self->state = STATE_CURLY_OPEN; token_start_putc(self, STATE_CURLY_OPEN, chr);
buffer_add(self, chr);
} else if (chr == '}') { } else if (chr == '}') {
self->state = STATE_CURLY_CLOSE; token_start_putc(self, STATE_CURLY_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '\'') { } else if (chr == '\'') {
self->state = STATE_QUOTE; token_start_putc(self, STATE_QUOTE, chr);
buffer_add(self, chr);
} else if (chr == '#') { } else if (chr == '#') {
self->state = STATE_SHARP; token_start(self, STATE_SHARP);
} else if (is_space(chr)) { } else if (is_space(chr)) {
self->state = STATE_WHITESPACE; token_start_putc(self, STATE_WHITESPACE, chr);
buffer_add(self, chr);
} else if (is_ident_head(chr)) { } else if (is_ident_head(chr)) {
self->state = STATE_IDENT; token_start_putc(self, STATE_IDENT, chr);
buffer_add(self, chr);
} else if (is_number(chr)) { } else if (is_number(chr)) {
self->state = STATE_NUM; token_start_putc(self, STATE_NUM, chr);
buffer_add(self, chr);
} else if (chr == '"') { } else if (chr == '"') {
self->state = STATE_STRING_START; token_start(self, STATE_STRING_START);
} else { } else {
assert(0); assert(0);
} }
break; break;
case STATE_WHITESPACE: case STATE_WHITESPACE:
if (chr == ';') { if (chr == ';') {
self->state = STATE_COMMENT_LINE; token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') { } else if (chr == '(') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_OPEN; token_start_putc(self, STATE_ROUND_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ')') { } else if (chr == ')') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_CLOSE; token_start_putc(self, STATE_ROUND_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '[') { } else if (chr == '[') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_OPEN; token_start_putc(self, STATE_SQUARE_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ']') { } else if (chr == ']') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_CLOSE; token_start_putc(self, STATE_SQUARE_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '{') { } else if (chr == '{') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_OPEN; token_start_putc(self, STATE_CURLY_OPEN, chr);
buffer_add(self, chr);
} else if (chr == '}') { } else if (chr == '}') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_CLOSE; token_start_putc(self, STATE_CURLY_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '\'') { } else if (chr == '\'') {
token_add(self); token_finish(self);
self->state = STATE_QUOTE; token_start_putc(self, STATE_QUOTE, chr);
buffer_add(self, chr);
} else if (chr == '#') { } else if (chr == '#') {
token_add(self); token_finish(self);
self->state = STATE_SHARP; token_start(self, STATE_SHARP);
} else if (is_space(chr)) { } else if (is_space(chr)) {
buffer_add(self, chr); token_putc(self, chr);
} else if (is_ident_head(chr)) { } else if (is_ident_head(chr)) {
token_add(self); token_finish(self);
self->state = STATE_IDENT; token_start_putc(self, STATE_IDENT, chr);
buffer_add(self, chr);
} else if (is_number(chr)) { } else if (is_number(chr)) {
token_add(self); token_finish(self);
self->state = STATE_NUM; token_start_putc(self, STATE_NUM, chr);
buffer_add(self, chr);
} else if (chr == '"') { } else if (chr == '"') {
token_add(self); token_finish(self);
self->state = STATE_STRING_START; token_start(self, STATE_STRING_START);
} else { } else {
assert(0); assert(0);
} }
break; break;
case STATE_COMMENT_LINE: case STATE_COMMENT_LINE:
if (chr == '\n') { if (chr == '\n') {
token_add(self); token_finish(self);
self->state = STATE_WHITESPACE; token_start(self, STATE_WHITESPACE);
} else { } else {
buffer_add(self, chr); token_putc(self, chr);
} }
break; break;
case STATE_ROUND_OPEN: case STATE_ROUND_OPEN:
@ -201,61 +202,50 @@ void Lexer_lex(const Lexer self, const char chr)
case STATE_CURLY_CLOSE: case STATE_CURLY_CLOSE:
case STATE_QUOTE: case STATE_QUOTE:
if (chr == ';') { if (chr == ';') {
self->state = STATE_COMMENT_LINE; token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') { } else if (chr == '(') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_OPEN; token_start_putc(self, STATE_ROUND_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ')') { } else if (chr == ')') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_CLOSE; token_start_putc(self, STATE_ROUND_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '[') { } else if (chr == '[') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_OPEN; token_start_putc(self, STATE_SQUARE_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ']') { } else if (chr == ']') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_CLOSE; token_start_putc(self, STATE_SQUARE_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '{') { } else if (chr == '{') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_OPEN; token_start_putc(self, STATE_CURLY_OPEN, chr);
buffer_add(self, chr);
} else if (chr == '}') { } else if (chr == '}') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_CLOSE; token_start_putc(self, STATE_CURLY_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '\'') { } else if (chr == '\'') {
token_add(self); token_finish(self);
self->state = STATE_QUOTE; token_start_putc(self, STATE_QUOTE, chr);
buffer_add(self, chr);
} else if (chr == '#') { } else if (chr == '#') {
token_add(self); token_finish(self);
self->state = STATE_SHARP; token_start(self, STATE_SHARP);
} else if (is_space(chr)) { } else if (is_space(chr)) {
token_add(self); token_finish(self);
self->state = STATE_WHITESPACE; token_start_putc(self, STATE_WHITESPACE, chr);
buffer_add(self, chr);
} else if (is_ident_head(chr)) { } else if (is_ident_head(chr)) {
token_add(self); token_finish(self);
self->state = STATE_IDENT; token_start_putc(self, STATE_IDENT, chr);
buffer_add(self, chr);
} else if (is_number(chr)) { } else if (is_number(chr)) {
token_add(self); token_finish(self);
self->state = STATE_NUM; token_start_putc(self, STATE_NUM, chr);
buffer_add(self, chr);
} else if (chr == '"') { } else if (chr == '"') {
token_add(self); token_finish(self);
self->state = STATE_STRING_START; token_start(self, STATE_STRING_START);
} else { } else {
assert(0); assert(0);
} }
break; break;
case STATE_SHARP: case STATE_SHARP:
if (is_tag(chr)) { if (is_tag(chr)) {
self->state = STATE_TAG; token_start_putc(self, STATE_TAG, chr);
buffer_add(self, chr);
} else { } else {
assert(0); assert(0);
} }
@ -264,38 +254,31 @@ void Lexer_lex(const Lexer self, const char chr)
if (chr == ';') { if (chr == ';') {
self->state = STATE_COMMENT_LINE; self->state = STATE_COMMENT_LINE;
} else if (chr == '(') { } else if (chr == '(') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_OPEN; token_start_putc(self, STATE_ROUND_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ')') { } else if (chr == ')') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_CLOSE; token_start_putc(self, STATE_ROUND_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '[') { } else if (chr == '[') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_OPEN; token_start_putc(self, STATE_SQUARE_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ']') { } else if (chr == ']') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_CLOSE; token_start_putc(self, STATE_SQUARE_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '{') { } else if (chr == '{') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_OPEN; token_start_putc(self, STATE_CURLY_OPEN, chr);
buffer_add(self, chr);
} else if (chr == '}') { } else if (chr == '}') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_CLOSE; token_start_putc(self, STATE_CURLY_CLOSE, chr);
buffer_add(self, chr);
} else if (is_space(chr)) { } else if (is_space(chr)) {
token_add(self); token_finish(self);
self->state = STATE_WHITESPACE; token_start_putc(self, STATE_WHITESPACE, chr);
buffer_add(self, chr);
} else if (is_tag(chr)) { } else if (is_tag(chr)) {
buffer_add(self, chr); token_putc(self, chr);
} else if (chr == '"') { } else if (chr == '"') {
token_add(self); token_finish(self);
self->state = STATE_STRING_START; token_start(self, STATE_STRING_START);
} else { } else {
assert(0); assert(0);
} }
@ -304,78 +287,64 @@ void Lexer_lex(const Lexer self, const char chr)
if (chr == ';') { if (chr == ';') {
self->state = STATE_COMMENT_LINE; self->state = STATE_COMMENT_LINE;
} else if (chr == '(') { } else if (chr == '(') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_OPEN; token_start_putc(self, STATE_ROUND_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ')') { } else if (chr == ')') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_CLOSE; token_start_putc(self, STATE_ROUND_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '[') { } else if (chr == '[') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_OPEN; token_start_putc(self, STATE_SQUARE_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ']') { } else if (chr == ']') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_CLOSE; token_start_putc(self, STATE_SQUARE_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '{') { } else if (chr == '{') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_OPEN; token_start_putc(self, STATE_CURLY_OPEN, chr);
buffer_add(self, chr);
} else if (chr == '}') { } else if (chr == '}') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_CLOSE; token_start_putc(self, STATE_CURLY_CLOSE, chr);
buffer_add(self, chr);
} else if (is_space(chr)) { } else if (is_space(chr)) {
token_add(self); token_finish(self);
self->state = STATE_WHITESPACE; token_start_putc(self, STATE_WHITESPACE, chr);
buffer_add(self, chr);
} else if (is_ident_tail(chr)) { } else if (is_ident_tail(chr)) {
buffer_add(self, chr); token_putc(self, chr);
} else if (chr == '"') { } else if (chr == '"') {
token_add(self); token_finish(self);
self->state = STATE_STRING_START; token_start(self, STATE_STRING_START);
} else { } else {
assert(0); assert(0);
} }
break; break;
case STATE_NUM: case STATE_NUM:
if (chr == ';') { if (chr == ';') {
self->state = STATE_COMMENT_LINE; token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') { } else if (chr == '(') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_OPEN; token_start_putc(self, STATE_ROUND_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ')') { } else if (chr == ')') {
token_add(self); token_finish(self);
self->state = STATE_ROUND_CLOSE; token_start_putc(self, STATE_ROUND_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '[') { } else if (chr == '[') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_OPEN; token_start_putc(self, STATE_SQUARE_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ']') { } else if (chr == ']') {
token_add(self); token_finish(self);
self->state = STATE_SQUARE_CLOSE; token_start_putc(self, STATE_SQUARE_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '{') { } else if (chr == '{') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_OPEN; token_start_putc(self, STATE_CURLY_OPEN, chr);
buffer_add(self, chr);
} else if (chr == '}') { } else if (chr == '}') {
token_add(self); token_finish(self);
self->state = STATE_CURLY_CLOSE; token_start_putc(self, STATE_CURLY_CLOSE, chr);
buffer_add(self, chr);
} else if (is_space(chr)) { } else if (is_space(chr)) {
token_add(self); token_finish(self);
self->state = STATE_WHITESPACE; token_start_putc(self, STATE_WHITESPACE, chr);
buffer_add(self, chr);
} else if (is_number(chr)) { } else if (is_number(chr)) {
buffer_add(self, chr); token_putc(self, chr);
} else if (chr == '"') { } else if (chr == '"') {
token_add(self); token_finish(self);
self->state = STATE_STRING_START; token_start(self, STATE_STRING_START);
} else { } else {
assert(0); assert(0);
} }
@ -383,49 +352,39 @@ void Lexer_lex(const Lexer self, const char chr)
case STATE_STRING_START: case STATE_STRING_START:
case STATE_STRING_INSIDE: case STATE_STRING_INSIDE:
if (chr == '"') { if (chr == '"') {
token_add(self); token_finish(self);
self->state = STATE_STRING_END; token_start(self, STATE_STRING_END);
} else { } else {
buffer_add(self, chr); token_putc(self, chr);
} }
break; break;
case STATE_STRING_END: case STATE_STRING_END:
if (chr == ';') { if (chr == ';') {
self->state = STATE_COMMENT_LINE; token_start(self, STATE_COMMENT_LINE);
} else if (chr == '(') { } else if (chr == '(') {
self->state = STATE_ROUND_OPEN; token_start_putc(self, STATE_ROUND_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ')') { } else if (chr == ')') {
self->state = STATE_ROUND_CLOSE; token_start_putc(self, STATE_ROUND_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '[') { } else if (chr == '[') {
self->state = STATE_SQUARE_OPEN; token_start_putc(self, STATE_SQUARE_OPEN, chr);
buffer_add(self, chr);
} else if (chr == ']') { } else if (chr == ']') {
self->state = STATE_SQUARE_CLOSE; token_start_putc(self, STATE_SQUARE_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '{') { } else if (chr == '{') {
self->state = STATE_CURLY_OPEN; token_start_putc(self, STATE_CURLY_OPEN, chr);
buffer_add(self, chr);
} else if (chr == '}') { } else if (chr == '}') {
self->state = STATE_CURLY_CLOSE; token_start_putc(self, STATE_CURLY_CLOSE, chr);
buffer_add(self, chr);
} else if (chr == '\'') { } else if (chr == '\'') {
self->state = STATE_QUOTE; token_start_putc(self, STATE_QUOTE, chr);
buffer_add(self, chr);
} else if (chr == '#') { } else if (chr == '#') {
self->state = STATE_SHARP; token_start(self, STATE_SHARP);
} else if (is_space(chr)) { } else if (is_space(chr)) {
self->state = STATE_WHITESPACE; token_start_putc(self, STATE_WHITESPACE, chr);
buffer_add(self, chr);
} else if (is_ident_head(chr)) { } else if (is_ident_head(chr)) {
self->state = STATE_IDENT; token_start_putc(self, STATE_IDENT, chr);
buffer_add(self, chr);
} else if (is_number(chr)) { } else if (is_number(chr)) {
self->state = STATE_NUM; token_start_putc(self, STATE_NUM, chr);
buffer_add(self, chr);
} else if (chr == '"') { } else if (chr == '"') {
self->state = STATE_STRING_START; token_start(self, STATE_STRING_START);
} else { } else {
assert(0); assert(0);
} }