#ifndef ENGINE_LEXER_H #define ENGINE_LEXER_H enum csv_token_flags { FL = 1 << 2, }; enum csv_token_type { // first 255 tokens for ascii characters TOKEN_UNDEFINED = 255, TOKEN_IDENTIFIER, TOKEN_VALUE, }; typedef struct csv_token csv_token; struct csv_token { string8 lexeme; csv_token *next_token; enum csv_token_type type; enum csv_token_flags flags; }; // NOTE(nasr): i dont think im going to use this. typedef struct csv_row csv_row; struct csv_row { // array of size col_count, points into mmap buffer string8 *fields; s32 count; }; #if 0 typedef struct csv_lntity csv_entity; struct csv_entity { //- not needed because we use key header mapping i think }; #endif typedef struct csv_header csv_header; struct csv_header { string8 payload; csv_header *next_header; }; typedef struct csv_table csv_table; struct csv_table { // first row, col names // all data rows csv_header *header; s32 row_count; s32 header_count; b32 finding_headers; }; typedef struct csv_token_list csv_token_list; struct csv_token_list { csv_token *start_token; csv_token *end_token; }; read_only global_variable csv_token nil_csv_token= { .lexeme = {.data = NULL, .size = 0}, .type = 0, .flags = 0, .next_token = &nil_csv_token, }; read_only global_variable csv_header nil_csv_header = { .payload = {.data = NULL, .size = 0}, .next_header = &nil_csv_header, }; read_only global_variable csv_token_list nil_csv_token_list = { .start_token = &nil_csv_token, .end_token = &nil_csv_token, }; read_only global_variable csv_row nil_csv_row = { .fields = &nil_string, .count = 0, }; read_only global_variable csv_table nil_csv_table = { .header = &nil_csv_header, .row_count = 0, }; #endif /* ENGINE_LEXER_H */ internal b32 is_nil_csv_token(csv_token *token) { return ((token == NULL) || (token == &nil_csv_token)); } // TODO(nasr): segfaulting because end_token not allocated internal void csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token) { source_token_list->end_token->next_token = source_token; source_token_list->end_token = source_token; } //- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother internal void csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source) { if(is_nil_csv_token(source->start_token)) return; csv_token *source_ct = source->start_token; csv_token *destination_et = destination->end_token; // walk source and stitch each node onto destination's tail for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token) { destination_et->next_token = source_ct; destination_et = source_ct; } // destination_et now points at the last real source node (not the nil sentinel) destination->end_token = destination_et; } #if 0 internal csv_token_list * parse_csv_row(string8 row_buffer) { // csv_token_list * } #endif // the lexer acts as a table builder from a csv file // and parsing indivudal rows and columns // the next step would be building a the b-tree internal csv_token * tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list) { unused(token_list); if(buffer.size == 0) return NULL; // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? // forgot what the solution was // TODO(nasr): check what the problem here was // string size tracking across the loop not inside it s32 start = 0; for(s32 index = 0; buffer.data[index] != '\0'; ++index) { u8 point = buffer.data[index]; #if 0 if(is_whitespace(point)) { warn("csv file is invalid, detected whitespace"); return NULL; } #endif if(point == ',') { // emit a token for the field that ended before this comma csv_token *token = PushStructZero(arena, csv_token); assert_msg(token != NULL, "did the push struct fail??"); assert_msg(arena->current_position < arena->capacity, "no more arena size"); token->lexeme = StringCast(&buffer.data[start], index - start); token->type = TOKEN_VALUE; token->next_token = &nil_csv_token; csv_token_list_append_token(token_list, token); start = index + 1; if(table->finding_headers) { table->header_count++; } } else if(point == '\n') { // emit a token for the field that ended at this newline csv_token *token = PushStructZero(arena, csv_token); token->lexeme = StringCast(&buffer.data[start], index - start); token->type = TOKEN_VALUE; token->flags |= FL; token->next_token = &nil_csv_token; assert_msg(token_list, "token list invalid"); assert_msg(token, "you're tring to append an invalid token"); csv_token_list_append_token(token_list, token); start = index + 1; if(table->finding_headers) { { //- map new header token list to table headers } table->finding_headers = FALSE; } table->row_count++; } } // NOTE(nasr): return the first token the caller can walk the list from token_list return token_list->start_token; } //- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future internal btree * parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table) { btree *tree = PushStructZero(arena, btree); s32 col_index = 0; s32 row_index = 0; // iterate over the token list while the token is not nil for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token) { { //- are we parsing the first line tokens? //- if so, do something :)) if(ct->flags & FL) { // NOTE(nasr): FL marks end-of-line; advance row, reset col row_index++; col_index = 0; // TODO(nasr): replace with nil header check function // NOTE(nasr): == nil means header hasn't been set yet if(table->header == &nil_csv_header || table->header == NULL) { #if 0 // - no this should happen in the tokenization table->headers->next = #endif } else { } // FL tokens are structural, no value to index continue; } } // skip non-value tokens, only index actual cell values if (ct->type != TOKEN_VALUE) { col_index++; continue; } // NOTE(nasr): payload is the cten itself so the caller can reach // row/col metadata without us having to copy it key k = { .header_index = col_index, .row_index = row_index, }; btree_insert(arena, tree, k, (void *)ct); col_index++; } return tree; }