From a9cb228861a6b0fad4d508c05c0614757a7f0a34 Mon Sep 17 00:00:00 2001
From: nasr
Date: Mon, 13 Apr 2026 14:58:49 +0200
Subject: refactor(main): refactor directory structure
---
 source/tb_db/csv_decoder.h | 294 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 source/tb_db/csv_decoder.h

(limited to 'source/tb_db/csv_decoder.h')

diff --git a/source/tb_db/csv_decoder.h b/source/tb_db/csv_decoder.h
new file mode 100644
index 0000000..3d09dc6
--- /dev/null
+++ b/source/tb_db/csv_decoder.h
@@ -0,0 +1,294 @@
+#ifndef ENGINE_LEXER_H // NOTE(review): guard name is stale after the move to csv_decoder.h; consider TB_DB_CSV_DECODER_H
+#define ENGINE_LEXER_H
+// Flags carried by a csv_token alongside its type.
+enum csv_token_flags
+{
+  FL = 1 << 2, // set on the token whose field was terminated by '\n' (end of row)
+};
+// Token codes start above 255 so a raw byte value can double as its own token code.
+enum csv_token_type
+{
+  // values 0..254 are reserved for single byte/ascii tokens
+  TOKEN_UNDEFINED = 255,
+  TOKEN_IDENTIFIER,
+  TOKEN_VALUE, // a CSV cell; the only type tokenize_csv currently emits
+};
+// One lexed CSV field; the lexeme is a slice into the input buffer (no copy made).
+typedef struct csv_token csv_token;
+struct csv_token
+{
+  string8 lexeme; // points into the caller's buffer, not NUL-terminated
+  csv_token *next_token; // singly linked, terminated by the &nil_csv_token sentinel
+  enum csv_token_type type;
+  enum csv_token_flags flags;
+};
+
+// NOTE(nasr): i dont think im going to use this.
+typedef struct csv_row csv_row;
+struct csv_row
+{
+  // array of size col_count, points into mmap buffer
+  string8 *fields;
+  s32 count;
+};
+
+#if 0
+typedef struct csv_lntity csv_entity; // NOTE(review): tag typo, presumably meant csv_entity (dead code, harmless)
+struct csv_entity
+{
+  //- not needed because we use key header mapping i think
+};
+#endif
+
+// One column-name cell; headers form a singly linked list terminated by &nil_csv_header.
+typedef struct csv_header csv_header;
+struct csv_header
+{
+  string8 payload;
+  csv_header *next_header;
+};
+
+// Bookkeeping built while tokenizing: column names from the first row plus counts.
+typedef struct csv_table csv_table;
+struct csv_table
+{
+  // header: list of column names (first row); rows themselves are only counted
+  csv_header *header;
+  s32 row_count;
+  s32 header_count;
+  b32 finding_headers; // TRUE while the first (header) row is still being lexed
+};
+
+// Token list with O(1) append via a tail pointer; empty list uses the nil sentinels.
+typedef struct csv_token_list csv_token_list;
+struct csv_token_list
+{
+  csv_token *start_token;
+  csv_token *end_token;
+};
+
+// Self-referential nil sentinels: traversal can always dereference next without NULL checks.
+read_only global_variable
+csv_token nil_csv_token=
+{
+  .lexeme = {.data = NULL, .size = 0},
+  .type = 0, // NOTE(review): 0 is a raw-byte token code here, not TOKEN_UNDEFINED (255)
+  .flags = 0,
+  .next_token = &nil_csv_token,
+};
+
+read_only global_variable
+csv_header nil_csv_header =
+{
+  .payload = {.data = NULL, .size = 0},
+  .next_header = &nil_csv_header,
+};
+
+read_only global_variable
+csv_token_list nil_csv_token_list =
+{
+  .start_token = &nil_csv_token,
+  .end_token = &nil_csv_token,
+};
+
+read_only global_variable
+csv_row nil_csv_row =
+{
+  .fields = &nil_string,
+  .count = 0,
+};
+
+read_only global_variable
+csv_table nil_csv_table =
+{
+  .header = &nil_csv_header,
+  .row_count = 0,
+};
+
+#endif /* ENGINE_LEXER_H */ // BUG(review): guard closes here, so every function below is OUTSIDE the include guard and will be redefined on multiple inclusion
+
+// Returns TRUE when token is NULL or the shared nil sentinel (list terminator).
+internal b32
+is_nil_csv_token(csv_token *token)
+{
+  return ((token == NULL) || (token == &nil_csv_token));
+}
+
+// Appends source_token at the tail of source_token_list.
+// TODO(nasr): segfaulting because end_token not allocated
+// NOTE(review): if end_token still points at the read_only nil sentinel, the first
+// store below writes into read-only data -- a list needs a real writable tail first.
+internal void
+csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
+{
+  source_token_list->end_token->next_token = source_token;
+  source_token_list->end_token = source_token;
+}
+
+//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to each other
+// NOTE(review): the per-node walk is O(n); linking destination->end_token->next_token to
+// source->start_token and copying source->end_token would be O(1). source is left untouched,
+// so both lists alias the same nodes afterwards -- TODO confirm that is intended.
+internal void
+csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
+{
+  if(is_nil_csv_token(source->start_token)) return;
+
+  csv_token *source_ct = source->start_token;
+  csv_token *destination_et = destination->end_token;
+
+  // walk source and stitch each node onto destination's tail
+  for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
+  {
+    destination_et->next_token = source_ct;
+    destination_et = source_ct;
+  }
+
+  // destination_et now points at the last real source node (not the nil sentinel)
+  destination->end_token = destination_et;
+}
+
+#if 0
+internal csv_token_list *
+parse_csv_row(string8 row_buffer)
+{
+  // csv_token_list *
+
+}
+#endif
+
+// The lexer acts as a table builder from a csv file, parsing individual rows and
+// columns; the next step would be building the b-tree (see parse_csv below).
+// Emits one TOKEN_VALUE token per cell into token_list; the token that ends a row
+// also carries the FL flag. Returns the first token, or NULL on empty input.
+internal csv_token *
+tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
+{
+  unused(token_list); // NOTE(review): stale -- token_list IS used below; this marker can go
+
+  if(buffer.size == 0) return NULL;
+
+  // URGENT(nasr): segfaulting because memcpy of string value doesnt work dammit
+  // NOPE ITS BECAUSE WE DONT LOAD CSV OR SOMETHING???
+  // forgot what the solution was
+  // TODO(nasr): check what the problem here was
+
+  // start index of the current field; tracked across loop iterations
+  s32 start = 0;
+
+  // BUG(review): terminates on '\0' instead of bounding by buffer.size, so the buffer
+  // must be NUL-terminated; a final field with no trailing '\n' is silently dropped.
+  for(s32 index = 0; buffer.data[index] != '\0'; ++index)
+  {
+    u8 point = buffer.data[index];
+
+#if 0
+    if(is_whitespace(point))
+    {
+      warn("csv file is invalid, detected whitespace");
+      return NULL;
+    }
+#endif
+
+    if(point == ',')
+    {
+      // emit a token for the field that ended before this comma
+      csv_token *token = PushStructZero(arena, csv_token);
+
+      assert_msg(token != NULL, "did the push struct fail??");
+      assert_msg(arena->current_position < arena->capacity, "no more arena size");
+
+      token->lexeme = StringCast(&buffer.data[start], index - start);
+      token->type = TOKEN_VALUE;
+      token->next_token = &nil_csv_token;
+      csv_token_list_append_token(token_list, token);
+
+      start = index + 1;
+
+      if(table->finding_headers)
+      {
+        table->header_count++; // BUG(review): only counted on ',' -- the last column ends at '\n', so header_count is columns-1
+      }
+    }
+    else if(point == '\n')
+    {
+      // emit a token for the field that ended at this newline; FL marks end-of-row
+      csv_token *token = PushStructZero(arena, csv_token);
+      token->lexeme = StringCast(&buffer.data[start], index - start);
+      token->type = TOKEN_VALUE;
+      token->flags |= FL;
+      token->next_token = &nil_csv_token;
+
+      assert_msg(token_list, "token list invalid");
+      assert_msg(token, "you're tring to append an invalid token");
+
+      csv_token_list_append_token(token_list, token);
+
+      start = index + 1;
+
+      if(table->finding_headers)
+      {
+        {
+          //- map new header token list to table headers (TODO(review): not implemented)
+        }
+        table->finding_headers = FALSE;
+      }
+
+      table->row_count++;
+    }
+  }
+
+  // NOTE(nasr): return the first token; the caller can walk the list from token_list
+  return token_list->start_token;
+}
+
+//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
+// Walks the token list and inserts every cell token into a fresh btree keyed by
+// (header_index = column, row_index = row). Returns the tree.
+internal btree *
+parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
+{
+  btree *tree = PushStructZero(arena, btree);
+
+  s32 col_index = 0;
+  s32 row_index = 0;
+
+  // iterate over the token list while the token is not nil
+  for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
+  {
+    {
+      //- are we parsing the first line tokens?
+      //- if so, do something :))
+      if(ct->flags & FL)
+      {
+        // NOTE(nasr): FL marks end-of-line; advance row, reset col
+        row_index++;
+        col_index = 0;
+
+        // TODO(nasr): replace with nil header check function
+        // NOTE(nasr): == nil means header hasn't been set yet
+        if(table->header == &nil_csv_header || table->header == NULL)
+        {
+#if 0
+          // - no this should happen in the tokenization
+          table->headers->next =
+#endif
+        }
+        else
+        {
+
+        }
+
+        // BUG(review): tokenize_csv emits the row's LAST field as a TOKEN_VALUE with FL
+        // set, so this continue drops the final column of every row from the tree
+        // instead of indexing its value.
+        continue;
+      }
+    }
+
+    // skip non-value tokens, only index actual cell values
+    if (ct->type != TOKEN_VALUE)
+    {
+      col_index++;
+      continue;
+    }
+
+    // NOTE(nasr): payload is the token itself (presumably "cten" meant token -- verify)
+    // so the caller can reach row/col metadata without us having to copy it
+    key k = {
+      .header_index = col_index,
+      .row_index = row_index,
+    };
+
+    btree_insert(arena, tree, k, (void *)ct);
+
+    col_index++;
+  }
+
+  return tree;
+}
-- 
cgit v1.3