From a9cb228861a6b0fad4d508c05c0614757a7f0a34 Mon Sep 17 00:00:00 2001 From: nasr Date: Mon, 13 Apr 2026 14:58:49 +0200 Subject: refactor(main): refactor directory structure --- Makefile | 4 +- source/btree_impl.h | 209 -------------------------------- source/csv_decoder.h | 294 --------------------------------------------- source/tb_db.c | 198 ------------------------------ source/tb_db/btree_impl.h | 209 ++++++++++++++++++++++++++++++++ source/tb_db/csv_decoder.h | 294 +++++++++++++++++++++++++++++++++++++++++++++ source/tb_db/tb_db.c | 198 ++++++++++++++++++++++++++++++ 7 files changed, 703 insertions(+), 703 deletions(-) delete mode 100644 source/btree_impl.h delete mode 100644 source/csv_decoder.h delete mode 100644 source/tb_db.c create mode 100644 source/tb_db/btree_impl.h create mode 100644 source/tb_db/csv_decoder.h create mode 100644 source/tb_db/tb_db.c diff --git a/Makefile b/Makefile index 29aa256..619c89f 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ -BIN = build/engine -SRC = source/tb_db.c +BIN = build/tb_db +SRC = source/tb_db/tb_db.c # CC = gcc CC = clang diff --git a/source/btree_impl.h b/source/btree_impl.h deleted file mode 100644 index b7015bd..0000000 --- a/source/btree_impl.h +++ /dev/null @@ -1,209 +0,0 @@ -/** - * >> api notes - * >> by Abdellah El Morabit - * - * - creation of a btree happens when doing an insertion on an - * empty btree where the root is null or nil - * - **/ - -/* single header b-tree implementation by Abdellah El Morabit */ -#ifndef BTREE_H -#define BTREE_H - -// maximum height of the tree the lower the lower the lower amount -// of disk reads which translates into faster? -#if 0 -global_variable read_only s16 btree_ORDER = 4; -#endif -#define BTREE_ORDER 4 - -//- NOTE(nasr): defining a key to improve sorting -// i think saying that a key is a combination of the column + row is a good way of appraoching this -typedef struct key key; -struct key -{ - s32 header_index; //- col number - s32 row_index; -}; - -typedef struct btree_node btree_node; -struct btree_node -{ - // store the key values of the sub nodes? if they are leaves? - key keys[BTREE_ORDER - 1]; - // TODO(nasr): replace with something more generic? - // NOTE(nasr): cons of void * -> no type safety - // is there a way to still have some sort of that? size not variable - void *payload_per_key[BTREE_ORDER - 1]; - btree_node *parent; - // handle to store children faster than linked list - // because now we can iterate over the nodes instead of having cache misses - // when jumping through linked nodes - btree_node *children[BTREE_ORDER]; - // this shouldn't change things but could be a performance improvement on a bigger scale?? - s32 *max_children; - // NOTE(nasr): reference count ::: check how many leaves are using this node - // also not needed for now because we don't free individual node because of arena allocator - // s32 *refc; - s32 key_count; - b32 leaf; - - - // NOTE(nasr): do we hold the reference to the arena? or do we pass is it as a reference? - // this could solve memory location issues? -}; - - -typedef struct btree btree; -struct btree -{ - // NOTE(nasr): make sure this always stays in memory - // so that initial fetch never requires a disk read - btree_node *root; -}; - - - -#ifdef BTREE_IMPLEMENTATION -/////////////////////////////////////////////////////////////////////////////// -//- btree.c - - -internal b32 -is_nil_btree_node(btree_node *node) -{ - // NOTE(nasr): looks like a useless function but could be usefull when we implement a nil_btree_node - return (node == NULL); -} - -// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? - -internal b8 -key_compare(key destination_key, key source_key) -{ - s32 source_index = source_key.header_index + source_key.row_index; - s32 destination_index = destination_key.header_index + destination_key.row_index; - - if(destination_index > source_index) return 1; - else if(destination_index < source_index) return -1; - else return 0; -} - -// NOTE(nasr): @return the index of the found element -//- @param: start node could be the root node for all we care. but this enables optimizations in the future if we need them -internal s32 -btree_find_ipos(key k, btree_node *start_node) -{ - s32 index = 0; - // scan right while the node's key at [index] is strictly less than k - while (index < start_node->key_count && key_compare(start_node->keys[index], k) < 0) - { - ++index; - } - return index; -} - -// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory -internal void * -btree_search(btree_node *node, key key) -{ - s32 index = btree_find_ipos(key, node); - - if (index < node->key_count && key_compare(node->keys[index], key) == 0) - { - return node->payload_per_key[index]; - } - if (node->leaf) - { - return NULL; - } - return btree_search(node->children[index], key); -} - -// TODO(nasr): split node when key_count == btree_ORDER - 1 (node is full) -// -- each node has max btree_ORDER - 1 keys -// -internal void -btree_insert(mem_arena *arena, btree *tree, key key, void *payload) -{ - if(is_nil_btree_node(tree->root)) - { - tree->root = PushStructZero(arena, btree_node); - tree->root->leaf = 1; - tree->root->key_count = 0; - } - - btree_node *current_node = tree->root; - - if (current_node->leaf) - { - // find the insertion position and shift keys right to make room - //- after: but what is this. we are expecting a new key.but we are looking for - //- its position in the tree is useless because it's non existent - - s32 i = btree_find_ipos(key, current_node); - - for (s32 j = current_node->key_count; j > i; --j) - { - current_node->keys[j] = current_node->keys[j - 1]; - current_node->payload_per_key[j] = current_node->payload_per_key[j - 1]; - } - - current_node->keys[i] = key; - current_node->payload_per_key[i] = payload; - - if(current_node->key_count < (BTREE_ORDER - 1)) - { - current_node->key_count += 1; - } - else { - // TODO(nasr): creating a new branch / tree? - // make a seperate function for this - // NOTE(nasr): only split through the parent when one actually exists; - // root splits need to be handled separately (promote mid key, create two children) - if(current_node->parent != NULL) - { - s32 split_pos = current_node->parent->key_count / 2 - 1; - s32 updated_keycount_p = 0; - for(s32 index = 0; index <= split_pos; ++index) - { - ++updated_keycount_p; - } - current_node->parent->key_count = updated_keycount_p; - } - } - } - // TODO(nasr): internal node case walk down then split on the way back up -} - - -internal void -btree_write(btree *bt) -{ - unused(bt); -#if 0 - temp_arena *temp_arena = temp_arena_begin(arena); - btree_node *root = bt->root; - - for(btree_node *bt = PushStruct(arena); root->next; next = root;) - { - - } - - temp_arena_end(temp); -#endif -} - -// - load the entire db into memory. -#if 0 -internal void -btree_load(mem_arena *arena) -{ - - -} -#endif - -#endif /* BTREE_IMPLEMENTATION */ -#endif /* BTREE_H */ diff --git a/source/csv_decoder.h b/source/csv_decoder.h deleted file mode 100644 index 3d09dc6..0000000 --- a/source/csv_decoder.h +++ /dev/null @@ -1,294 +0,0 @@ -#ifndef ENGINE_LEXER_H -#define ENGINE_LEXER_H - -enum csv_token_flags -{ - FL = 1 << 2, -}; - -enum csv_token_type -{ - // first 255 tokens for ascii characters - TOKEN_UNDEFINED = 255, - TOKEN_IDENTIFIER, - TOKEN_VALUE, -}; - -typedef struct csv_token csv_token; -struct csv_token -{ - string8 lexeme; - csv_token *next_token; - enum csv_token_type type; - enum csv_token_flags flags; -}; - -// NOTE(nasr): i dont think im going to use this. -typedef struct csv_row csv_row; -struct csv_row -{ - // array of size col_count, points into mmap buffer - string8 *fields; - s32 count; -}; - -#if 0 -typedef struct csv_lntity csv_entity; -struct csv_entity -{ - //- not needed because we use key header mapping i think -}; -#endif - -typedef struct csv_header csv_header; -struct csv_header -{ - string8 payload; - csv_header *next_header; -}; - -typedef struct csv_table csv_table; -struct csv_table -{ - // first row, col names - // all data rows - csv_header *header; - s32 row_count; - s32 header_count; - b32 finding_headers; -}; - - -typedef struct csv_token_list csv_token_list; -struct csv_token_list -{ - csv_token *start_token; - csv_token *end_token; -}; - -read_only global_variable -csv_token nil_csv_token= -{ - .lexeme = {.data = NULL, .size = 0}, - .type = 0, - .flags = 0, - .next_token = &nil_csv_token, -}; - -read_only global_variable -csv_header nil_csv_header = -{ - .payload = {.data = NULL, .size = 0}, - .next_header = &nil_csv_header, -}; - -read_only global_variable -csv_token_list nil_csv_token_list = -{ - .start_token = &nil_csv_token, - .end_token = &nil_csv_token, -}; - -read_only global_variable -csv_row nil_csv_row = -{ - .fields = &nil_string, - .count = 0, -}; - -read_only global_variable -csv_table nil_csv_table = -{ - .header = &nil_csv_header, - .row_count = 0, -}; - -#endif /* ENGINE_LEXER_H */ - -internal b32 -is_nil_csv_token(csv_token *token) -{ - return ((token == NULL) || (token == &nil_csv_token)); -} - -// TODO(nasr): segfaulting because end_token not allocated -internal void -csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token) -{ - source_token_list->end_token->next_token = source_token; - source_token_list->end_token = source_token; -} - -//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother -internal void -csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source) -{ - if(is_nil_csv_token(source->start_token)) return; - - csv_token *source_ct = source->start_token; - csv_token *destination_et = destination->end_token; - - // walk source and stitch each node onto destination's tail - for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token) - { - destination_et->next_token = source_ct; - destination_et = source_ct; - } - - // destination_et now points at the last real source node (not the nil sentinel) - destination->end_token = destination_et; -} - -#if 0 -internal csv_token_list * -parse_csv_row(string8 row_buffer) -{ - // csv_token_list * - -} -#endif - - -// the lexer acts as a table builder from a csv file -// and parsing indivudal rows and columns -// the next step would be building a the b-tree -internal csv_token * -tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list) -{ - unused(token_list); - - if(buffer.size == 0) return NULL; - - // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit - // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? - // forgot what the solution was - // TODO(nasr): check what the problem here was - - // string size tracking across the loop not inside it - s32 start = 0; - - for(s32 index = 0; buffer.data[index] != '\0'; ++index) - { - u8 point = buffer.data[index]; - -#if 0 - if(is_whitespace(point)) - { - warn("csv file is invalid, detected whitespace"); - return NULL; - } -#endif - - if(point == ',') - { - // emit a token for the field that ended before this comma - csv_token *token = PushStructZero(arena, csv_token); - - assert_msg(token != NULL, "did the push struct fail??"); - assert_msg(arena->current_position < arena->capacity, "no more arena size"); - - token->lexeme = StringCast(&buffer.data[start], index - start); - token->type = TOKEN_VALUE; - token->next_token = &nil_csv_token; - csv_token_list_append_token(token_list, token); - - start = index + 1; - - if(table->finding_headers) - { - table->header_count++; - } - } - else if(point == '\n') - { - // emit a token for the field that ended at this newline - csv_token *token = PushStructZero(arena, csv_token); - token->lexeme = StringCast(&buffer.data[start], index - start); - token->type = TOKEN_VALUE; - token->flags |= FL; - token->next_token = &nil_csv_token; - - assert_msg(token_list, "token list invalid"); - assert_msg(token, "you're tring to append an invalid token"); - - csv_token_list_append_token(token_list, token); - - start = index + 1; - - if(table->finding_headers) - { - { - //- map new header token list to table headers - } - table->finding_headers = FALSE; - } - - table->row_count++; - } - } - - // NOTE(nasr): return the first token the caller can walk the list from token_list - return token_list->start_token; -} - -//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future -internal btree * -parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table) -{ - btree *tree = PushStructZero(arena, btree); - - s32 col_index = 0; - s32 row_index = 0; - - // iterate over the token list while the token is not nil - for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token) - { - { - //- are we parsing the first line tokens? - //- if so, do something :)) - if(ct->flags & FL) - { - // NOTE(nasr): FL marks end-of-line; advance row, reset col - row_index++; - col_index = 0; - - // TODO(nasr): replace with nil header check function - // NOTE(nasr): == nil means header hasn't been set yet - if(table->header == &nil_csv_header || table->header == NULL) - { -#if 0 - // - no this should happen in the tokenization - table->headers->next = -#endif - } - else - { - - } - - // FL tokens are structural, no value to index - continue; - } - } - - // skip non-value tokens, only index actual cell values - if (ct->type != TOKEN_VALUE) - { - col_index++; - continue; - } - - // NOTE(nasr): payload is the cten itself so the caller can reach - // row/col metadata without us having to copy it - key k = { - .header_index = col_index, - .row_index = row_index, - }; - - btree_insert(arena, tree, k, (void *)ct); - - col_index++; - } - - return tree; -} diff --git a/source/tb_db.c b/source/tb_db.c deleted file mode 100644 index cfa26b1..0000000 --- a/source/tb_db.c +++ /dev/null @@ -1,198 +0,0 @@ -#define BTREE_IMPLEMENTATION -#define BASE_UNITY -#include "base/base_include.h" - -internal b32 -is_alpha(u8 point) -{ - return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); -} - -internal b32 -is_digit(u8 point) -{ - return (point >= '0' && point <= '9'); -} - -internal b32 -is_alpha_num(u8 point) -{ - return (is_alpha(point) || is_digit(point)); -} - -internal b32 -is_whitespace(u8 point) -{ - return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); -} - -internal b32 -is_delimiter(u8 point) -{ - return (point == ','); -} - -#include "btree_impl.h" -#include "csv_decoder.h" - -typedef struct query_token query_token; -struct query_token -{ - string8 lexeme; - query_token *next; -}; - -typedef struct query_token_list query_token_list; -struct query_token_list -{ - query_token *start_token; - query_token *current_token; -}; - -read_only global_variable -query_token nil_query_token = -{ - .lexeme = {.data = NULL, .size = 0}, - .next = &nil_query_token -}; - - -read_only global_variable -query_token_list nil_query_token_list = -{ - .start_token = &nil_query_token, - .current_token = &nil_query_token, -}; - -internal b32 -is_nil_query_token(query_token *token) -{ - return (token == &nil_query_token) || (token == NULL); -} - -internal b32 -is_nil_query_token_list(query_token *token) -{ - return (token == &nil_query_token) || (token == NULL); -} - -// takes on line of the repl input -// return a reference to the passed list -internal query_token_list * -query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list) -{ - b32 initialized = 0; - unused(initialized); - - for (u64 index = 0; index < buffer->size; ++index) - { - u8 codepoint = buffer->data[index]; - - if(codepoint == '\n' || codepoint == '\r') break; - - s32 start = 0; - s32 end = 0; - - if(is_whitespace(codepoint)) end = index; - - // save the token - // TODO(nasr): work on the string macros cuz no work - { - query_token *new_token = PushStruct(arena, query_token); - - //- initialize list - { - if(is_nil_query_token(list->start_token)) - { - list->start_token = new_token; - list->current_token = new_token; - } - else - { - //- all we need to do - we dont track parents or what ever. this is a token stream not a tree - list->current_token->next = new_token; - } - } - - s32 new_token_size = end - start; - - new_token->lexeme = PushString(arena, new_token_size); - new_token->lexeme.data = &buffer->data[index]; - new_token->lexeme.size = new_token_size; - - list->current_token->next = new_token; - - start = index + 1; - } - } - - return list; -} - -int main(int count, char **value) -{ -#if 1 - unused(nil_query_token_list); -#endif - - if(count < 2) value[1] = "./test/data.csv"; - - local_persist b32 running = 1; - - mem_arena *global_arena = arena_create(GiB(1)); - - string8 buffer = load_file(global_arena, value[1]); - - // NOTE(nasr): the use of tables is required for tracking headers etc. - // i think we can optimize this away in the future but for now its fine - csv_table *table = PushStructZero(global_arena, csv_table); - csv_token_list *token_list = PushStructZero(global_arena, csv_token_list); - - token_list->start_token = &nil_csv_token; - token_list->end_token = &nil_csv_token; - - csv_token *tokens = tokenize_csv(buffer, global_arena, table, token_list); - assert_msg(tokens != NULL, "tokens are null"); - - // NOTE(nasr): token_list is now populated — pass it directly, not a fresh empty list - btree *bt = parse_csv(global_arena, token_list, table); - - print("\nDatabase Engine\n"); - - for(;;) - { - if(running) - { - u8 *lbuf = PushArray(global_arena, u8, 256); - s32 err = os_read(STDIN_FD, lbuf, 256); - - if(err < 0) - { - print("error reading from stdin"); - } - - // TODO(nasr): extract this later in the future and make a string copy function/macro - // @params (s32 lbuf_size, string8 lbuf_stringified) - // NOTE(nasr): use err (bytes read) not sizeof(lbuf*) — sizeof a pointer is always 8 - s32 lbuf_size = err; - string8 lbuf_stringified = PushString(global_arena, lbuf_size); - { - memcpy(lbuf_stringified.data, lbuf, lbuf_size); - lbuf_stringified.size = lbuf_size; - } - - query_token_list *qtl = PushStructZero(global_arena, query_token_list); - - - query_tokenizer(global_arena, &lbuf_stringified, qtl); - - // TODO(nasr): dispatch qtl against bt here - - unused(bt); - - sleep(1); - } - } - - return 0; -} diff --git a/source/tb_db/btree_impl.h b/source/tb_db/btree_impl.h new file mode 100644 index 0000000..b7015bd --- /dev/null +++ b/source/tb_db/btree_impl.h @@ -0,0 +1,209 @@ +/** + * >> api notes + * >> by Abdellah El Morabit + * + * - creation of a btree happens when doing an insertion on an + * empty btree where the root is null or nil + * + **/ + +/* single header b-tree implementation by Abdellah El Morabit */ +#ifndef BTREE_H +#define BTREE_H + +// maximum height of the tree the lower the lower the lower amount +// of disk reads which translates into faster? +#if 0 +global_variable read_only s16 btree_ORDER = 4; +#endif +#define BTREE_ORDER 4 + +//- NOTE(nasr): defining a key to improve sorting +// i think saying that a key is a combination of the column + row is a good way of appraoching this +typedef struct key key; +struct key +{ + s32 header_index; //- col number + s32 row_index; +}; + +typedef struct btree_node btree_node; +struct btree_node +{ + // store the key values of the sub nodes? if they are leaves? + key keys[BTREE_ORDER - 1]; + // TODO(nasr): replace with something more generic? + // NOTE(nasr): cons of void * -> no type safety + // is there a way to still have some sort of that? size not variable + void *payload_per_key[BTREE_ORDER - 1]; + btree_node *parent; + // handle to store children faster than linked list + // because now we can iterate over the nodes instead of having cache misses + // when jumping through linked nodes + btree_node *children[BTREE_ORDER]; + // this shouldn't change things but could be a performance improvement on a bigger scale?? + s32 *max_children; + // NOTE(nasr): reference count ::: check how many leaves are using this node + // also not needed for now because we don't free individual node because of arena allocator + // s32 *refc; + s32 key_count; + b32 leaf; + + + // NOTE(nasr): do we hold the reference to the arena? or do we pass is it as a reference? + // this could solve memory location issues? +}; + + +typedef struct btree btree; +struct btree +{ + // NOTE(nasr): make sure this always stays in memory + // so that initial fetch never requires a disk read + btree_node *root; +}; + + + +#ifdef BTREE_IMPLEMENTATION +/////////////////////////////////////////////////////////////////////////////// +//- btree.c + + +internal b32 +is_nil_btree_node(btree_node *node) +{ + // NOTE(nasr): looks like a useless function but could be usefull when we implement a nil_btree_node + return (node == NULL); +} + +// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? + +internal b8 +key_compare(key destination_key, key source_key) +{ + s32 source_index = source_key.header_index + source_key.row_index; + s32 destination_index = destination_key.header_index + destination_key.row_index; + + if(destination_index > source_index) return 1; + else if(destination_index < source_index) return -1; + else return 0; +} + +// NOTE(nasr): @return the index of the found element +//- @param: start node could be the root node for all we care. but this enables optimizations in the future if we need them +internal s32 +btree_find_ipos(key k, btree_node *start_node) +{ + s32 index = 0; + // scan right while the node's key at [index] is strictly less than k + while (index < start_node->key_count && key_compare(start_node->keys[index], k) < 0) + { + ++index; + } + return index; +} + +// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory +internal void * +btree_search(btree_node *node, key key) +{ + s32 index = btree_find_ipos(key, node); + + if (index < node->key_count && key_compare(node->keys[index], key) == 0) + { + return node->payload_per_key[index]; + } + if (node->leaf) + { + return NULL; + } + return btree_search(node->children[index], key); +} + +// TODO(nasr): split node when key_count == btree_ORDER - 1 (node is full) +// -- each node has max btree_ORDER - 1 keys +// +internal void +btree_insert(mem_arena *arena, btree *tree, key key, void *payload) +{ + if(is_nil_btree_node(tree->root)) + { + tree->root = PushStructZero(arena, btree_node); + tree->root->leaf = 1; + tree->root->key_count = 0; + } + + btree_node *current_node = tree->root; + + if (current_node->leaf) + { + // find the insertion position and shift keys right to make room + //- after: but what is this. we are expecting a new key.but we are looking for + //- its position in the tree is useless because it's non existent + + s32 i = btree_find_ipos(key, current_node); + + for (s32 j = current_node->key_count; j > i; --j) + { + current_node->keys[j] = current_node->keys[j - 1]; + current_node->payload_per_key[j] = current_node->payload_per_key[j - 1]; + } + + current_node->keys[i] = key; + current_node->payload_per_key[i] = payload; + + if(current_node->key_count < (BTREE_ORDER - 1)) + { + current_node->key_count += 1; + } + else { + // TODO(nasr): creating a new branch / tree? + // make a seperate function for this + // NOTE(nasr): only split through the parent when one actually exists; + // root splits need to be handled separately (promote mid key, create two children) + if(current_node->parent != NULL) + { + s32 split_pos = current_node->parent->key_count / 2 - 1; + s32 updated_keycount_p = 0; + for(s32 index = 0; index <= split_pos; ++index) + { + ++updated_keycount_p; + } + current_node->parent->key_count = updated_keycount_p; + } + } + } + // TODO(nasr): internal node case walk down then split on the way back up +} + + +internal void +btree_write(btree *bt) +{ + unused(bt); +#if 0 + temp_arena *temp_arena = temp_arena_begin(arena); + btree_node *root = bt->root; + + for(btree_node *bt = PushStruct(arena); root->next; next = root;) + { + + } + + temp_arena_end(temp); +#endif +} + +// - load the entire db into memory. +#if 0 +internal void +btree_load(mem_arena *arena) +{ + + +} +#endif + +#endif /* BTREE_IMPLEMENTATION */ +#endif /* BTREE_H */ diff --git a/source/tb_db/csv_decoder.h b/source/tb_db/csv_decoder.h new file mode 100644 index 0000000..3d09dc6 --- /dev/null +++ b/source/tb_db/csv_decoder.h @@ -0,0 +1,294 @@ +#ifndef ENGINE_LEXER_H +#define ENGINE_LEXER_H + +enum csv_token_flags +{ + FL = 1 << 2, +}; + +enum csv_token_type +{ + // first 255 tokens for ascii characters + TOKEN_UNDEFINED = 255, + TOKEN_IDENTIFIER, + TOKEN_VALUE, +}; + +typedef struct csv_token csv_token; +struct csv_token +{ + string8 lexeme; + csv_token *next_token; + enum csv_token_type type; + enum csv_token_flags flags; +}; + +// NOTE(nasr): i dont think im going to use this. +typedef struct csv_row csv_row; +struct csv_row +{ + // array of size col_count, points into mmap buffer + string8 *fields; + s32 count; +}; + +#if 0 +typedef struct csv_lntity csv_entity; +struct csv_entity +{ + //- not needed because we use key header mapping i think +}; +#endif + +typedef struct csv_header csv_header; +struct csv_header +{ + string8 payload; + csv_header *next_header; +}; + +typedef struct csv_table csv_table; +struct csv_table +{ + // first row, col names + // all data rows + csv_header *header; + s32 row_count; + s32 header_count; + b32 finding_headers; +}; + + +typedef struct csv_token_list csv_token_list; +struct csv_token_list +{ + csv_token *start_token; + csv_token *end_token; +}; + +read_only global_variable +csv_token nil_csv_token= +{ + .lexeme = {.data = NULL, .size = 0}, + .type = 0, + .flags = 0, + .next_token = &nil_csv_token, +}; + +read_only global_variable +csv_header nil_csv_header = +{ + .payload = {.data = NULL, .size = 0}, + .next_header = &nil_csv_header, +}; + +read_only global_variable +csv_token_list nil_csv_token_list = +{ + .start_token = &nil_csv_token, + .end_token = &nil_csv_token, +}; + +read_only global_variable +csv_row nil_csv_row = +{ + .fields = &nil_string, + .count = 0, +}; + +read_only global_variable +csv_table nil_csv_table = +{ + .header = &nil_csv_header, + .row_count = 0, +}; + +#endif /* ENGINE_LEXER_H */ + +internal b32 +is_nil_csv_token(csv_token *token) +{ + return ((token == NULL) || (token == &nil_csv_token)); +} + +// TODO(nasr): segfaulting because end_token not allocated +internal void +csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token) +{ + source_token_list->end_token->next_token = source_token; + source_token_list->end_token = source_token; +} + +//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother +internal void +csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source) +{ + if(is_nil_csv_token(source->start_token)) return; + + csv_token *source_ct = source->start_token; + csv_token *destination_et = destination->end_token; + + // walk source and stitch each node onto destination's tail + for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token) + { + destination_et->next_token = source_ct; + destination_et = source_ct; + } + + // destination_et now points at the last real source node (not the nil sentinel) + destination->end_token = destination_et; +} + +#if 0 +internal csv_token_list * +parse_csv_row(string8 row_buffer) +{ + // csv_token_list * + +} +#endif + + +// the lexer acts as a table builder from a csv file +// and parsing indivudal rows and columns +// the next step would be building a the b-tree +internal csv_token * +tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list) +{ + unused(token_list); + + if(buffer.size == 0) return NULL; + + // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit + // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? + // forgot what the solution was + // TODO(nasr): check what the problem here was + + // string size tracking across the loop not inside it + s32 start = 0; + + for(s32 index = 0; buffer.data[index] != '\0'; ++index) + { + u8 point = buffer.data[index]; + +#if 0 + if(is_whitespace(point)) + { + warn("csv file is invalid, detected whitespace"); + return NULL; + } +#endif + + if(point == ',') + { + // emit a token for the field that ended before this comma + csv_token *token = PushStructZero(arena, csv_token); + + assert_msg(token != NULL, "did the push struct fail??"); + assert_msg(arena->current_position < arena->capacity, "no more arena size"); + + token->lexeme = StringCast(&buffer.data[start], index - start); + token->type = TOKEN_VALUE; + token->next_token = &nil_csv_token; + csv_token_list_append_token(token_list, token); + + start = index + 1; + + if(table->finding_headers) + { + table->header_count++; + } + } + else if(point == '\n') + { + // emit a token for the field that ended at this newline + csv_token *token = PushStructZero(arena, csv_token); + token->lexeme = StringCast(&buffer.data[start], index - start); + token->type = TOKEN_VALUE; + token->flags |= FL; + token->next_token = &nil_csv_token; + + assert_msg(token_list, "token list invalid"); + assert_msg(token, "you're tring to append an invalid token"); + + csv_token_list_append_token(token_list, token); + + start = index + 1; + + if(table->finding_headers) + { + { + //- map new header token list to table headers + } + table->finding_headers = FALSE; + } + + table->row_count++; + } + } + + // NOTE(nasr): return the first token the caller can walk the list from token_list + return token_list->start_token; +} + +//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future +internal btree * +parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table) +{ + btree *tree = PushStructZero(arena, btree); + + s32 col_index = 0; + s32 row_index = 0; + + // iterate over the token list while the token is not nil + for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token) + { + { + //- are we parsing the first line tokens? + //- if so, do something :)) + if(ct->flags & FL) + { + // NOTE(nasr): FL marks end-of-line; advance row, reset col + row_index++; + col_index = 0; + + // TODO(nasr): replace with nil header check function + // NOTE(nasr): == nil means header hasn't been set yet + if(table->header == &nil_csv_header || table->header == NULL) + { +#if 0 + // - no this should happen in the tokenization + table->headers->next = +#endif + } + else + { + + } + + // FL tokens are structural, no value to index + continue; + } + } + + // skip non-value tokens, only index actual cell values + if (ct->type != TOKEN_VALUE) + { + col_index++; + continue; + } + + // NOTE(nasr): payload is the cten itself so the caller can reach + // row/col metadata without us having to copy it + key k = { + .header_index = col_index, + .row_index = row_index, + }; + + btree_insert(arena, tree, k, (void *)ct); + + col_index++; + } + + return tree; +} diff --git a/source/tb_db/tb_db.c b/source/tb_db/tb_db.c new file mode 100644 index 0000000..4ae247d --- /dev/null +++ b/source/tb_db/tb_db.c @@ -0,0 +1,198 @@ +#define BTREE_IMPLEMENTATION +#define BASE_UNITY +#include "../base/base_include.h" + +internal b32 +is_alpha(u8 point) +{ + return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); +} + +internal b32 +is_digit(u8 point) +{ + return (point >= '0' && point <= '9'); +} + +internal b32 +is_alpha_num(u8 point) +{ + return (is_alpha(point) || is_digit(point)); +} + +internal b32 +is_whitespace(u8 point) +{ + return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); +} + +internal b32 +is_delimiter(u8 point) +{ + return (point == ','); +} + +#include "btree_impl.h" +#include "csv_decoder.h" + +typedef struct query_token query_token; +struct query_token +{ + string8 lexeme; + query_token *next; +}; + +typedef struct query_token_list query_token_list; +struct query_token_list +{ + query_token *start_token; + query_token *current_token; +}; + +read_only global_variable +query_token nil_query_token = +{ + .lexeme = {.data = NULL, .size = 0}, + .next = &nil_query_token +}; + + +read_only global_variable +query_token_list nil_query_token_list = +{ + .start_token = &nil_query_token, + .current_token = &nil_query_token, +}; + +internal b32 +is_nil_query_token(query_token *token) +{ + return (token == &nil_query_token) || (token == NULL); +} + +internal b32 +is_nil_query_token_list(query_token *token) +{ + return (token == &nil_query_token) || (token == NULL); +} + +// takes on line of the repl input +// return a reference to the passed list +internal query_token_list * +query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list) +{ + b32 initialized = 0; + unused(initialized); + + for (u64 index = 0; index < buffer->size; ++index) + { + u8 codepoint = buffer->data[index]; + + if(codepoint == '\n' || codepoint == '\r') break; + + s32 start = 0; + s32 end = 0; + + if(is_whitespace(codepoint)) end = index; + + // save the token + // TODO(nasr): work on the string macros cuz no work + { + query_token *new_token = PushStruct(arena, query_token); + + //- initialize list + { + if(is_nil_query_token(list->start_token)) + { + list->start_token = new_token; + list->current_token = new_token; + } + else + { + //- all we need to do - we dont track parents or what ever. this is a token stream not a tree + list->current_token->next = new_token; + } + } + + s32 new_token_size = end - start; + + new_token->lexeme = PushString(arena, new_token_size); + new_token->lexeme.data = &buffer->data[index]; + new_token->lexeme.size = new_token_size; + + list->current_token->next = new_token; + + start = index + 1; + } + } + + return list; +} + +int main(int count, char **value) +{ +#if 1 + unused(nil_query_token_list); +#endif + + if(count < 2) value[1] = "./test/data.csv"; + + local_persist b32 running = 1; + + mem_arena *global_arena = arena_create(GiB(1)); + + string8 buffer = load_file(global_arena, value[1]); + + // NOTE(nasr): the use of tables is required for tracking headers etc. + // i think we can optimize this away in the future but for now its fine + csv_table *table = PushStructZero(global_arena, csv_table); + csv_token_list *token_list = PushStructZero(global_arena, csv_token_list); + + token_list->start_token = &nil_csv_token; + token_list->end_token = &nil_csv_token; + + csv_token *tokens = tokenize_csv(buffer, global_arena, table, token_list); + assert_msg(tokens != NULL, "tokens are null"); + + // NOTE(nasr): token_list is now populated — pass it directly, not a fresh empty list + btree *bt = parse_csv(global_arena, token_list, table); + + print("\nDatabase Engine\n"); + + for(;;) + { + if(running) + { + u8 *lbuf = PushArray(global_arena, u8, 256); + s32 err = os_read(STDIN_FD, lbuf, 256); + + if(err < 0) + { + print("error reading from stdin"); + } + + // TODO(nasr): extract this later in the future and make a string copy function/macro + // @params (s32 lbuf_size, string8 lbuf_stringified) + // NOTE(nasr): use err (bytes read) not sizeof(lbuf*) — sizeof a pointer is always 8 + s32 lbuf_size = err; + string8 lbuf_stringified = PushString(global_arena, lbuf_size); + { + memcpy(lbuf_stringified.data, lbuf, lbuf_size); + lbuf_stringified.size = lbuf_size; + } + + query_token_list *qtl = PushStructZero(global_arena, query_token_list); + + + query_tokenizer(global_arena, &lbuf_stringified, qtl); + + // TODO(nasr): dispatch qtl against bt here + + unused(bt); + + sleep(1); + } + } + + return 0; +} -- cgit v1.3