From 180ccc84aac07c7bee2b09a6e07f7406908409b9 Mon Sep 17 00:00:00 2001 From: nasr Date: Mon, 16 Mar 2026 19:20:23 +0000 Subject: feature(main): lots of stuff see description 1. increased compile time warnings to help with some optimizations. 2. impelmeented csv lexing helper functions that do stuff on tokenlists like appending and concatenating lists with each other 3. realiszed that btree design in faulty so disabled it and will refactor it in the next approach --- Makefile | 5 +- source/b_tree.h | 124 ---------------------- source/b_tree_impl.h | 165 +++++++++++++++++++++++++++++ source/csv_decoder.h | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++ source/csv_reader.h | 183 -------------------------------- source/engine.c | 210 ------------------------------------- source/tb_db.c | 206 ++++++++++++++++++++++++++++++++++++ 7 files changed, 662 insertions(+), 520 deletions(-) delete mode 100644 source/b_tree.h create mode 100644 source/b_tree_impl.h create mode 100644 source/csv_decoder.h delete mode 100644 source/csv_reader.h delete mode 100644 source/engine.c create mode 100644 source/tb_db.c diff --git a/Makefile b/Makefile index 0224a42..39e9b87 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,7 @@ BIN = build/engine -SRC = source/engine.c +SRC = source/tb_db.c CC = clang -CFLAGS = -Wall -Wextra -Wfloat-equal -Wswitch-default -Wswitch-enum \ - -Wno-unused-parameter -Wno-implicit-fallthrough -Wno-unused-function -g -Werror +CFLAGS = -Wall -Wextra -Wpedantic -Wno-unused-function -g -Werror $(BIN): $(SRC) mkdir -p build diff --git a/source/b_tree.h b/source/b_tree.h deleted file mode 100644 index 0c4b1e1..0000000 --- a/source/b_tree.h +++ /dev/null @@ -1,124 +0,0 @@ -/* single header b-tree implementation by Abdellah El Morabit */ -#ifndef B_TREE_H -#define B_TREE_H - -// maximum height of the tree the lower the lower the lower amount -// of disk reads which translates into faster? -#define B_TREE_ORDER 4 - -typedef struct b_tree_node b_tree_node; -struct b_tree_node -{ - // store the values - string8 keys[B_TREE_ORDER - 1]; - // TODO(nasr): replace with something more generic? - // NOTE(nasr): cons of void * -> no type safety - // is there a way to still have some sort of that? - void *payload_per_key[B_TREE_ORDER - 1]; - b_tree_node *parent; - // handle to store children faster than linked list - // because now we can iterate over the nodes instead of having cache misses - // when jumping through linked nodes - b_tree_node *children[B_TREE_ORDER]; - // this shouldn't change things but could be a performance improvement on a bigger scale?? - s32 *max_children; - // NOTE(nasr): reference count ::: check how many leaves are using this node - // also not needed for now because we don't free individual node because of arena allocator - // s32 *refc; - s32 key_count; - b32 leaf; -}; - -typedef struct b_tree b_tree; -struct b_tree -{ - // NOTE(nasr): make sure this always stays in memory - // so that initial fetch never requires a disk read - b_tree_node *root; -}; - -#ifdef B_TREE_IMPLEMENTATION -/////////////////////////////////////////////////////////////////////////////// -//- b_tree.c - -// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? - -internal b_tree_node * -btree_node_alloc(mem_arena *arena) -{ - b_tree_node *node = PushStructZero(arena, b_tree_node); - node->leaf = 1; - return node; -} - -// NOTE(nasr): @return the index of the found element -internal s32 -btree_node_find_pos(string8 value, b_tree_node *node) -{ - s32 i = 0; - for (; i < node->key_count && string8_cmp(node->keys[i], value) < 0; ++i); - return i; -} - -internal void -b_tree_create(mem_arena *arena, b_tree *tree) -{ - tree->root = btree_node_alloc(arena); - tree->root->leaf = 1; - tree->root->key_count = 0; -} - -// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory -internal void * -b_tree_search(b_tree_node *node, string8 key) -{ - s32 i = btree_node_find_pos(key, node); - - if (i < node->key_count && string8_cmp(node->keys[i], key) == 0) - { - return node->payload_per_key[i]; - } - if (node->leaf) - { - return NULL; - } - return b_tree_search(node->children[i], key); -} - -// TODO(nasr): split node when key_count == B_TREE_ORDER - 1 (node is full) -internal void -b_tree_insert(mem_arena *arena, b_tree *tree, string8 key, void *payload) -{ - b_tree_node *current_node = tree->root; - - if (current_node->leaf) - { - // find the insertion position and shift keys right to make room - s32 i = btree_node_find_pos(key, current_node); - for (s32 j = current_node->key_count; j > i; --j) - { - current_node->keys[j] = current_node->keys[j - 1]; - current_node->payload_per_key[j] = current_node->payload_per_key[j - 1]; - } - current_node->keys[i] = key; - current_node->payload_per_key[i] = payload; - - if(current_node->key_count < (B_TREE_ORDER - 1)) - { - current_node->key_count += 1; - } - else { - // TODO(nasr): creating a new branch / tree? - } - } - // TODO(nasr): internal node case walk down then split on the way back up -} - -internal void -b_tree_write(b_tree *bt) -{ - // TODO(nasr): write the b_tree to disk -} - -#endif /* B_TREE_IMPLEMENTATION */ -#endif /* B_TREE_H */ diff --git a/source/b_tree_impl.h b/source/b_tree_impl.h new file mode 100644 index 0000000..8eb0723 --- /dev/null +++ b/source/b_tree_impl.h @@ -0,0 +1,165 @@ +/* single header b-tree implementation by Abdellah El Morabit */ +#ifndef B_TREE_H +#define B_TREE_H + +// maximum height of the tree the lower the lower the lower amount +// of disk reads which translates into faster? +#if 0 +global_variable read_only s16 B_TREE_ORDER = 4; +#endif +#define B_TREE_ORDER 4 + +//- NOTE(nasr): defining a key to improve sorting +// i think saying that a key is a combination of the column + row is a good way of appraoching this +typedef struct key key; +struct key +{ + string8 header; + s32 row; +}; + +typedef struct b_tree_node b_tree_node; +struct b_tree_node +{ + // store the key values of the sub nodes? if they are leaves? + key keys[B_TREE_ORDER - 1]; + // TODO(nasr): replace with something more generic? + // NOTE(nasr): cons of void * -> no type safety + // is there a way to still have some sort of that? + // size not variable + void *payload_per_key[B_TREE_ORDER - 1]; + b_tree_node *parent; + // handle to store children faster than linked list + // because now we can iterate over the nodes instead of having cache misses + // when jumping through linked nodes + b_tree_node *children[B_TREE_ORDER]; + // this shouldn't change things but could be a performance improvement on a bigger scale?? + s32 *max_children; + // NOTE(nasr): reference count ::: check how many leaves are using this node + // also not needed for now because we don't free individual node because of arena allocator + // s32 *refc; + s32 key_count; + b32 leaf; + + + // NOTE(nasr): do we hold the reference to the arena? or do we pass is it as a reference? + // this could solve memory location issues? +}; + +typedef struct b_tree b_tree; +struct b_tree +{ + // NOTE(nasr): make sure this always stays in memory + // so that initial fetch never requires a disk read + b_tree_node *root; +}; + +#ifdef B_TREE_IMPLEMENTATION +/////////////////////////////////////////////////////////////////////////////// +//- b_tree.c + +// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? + +internal b_tree_node * +btree_node_alloc(mem_arena *arena) +{ + b_tree_node *node = PushStructZero(arena, b_tree_node); + node->leaf = 1; + return node; +} + +// NOTE(nasr): @return the index of the found element +internal s32 +btree_node_find_pos(string8 value, b_tree_node *node) +{ + unused(value); + unused(node); + +#if 0 + s32 i = 0; + for (; i < node->key_count && string8_cmp(node->keys[i], value) < 0; ++i); + return i; +#endif + + return 0; + +} + +internal void +b_tree_create(mem_arena *arena, b_tree *tree) +{ + tree->root = btree_node_alloc(arena); + tree->root->leaf = 1; + tree->root->key_count = 0; +} + + +// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory +internal void * +b_tree_search(b_tree_node *node, string8 key) +{ + unused(node); + unused(key); + +#if 0 + s32 i = btree_node_find_pos(key, node); + + if (i < node->key_count && string8_cmp(node->keys[i], key) == 0) + { + return node->payload_per_key[i]; + } + if (node->leaf) + { + return NULL; + } + return b_tree_search(node->children[i], key); +#endif + + return NULL; +} + + +// TODO(nasr): split node when key_count == B_TREE_ORDER - 1 (node is full) +internal void +b_tree_insert(b_tree *tree, string8 key, void *payload) +{ + unused(tree); + unused(key); + unused(payload); +#if 0 + b_tree_node *current_node = tree->root; + + if (current_node->leaf) + { + // find the insertion position and shift keys right to make room + s32 i = btree_node_find_pos(key, current_node); + for (s32 j = current_node->key_count; j > i; --j) + { + current_node->keys[j] = current_node->keys[j - 1]; + current_node->payload_per_key[j] = current_node->payload_per_key[j - 1]; + } + current_node->keys[i] = key; + current_node->payload_per_key[i] = payload; + + if(current_node->key_count < (B_TREE_ORDER - 1)) + { + current_node->key_count += 1; + } + else { + // TODO(nasr): creating a new branch / tree? + // make a seperate function for this + } + } + // TODO(nasr): internal node case walk down then split on the way back up +#endif +} + +internal void +b_tree_write(b_tree *bt) +{ + // TODO(nasr): write the b_tree to disk + unused(bt); +} + +#endif /* B_TREE_IMPLEMENTATION */ +#endif /* B_TREE_H */ diff --git a/source/csv_decoder.h b/source/csv_decoder.h new file mode 100644 index 0000000..b754ef5 --- /dev/null +++ b/source/csv_decoder.h @@ -0,0 +1,289 @@ +#ifndef ENGINE_LEXER_H +#define ENGINE_LEXER_H + +enum csv_token_flags +{ + FL = 1 << 2, +}; + +enum csv_token_type +{ + // first 255 tokens for ascii characters + TOKEN_UNDEFINED = 255, + TOKEN_IDENTIFIER, + TOKEN_VALUE, +}; + +typedef struct csv_token csv_token; +struct csv_token +{ + string8 lexeme; + csv_token *next_token; + enum csv_token_type type; + enum csv_token_flags flags; +}; + +// NOTE(nasr): i dont think im going to use this. +typedef struct csv_row csv_row; +struct csv_row +{ + // array of size col_count, points into mmap buffer + string8 *fields; + s32 count; +}; + +#if 0 +typedef struct csv_lntity csv_entity; +struct csv_entity +{ + //- not needed because we use key header mapping i think +}; +#endif + +typedef struct csv_header csv_header; +struct csv_header +{ + string8 payload; + csv_header *next_header; +}; + +typedef struct csv_table csv_table; +struct csv_table +{ + // first row, col names + // all data rows + csv_header *header; + s32 row_count; + s32 header_count; +}; + + +typedef struct csv_token_list csv_token_list; +struct csv_token_list +{ + csv_token *start_token; + csv_token *end_token; + +}; + +read_only global_variable +csv_token nil_csv_token= +{ + .lexeme = {.data = NULL, .size =0}, + .type = 0, + .flags = 0, + .next_token = &nil_csv_token, + +}; + +read_only global_variable +csv_header nil_csv_header = +{ + .payload = {.data = NULL, .size = 0}, + .next_header = &nil_csv_header, +}; + +read_only global_variable +csv_token_list nil_csv_token_list = +{ + .start_token = &nil_csv_token, + .end_token = &nil_csv_token, +}; + + +read_only global_variable +csv_row nil_csv_row = +{ + .fields = &nil_string, + .count = 0, +}; + +read_only global_variable +csv_table nil_csv_table = +{ + .header = &nil_csv_header, + .row_count = 0, +}; + +#endif /* ENGINE_LEXER_H */ + +internal b32 +is_nil_csv_token(csv_token *token) +{ + return ((token == NULL) || (token == &nil_csv_token)); +} + +internal void +csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token) +{ + source_token_list->end_token->next_token = source_token; + source_token_list->end_token = source_token; + +} + +//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother +internal void +csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source) +{ + + csv_token *source_ct = source->start_token; + csv_token *destination_end_ct = destination->end_token; + + for(;!is_nil_csv_token(source_ct); source_ct = source_ct->next_token) + { + destination_end_ct->next_token = source_ct; + } + + destination->end_token = source_ct; +} + +#if 0 +internal csv_token_list * +parse_csv_row(string8 row_buffer) +{ + // csv_token_list * + +} +#endif + + +// the lexer acts as a table builder from a csv file +// and parsing indivudal rows and columns +// the next step would be building a the b-tree +internal csv_token * +tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list) +{ + + unused(token_list); + b32 finding_headers = TRUE; + + if(buffer.size < 0) return NULL; + + csv_token *tok = PushStruct(arena, csv_token); + + // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit + // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? + // forgot what the solution was + // TODO(nasr): check what the problem here was + for(s32 index = 0; buffer.data[index] != '\0'; ++index) + { + u8 point = buffer.data[index]; + + s32 start = 0; + s32 end = 0; + + if(is_whitespace(point)) + { + warn("csv file is invalid, detected whitespace"); + return NULL; + } + + + if(point == '\n') + { + if(finding_headers) + { +#if 0 + string8 headers_buffer = {.data = &buffer.data[start], .size = end - start}; +#endif + finding_headers = FALSE; + + { + //- map new header token list to table headers + } + } +#if 0 + else + { + + } +#endif + + + table->row_count++; + } + else if(point == ',') + { + if (finding_headers) + { + table->header_count++; + } + } + + switch(point) + { + case('\n'): + { + tok->flags |= FL; + break; + } + + case(','): + { + end = index - 1; + start = index + 1; + break; + } + default: + { + break; + } + } + + tok->lexeme = StringCast(&buffer.data[start], end - start); + tok->next_token = tok; + } + + return tok; +} + +//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future +internal b_tree * +parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table) +{ + b_tree *tree = PushStructZero(arena, b_tree); + b_tree_create(arena, tree); + + // iterate over the token list while the token is not nil + for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token) + { + + //- TODO(nasr): check initizalization or something tomorrow + { + //- are we parsing the first line tokens? + //- if so, do something :)) + if(ct->flags & FL) + { + // TODO(nasr): replace with nil header check function + if(table->header != &nil_csv_header || table->header == NULL) + { +#if 0 + // - no this should happen in the tokenization + table->headers->next = +#endif + } + else + { + + } + + } + } + + // TODO(nasr): fix this logic tomorrow + csv_token *ct = PushStruct(arena, csv_token); + // skip structural ctens, only index values + if (ct->type != TOKEN_VALUE) + { + continue; + } + + // NOTE(nasr): payload is the cten itself so the caller can reach + // row/col metadata without us having to copy it + // NOTE(nasr): heh why do we void cast again? + b_tree_insert(tree, ct->lexeme, (void *)ct); + } + + return tree; +} + diff --git a/source/csv_reader.h b/source/csv_reader.h deleted file mode 100644 index f5205bf..0000000 --- a/source/csv_reader.h +++ /dev/null @@ -1,183 +0,0 @@ -#ifndef ENGINE_LEXER_H -#define ENGINE_LEXER_H - -typedef enum csv_token_flags csv_token_flags; -enum csv_token_flags -{ - START_FL = 1 << 1, - END_FL = 1 << 2, -}; - -typedef enum csv_token_type csv_token_type; -enum csv_token_type -{ - // first 255 tokens for ascii characters - TOKEN_UNDEFINED = 255, - TOKEN_IDENTIFIER, - TOKEN_VALUE, -}; - -typedef struct csv_token csv_token; -struct csv_token -{ - string8 lexeme; - csv_token_type type; - csv_token_flags flags; - csv_token *next; -}; - -// NOTE(nasr): i dont think im going to use this. -typedef struct csv_row csv_row; -struct csv_row -{ - // array of size col_count, points into mmap buffer - string8 *fields; - s32 count; -}; - -typedef struct csv_table csv_table; -struct csv_table -{ - // first row, col names - // all data rows - string8 *headers; - csv_row *rows; - s32 col_count; - s32 row_count; -}; - - -typedef struct csv_token_list csv_token_list; -struct csv_token_list -{ - csv_token *start_token; - csv_token *end_token; - -}; - -read_only global_variable -csv_token nil_csv_token= -{ - .lexeme = {.data = NULL, .size =0}, - .type = (csv_token_type)0, - .flags = 0, - .next = &nil_csv_token, - -}; - -read_only global_variable -csv_token_list nil_csv_token_list = -{ - .start_token = &nil_csv_token, - .end_token = &nil_csv_token, -}; - - -read_only global_variable -csv_row nil_csv_row = -{ - .fields = &nil_string, - .count = 0, -}; - -read_only global_variable -csv_table nil_csv_table = -{ - .headers = &nil_string, - .rows = &nil_csv_row, - .col_count = 0, - .row_count = 0, -}; - -#endif /* ENGINE_LEXER_H */ - -// the lexer acts as a table builder from a csv file -// and parsing indivudal rows and columns -// the next step would be building a the b-tree -internal csv_token * -tokenize_csv(string8 buffer, mem_arena *arena) -{ - b32 FL = TRUE; - - if(buffer.size < 0) return NULL; - - csv_token *tok = PushStruct(arena, csv_token); - - // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit - // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? - for(s32 index = 0; buffer.data[index] != '\0'; ++index) - { - u8 point = buffer.data[index]; - - s32 start = 0; - s32 end = 0; - - if(is_whitespace(point)) - { - warn("csv file is invalid, detected whitespace"); - return NULL; - } - - switch(point) - { - case('\n'): - { - if(FL) tok->flags |= END_FL; - break; - } - - case(','): - { - end = index - 1; - start = index + 1; - break; - } - default: - { - break; - } - } - - tok->lexeme = StringCast(&buffer.data[start], end - start); - tok->next = tok; - } - - return tok; -} - -internal void -read_csv(string8 buffer) -{ - // printf("\nsize:%lu\ndata %s\n", buffer.size, buffer.data); - -} - -internal b_tree * -parse_csv(mem_arena *arena, csv_token_list *ctl) -{ - b_tree *tree = PushStructZero(arena, b_tree); - b_tree_create(arena, tree); - - //- TODO(nasr): check initizalization or something tomorrow - { - - } - // TODO(nasr): fix this logic tomorrow - csv_token *ct = PushStruct(arena, csv_token); - - for (;ct != NULL; ct = ct->next) - { - // skip structural ctens, only index values - if (ct->type != TOKEN_VALUE) - { - continue; - } - - // NOTE(nasr): payload is the cten itself so the caller can reach - // row/col metadata without us having to copy it - // NOTE(nasr): heh why do we void cast again? - b_tree_insert(arena, tree, ct->lexeme, (void *)ct); - } - - return tree; -} diff --git a/source/engine.c b/source/engine.c deleted file mode 100644 index 106f113..0000000 --- a/source/engine.c +++ /dev/null @@ -1,210 +0,0 @@ - - - -#define B_TREE_IMPLEMENTATION -#define BASE_UNITY -#include "base/base_include.h" - -internal b32 -is_alpha(u8 point) -{ - return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); -} - -internal b32 -is_digit(u8 point) -{ - return (point >= '0' && point <= '9'); -} - -internal b32 -is_alpha_num(u8 point) -{ - return (is_alpha(point) || is_digit(point)); -} - -internal b32 -is_whitespace(u8 point) -{ - return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); -} - -internal b32 -is_delimiter(u8 point) -{ - return (point == ','); - -} - -#include "b_tree.h" -#include "csv_reader.h" - -typedef struct query_token query_token; -struct query_token -{ - string8 lexeme; - query_token *next; -}; - -typedef struct query_token_list query_token_list; -struct query_token_list -{ - query_token *start_token; - query_token *current_token; -}; - -read_only global_variable -query_token nil_query_token = -{ - .lexeme = {.data = NULL, .size = 0}, - .next = &nil_query_token -}; - - -read_only global_variable -query_token_list nil_query_token_list = -{ - .start_token = &nil_query_token, - .current_token = &nil_query_token, -}; - -internal b32 -is_nil_query_token(query_token *token) -{ - return (token == &nil_query_token) || (token == NULL); -} - -internal b32 -is_nil_query_token_list(query_token *token) -{ - return (token == &nil_query_token) || (token == NULL); -} - - -// takes on line of the repl input -// return a reference to the passed list -internal query_token_list * -query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list) -{ - b32 initialized = 0; - unused(initialized); - - for (u64 index = 0; index < buffer->size; ++index) - { - u8 codepoint = buffer->data[index]; - - if(codepoint == '\n' || codepoint == '\r') break; - - s32 start = 0; - s32 end = 0; - - if(is_whitespace(codepoint)) - { - end = index; - } - - // save the token - // TODO(nasr): work on the string macros cuz no work - { - query_token *new_token = PushStruct(arena, query_token); - - //- initialize list - { - if(is_nil_query_token(list->start_token)) - { - list->start_token = new_token; - list->current_token = new_token; - } - else - { - //- all we need to do - we dont track parents or what ever. this is a token stream not a tree - list->current_token->next = new_token; - } - } - - s32 new_token_size = end - start; - - new_token->lexeme = PushString(arena, new_token_size); - new_token->lexeme.data = &buffer->data[index]; - new_token->lexeme.size = new_token_size; - - list->current_token->next = new_token; - - start = index + 1; - } - } - - return list; -} - -int main(int count, char **value) -{ - -#if 1 - unused(nil_query_token_list); -#endif - - if(count < 2) value[1] = "./test/data.csv"; - - local_persist b32 running = 1; - - mem_arena *global_arena = arena_create(MiB(30)); - - // NOTE(nasr): see note down below - // csv_table *global_table = PushStruct(global_arena, csv_table); - - string8 buffer = load_file(global_arena, value[1]); - - print("\nDatabase Engine\n"); - - for(;;) - { - if (running) - { - { - u8 *lbuf = PushArray(global_arena, u8, 256); - s32 err = os_read(STDIN_FD, lbuf, 256); - - if(err < 0) - { - print("error reading from stdin"); - } - - // TODO(nasr): extract this later in the future and make a string copy function/macro - // @params (s32 lbuf_size , string8 lbuf_stringified) - s32 lbuf_size = sizeof(lbuf) - 1; - string8 lbuf_stringified = PushString(global_arena, lbuf_size); - { - memcpy(lbuf_stringified.data, lbuf, lbuf_size); - lbuf_stringified.size = sizeof(lbuf) - 1; - } - - query_token_list *qtl = PushStruct(global_arena, query_token_list); - - query_tokenizer(global_arena, &lbuf_stringified, qtl); - } - - { - read_csv(buffer); - - csv_token *tokens = tokenize_csv(buffer, global_arena); - - assert_msg(tokens != NULL, "Tokens are NULL."); - - csv_token_list *ctl = PushStruct(global_arena, csv_token_list); - b_tree *bt = parse_csv(global_arena, ctl); - - b_tree_write(bt); - } - - - // NOTE(nasr): not sure on how to approach the b-tree and the table format thing - // we kind of want our table format i think? but i wouldnt be sure about the use case - // so we stick to the regular b_tree for now. commenting out the tables. - - sleep(1); - } - } - - return 0; -} diff --git a/source/tb_db.c b/source/tb_db.c new file mode 100644 index 0000000..b992111 --- /dev/null +++ b/source/tb_db.c @@ -0,0 +1,206 @@ +#define B_TREE_IMPLEMENTATION +#define BASE_UNITY +#include "base/base_include.h" + +internal b32 +is_alpha(u8 point) +{ + return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); +} + +internal b32 +is_digit(u8 point) +{ + return (point >= '0' && point <= '9'); +} + +internal b32 +is_alpha_num(u8 point) +{ + return (is_alpha(point) || is_digit(point)); +} + +internal b32 +is_whitespace(u8 point) +{ + return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); +} + +internal b32 +is_delimiter(u8 point) +{ + return (point == ','); +} + +#include "b_tree_impl.h" +#include "csv_decoder.h" + +typedef struct query_token query_token; +struct query_token +{ + string8 lexeme; + query_token *next; +}; + +typedef struct query_token_list query_token_list; +struct query_token_list +{ + query_token *start_token; + query_token *current_token; +}; + +read_only global_variable +query_token nil_query_token = +{ + .lexeme = {.data = NULL, .size = 0}, + .next = &nil_query_token +}; + + +read_only global_variable +query_token_list nil_query_token_list = +{ + .start_token = &nil_query_token, + .current_token = &nil_query_token, +}; + +internal b32 +is_nil_query_token(query_token *token) +{ + return (token == &nil_query_token) || (token == NULL); +} + +internal b32 +is_nil_query_token_list(query_token *token) +{ + return (token == &nil_query_token) || (token == NULL); +} + +// takes on line of the repl input +// return a reference to the passed list +internal query_token_list * +query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list) +{ + b32 initialized = 0; + unused(initialized); + + for (u64 index = 0; index < buffer->size; ++index) + { + u8 codepoint = buffer->data[index]; + + if(codepoint == '\n' || codepoint == '\r') break; + + s32 start = 0; + s32 end = 0; + + if(is_whitespace(codepoint)) end = index; + + // save the token + // TODO(nasr): work on the string macros cuz no work + { + query_token *new_token = PushStruct(arena, query_token); + + //- initialize list + { + if(is_nil_query_token(list->start_token)) + { + list->start_token = new_token; + list->current_token = new_token; + } + else + { + //- all we need to do - we dont track parents or what ever. this is a token stream not a tree + list->current_token->next = new_token; + } + } + + s32 new_token_size = end - start; + + new_token->lexeme = PushString(arena, new_token_size); + new_token->lexeme.data = &buffer->data[index]; + new_token->lexeme.size = new_token_size; + + list->current_token->next = new_token; + + start = index + 1; + } + } + + return list; +} + +int main(int count, char **value) +{ + +#if 1 + unused(nil_query_token_list); +#endif + + if(count < 2) value[1] = "./test/data.csv"; + + local_persist b32 running = 1; + + mem_arena *global_arena = arena_create(MiB(30)); + + // NOTE(nasr): see note down below + // csv_table *global_table = PushStruct(global_arena, csv_table); + + string8 buffer = load_file(global_arena, value[1]); + + print("\nDatabase Engine\n"); + + for(;;) + { + if (running) + { + { + u8 *lbuf = PushArray(global_arena, u8, 256); + s32 err = os_read(STDIN_FD, lbuf, 256); + + if(err < 0) + { + print("error reading from stdin"); + } + + // TODO(nasr): extract this later in the future and make a string copy function/macro + // @params (s32 lbuf_size , string8 lbuf_stringified) + s32 lbuf_size = sizeof(lbuf) - 1; + string8 lbuf_stringified = PushString(global_arena, lbuf_size); + { + memcpy(lbuf_stringified.data, lbuf, lbuf_size); + lbuf_stringified.size = sizeof(lbuf) - 1; + } + + query_token_list *qtl = PushStruct(global_arena, query_token_list); + + query_tokenizer(global_arena, &lbuf_stringified, qtl); + } + + { + + // NOTE(nasr): the use of tables is required for tracking headers etc. + // i think we can optimize this away in the future but for now its fine + csv_table *table = PushStruct(global_arena, csv_table); + + csv_token_list *token_list = PushStruct(global_arena, csv_token_list); + + csv_token *tokens = tokenize_csv(buffer, global_arena, table, token_list); + + assert_msg(tokens != NULL, "Tokens are NULL."); + + csv_token_list *ctl = PushStruct(global_arena, csv_token_list); + b_tree *bt = parse_csv(global_arena, ctl, table); + + b_tree_write(bt); + } + + // NOTE(nasr): not sure on how to approach the b-tree and the table format thing + // we kind of want our table format i think? but i wouldnt be sure about the use case + // so we stick to the regular b_tree for now. commenting out the tables. + + sleep(1); + } + } + + return 0; +} -- cgit v1.3