From 854d66bb73cd131bda6452aac74aae3d9a77b91a Mon Sep 17 00:00:00 2001 From: nasr Date: Mon, 9 Mar 2026 19:44:59 +0000 Subject: refactor(main): simplified the project. going towards a single header file project maybe... --- source/b_tree.c | 66 --------------------- source/b_tree.h | 44 -------------- source/base/base_string.h | 3 +- source/btree.h | 108 ++++++++++++++++++++++++++++++++++ source/csv_parser.c | 26 --------- source/csv_parser.h | 40 ------------- source/csv_reader.h | 145 ++++++++++++++++++++++++++++++++++++++++++++++ source/engine.c | 101 +++++++++++++++++++++++++++++--- source/lexer.c | 80 ------------------------- source/lexer.h | 30 ---------- source/query.c | 0 source/query.h | 0 12 files changed, 348 insertions(+), 295 deletions(-) delete mode 100644 source/b_tree.c delete mode 100644 source/b_tree.h create mode 100644 source/btree.h delete mode 100644 source/csv_parser.c delete mode 100644 source/csv_parser.h create mode 100644 source/csv_reader.h delete mode 100644 source/lexer.c delete mode 100644 source/lexer.h delete mode 100644 source/query.c delete mode 100644 source/query.h diff --git a/source/b_tree.c b/source/b_tree.c deleted file mode 100644 index e42f709..0000000 --- a/source/b_tree.c +++ /dev/null @@ -1,66 +0,0 @@ - -// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? - -internal b_tree_node * -node_alloc(mem_arena *arena) -{ - b_tree_node *node = PushStructZero(arena, type); - node->leaf = 1; - return node; -} - -// NOTE(nasr): @return the index of of the found element -internal s32 -node_find_pos(mem_arena *arena, string8 value) -{ - s32 i = 0; - while (i < n->key_count && str8_cmp(n->keys[i], k) < 0) - { - ++i; - } - - return i; -} - -interal void -b_tree_create(mem_arena *arena, b_tree *tree) -{ - tree->root = node_alloc(arena); - tree->root->leaf = 1; - tree->root->key_count = 0; -} - -// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory -internal void -b_tree_search(b_tree_node *node, string8 key) -{ - s32 found_index = node_find_pos(node, key); - - if (found_index < n->key_count && string_compare(n->keys[i], key) == 0) - { - return n->rows[i]; - } - if (n->leaf) - { - return NULL; - - } - - return b_tree_search(n->children[i], key); - - -} - -internal void -b_tree_insert() -{ - -} - -internal void -b_tree_write() -{ - // TODO(nasr): write the b_tree to disk -} - - diff --git a/source/b_tree.h b/source/b_tree.h deleted file mode 100644 index 0fc4d3c..0000000 --- a/source/b_tree.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef BTREE_H -#define BTREE_H - -// maximum height of the tree the lower the lower the lower amount -// of disk reads which translates into faster? -#define B_TREE_ORDER 4 - -typedef struct b_tree_node b_tree_node; -struct b_tree_node -{ - - // store the values - string8 keys[B_TREE_ORDER - 1]; - csv_row *rows[B_TREE_ORDER - 1]; - - b_tree_node *parent; - // handle to store children faster than linked list - // because now we can iteratate over the nodes instead of having cache misses - // when jumping through linked nodes - b_tree_node *children[B_TREE_ORDER]; - - // NOTE(nasr): reference count ::: check how many leaves are using this node - // also not needed for now because we don't free individual node because of arena allocator - // s32 *refc; - - s32 key_count; - - b32 leaf; - -}; - -typedef struct b_tree b_tree; -struct b_tree -{ - // NOTE(nasr): make sure this always stays in memory - // so that initial fetch never requires a disk read - b_tree_node *root; - -}; - - - - -#endif /* BTREE_H */ diff --git a/source/base/base_string.h b/source/base/base_string.h index 4e0d923..ef2cf16 100644 --- a/source/base/base_string.h +++ b/source/base/base_string.h @@ -12,7 +12,8 @@ result; \ }) -#define String8Cast(literal, literal_count) ( string8 ){( u8 * )( literal ),( u64 )( literal_count ) } +#define StringCast(data, size) (string8){(u8 *)(data), (u64)(size) } +#define StringPCast(data, size) (string8 *){(u8 *)(data), (u64)(size) } #define StringFmt "%.*s" #define ULongFmt "%lu" diff --git a/source/btree.h b/source/btree.h new file mode 100644 index 0000000..42c8e21 --- /dev/null +++ b/source/btree.h @@ -0,0 +1,108 @@ +#ifndef BTREE_H +#define BTREE_H + +// maximum height of the tree the lower the lower the lower amount +// of disk reads which translates into faster? +#define B_TREE_ORDER 4 + +typedef struct b_tree_node b_tree_node; +struct b_tree_node +{ + + // store the values + string8 keys[B_TREE_ORDER - 1]; + csv_row *rows[B_TREE_ORDER - 1]; + + b_tree_node *parent; + // handle to store children faster than linked list + // because now we can iteratate over the nodes instead of having cache misses + // when jumping through linked nodes + b_tree_node *children[B_TREE_ORDER]; + + // NOTE(nasr): reference count ::: check how many leaves are using this node + // also not needed for now because we don't free individual node because of arena allocator + // s32 *refc; + + s32 key_count; + + b32 leaf; + +}; + +typedef struct b_tree b_tree; +struct b_tree +{ + // NOTE(nasr): make sure this always stays in memory + // so that initial fetch never requires a disk read + b_tree_node *root; + +}; + + + + +// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? + +internal b_tree_node * +node_alloc(mem_arena *arena) +{ + b_tree_node *node = PushStructZero(arena, type); + node->leaf = 1; + return node; +} + +// NOTE(nasr): @return the index of of the found element +internal s32 +node_find_pos(mem_arena *arena, string8 value) +{ + s32 i = 0; + while (i < n->key_count && str8_cmp(n->keys[i], k) < 0) + { + ++i; + } + + return i; +} + +interal void +b_tree_create(mem_arena *arena, b_tree *tree) +{ + tree->root = node_alloc(arena); + tree->root->leaf = 1; + tree->root->key_count = 0; +} + +// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory +internal void +b_tree_search(b_tree_node *node, string8 key) +{ + s32 found_index = node_find_pos(node, key); + + if (found_index < n->key_count && string_compare(n->keys[i], key) == 0) + { + return n->rows[i]; + } + if (n->leaf) + { + return NULL; + + } + + return b_tree_search(n->children[i], key); + + +} + +internal void +b_tree_insert() +{ + +} + +internal void +b_tree_write() +{ + // TODO(nasr): write the b_tree to disk +} + +#endif /* BTREE_H */ diff --git a/source/csv_parser.c b/source/csv_parser.c deleted file mode 100644 index 6679d2f..0000000 --- a/source/csv_parser.c +++ /dev/null @@ -1,26 +0,0 @@ -internal void -strip_new_line(string8 buffer) -{ - - for (u64 index = 0; index < buffer.size; index++) - { - - } - - return; - -} - -internal void -read_csv(string8 buffer) -{ - printf("\nsize:%lu\ndata %s\n", buffer.size, buffer.data); - -} - -internal csv_table * -parse_csv(token *tokens, csv_table *table) -{ - - return NULL; -} diff --git a/source/csv_parser.h b/source/csv_parser.h deleted file mode 100644 index 1aa96b9..0000000 --- a/source/csv_parser.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef CSV_PARSER_H -#define CSV_PARSER_H - -typedef struct csv_row csv_row; -struct csv_row -{ - // array of size col_count, points into mmap buffer - string8 *fields; - s32 count; -}; - -typedef struct csv_table csv_table; -struct csv_table -{ - // first row, col names - // all data rows - string8 *headers; - csv_row *rows; - s32 col_count; - s32 row_count; -}; - -read_only global_variable -csv_row nil_csv_row = -{ - .fields = &nil_string, - .count = 0, -}; - -read_only global_variable -csv_table nil_csv_table = -{ - .headers = &nil_string, - .rows = &nil_csv_row, - .col_count = 0, - .row_count = 0, -}; - - -#endif /* CSV_PARSER_H */ diff --git a/source/csv_reader.h b/source/csv_reader.h new file mode 100644 index 0000000..6c8e625 --- /dev/null +++ b/source/csv_reader.h @@ -0,0 +1,145 @@ +#ifndef ENGINE_LEXER_H +#define ENGINE_LEXER_H + +typedef enum token_flags token_flags; +enum token_flags +{ + START_FL = 1 << 1, + END_FL = 1 << 2, +}; + + +typedef enum token_type token_type; +enum token_type +{ + // first 255 tokens for ascii characters + TOKEN_UNDEFINED = 255, + TOKEN_IDENTIFIER, + TOKEN_VALUE, +}; + +typedef struct token token; +struct token +{ + string8 lexeme; + token_type type; + token_flags flags; + token *next; +}; + +typedef struct csv_row csv_row; +struct csv_row +{ + // array of size col_count, points into mmap buffer + string8 *fields; + s32 count; +}; + +typedef struct csv_table csv_table; +struct csv_table +{ + // first row, col names + // all data rows + string8 *headers; + csv_row *rows; + s32 col_count; + s32 row_count; +}; + +read_only global_variable +csv_row nil_csv_row = +{ + .fields = &nil_string, + .count = 0, +}; + +read_only global_variable +csv_table nil_csv_table = +{ + .headers = &nil_string, + .rows = &nil_csv_row, + .col_count = 0, + .row_count = 0, +}; + + +// the lexer acts as a table builder from a csv file +// and parsing indivudal rows and columns +// the next step would be building a the b-tree +internal token * +tokenize_csv(string8 buffer, mem_arena *arena) +{ + + b32 FL = TRUE; + + if(buffer.size < 0) return NULL; + for(s32 index = 0; buffer.data[index] != '\0'; ++index) + { + token *tok = PushStruct(arena, token); + u8 point = buffer.data[index]; + + s32 start = 0; + s32 end = 0; + + if(is_whitespace(point)) + { + print("csv file is invalid"); + return NULL; + } + + switch(point) + { + case('\n'): + { + if(FL) tok->flags |= END_FL; + break; + } + + case(','): + { + end = index - 1; + start = index + 1; + break; + } + default: + { + break; + } + } + + tok->lexeme = StringCast(&buffer.data[start], end - start); + tok->next = tok; + } + + return NULL; +} + +internal void +strip_new_line(string8 buffer) +{ + + for (u64 index = 0; index < buffer.size; index++) + { + + } + + return; + +} + +internal void +read_csv(string8 buffer) +{ + // printf("\nsize:%lu\ndata %s\n", buffer.size, buffer.data); + +} + +internal csv_table * +parse_csv(token *tokens, csv_table *table) +{ + + return NULL; +} + + +#endif /* ENGINE_LEXER_H */ diff --git a/source/engine.c b/source/engine.c index fe609e9..3527e27 100644 --- a/source/engine.c +++ b/source/engine.c @@ -1,17 +1,88 @@ #define BASE_UNITY #include "base/base_include.h" -#include +internal b32 +is_alpha(u8 point) +{ + return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); +} + +internal b32 +is_digit(u8 point) +{ + return (point >= '0' && point <= '9'); +} -#include "csv_parser.h" -#include "lexer.h" +internal b32 +is_alpha_num(u8 point) +{ + return (is_alpha(point) || is_digit(point)); +} + +internal b32 +is_whitespace(u8 point) +{ + return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); +} + +internal b32 +is_delimiter(u8 point) +{ + return (point == ','); +} + +#include "csv_reader.h" +// #include "btree.h" + +typedef struct query_token query_token; +struct query_token +{ + string8 *lexeme; + query_token *next; +}; + + +// takes on line of the repl input +internal query_token * +query_tokenizer(mem_arena *arena, string8 *buffer) +{ + query_token *tok = PushStruct(arena, query_token); + unused(tok); -#include "lexer.c" -#include "csv_parser.c" + for (u64 index = 0; index < buffer->size; ++index) + { + u8 codepoint = buffer->data[index]; + + if(codepoint == '\n' || codepoint == '\r') break; + + s32 start = 0; + s32 end = 0; + + if(is_whitespace(codepoint)) + { + end = index; + } + + s32 new_token_size = end - start; + + tok->lexeme->data = &buffer->data[index]; + tok->lexeme->size = new_token_size; + + tok->next = tok; + start = index + 1; + + } + + return tok; +} int main(int c, char **v) { - if(c < 2) return -999; + if(c < 2) + { + print("bad file, setting default file\n"); + } + else v[1] = "./test/customers-10000.csv"; local_persist b32 running = 1; @@ -26,13 +97,27 @@ int main(int c, char **v) { if (running) { - // TODO(nasr): check for return key { u8 line_buffer[256] = {}; - s64 codepoint = os_read(STDIN_FD, line_buffer, 256); unused(codepoint); + for(s32 index = 0; index < 256; ++index) + { + if(line_buffer[index] == '\n') + { + print("exiting"); + return 0; + } + else if(line_buffer[index] == ' ') + { + print("TODO(nasr): "); + } + + } + } + + { read_csv(buffer); token *tokens = tokenize_csv(buffer, global_arena); global_table = parse_csv(tokens, global_table); diff --git a/source/lexer.c b/source/lexer.c deleted file mode 100644 index decfb7a..0000000 --- a/source/lexer.c +++ /dev/null @@ -1,80 +0,0 @@ -// the lexer acts as a table builder from a csv file -// and parsing indivudal rows and columns -// the next step would be building a the b-tree -internal b32 -is_alpha(u8 point) -{ - return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); -} - -internal b32 -is_digit(u8 point) -{ - return (point >= '0' && point <= '9'); -} - -internal b32 -is_alpha_num(u8 point) -{ - return (is_alpha(point) || is_digit(point)); -} - -internal b32 -is_whitespace(u8 point) -{ - return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); -} - -internal b32 -is_delimiter(u8 point) -{ - return (point == ','); -} - -internal token * -tokenize_csv(string8 buffer, mem_arena *arena) -{ - - b32 FL = TRUE; - - if(buffer.size < 0) return NULL; - for(s32 index = 0; buffer.data[index] != '\0'; ++index) - { - token *tok = PushStruct(arena, token); - u8 point = buffer.data[index]; - - s32 start = 0; - s32 end = 0; - - if(is_whitespace(point)) - { - print("csv file is invalid"); - return NULL; - } - - switch(point) - { - case('\n'): - { - if(FL) tok->flags |= END_FL; - break; - } - - case(','): - { - end = index - 1; - start = index + 1; - break; - } - default: - { - break; - } - } - - tok->lexeme = String8Cast(&buffer.data[start], end - start); - tok->next = tok; - } - - return NULL; -} diff --git a/source/lexer.h b/source/lexer.h deleted file mode 100644 index c950a58..0000000 --- a/source/lexer.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef ENGINE_LEXER_H -#define ENGINE_LEXER_H - -typedef enum token_flags token_flags; -enum token_flags -{ - START_FL = 1 << 1, - END_FL = 1 << 2, -}; - - -typedef enum token_type token_type; -enum token_type -{ - // first 255 tokens for ascii characters - TOKEN_UNDEFINED = 255, - TOKEN_IDENTIFIER, - TOKEN_VALUE, -}; - -typedef struct token token; -struct token -{ - string8 lexeme; - token_type type; - token_flags flags; - token *next; -}; - -#endif /* ENGINE_LEXER_H */ diff --git a/source/query.c b/source/query.c deleted file mode 100644 index e69de29..0000000 diff --git a/source/query.h b/source/query.h deleted file mode 100644 index e69de29..0000000 -- cgit v1.3