From 71ced998122c357bc62e54d9bb4e124c88acf94b Mon Sep 17 00:00:00 2001 From: nasr Date: Sun, 8 Mar 2026 21:01:43 +0000 Subject: refactor(main): worked on string handling in C and other stuff --- .gitignore | 1 + Makefile | 2 +- source/b_tree.c | 66 +++++++++++++++++++++++++++++ source/b_tree.h | 44 +++++++++++++++++++ source/base/base.h | 16 +++---- source/base/base_include.h | 2 + source/base/base_io.h | 29 ++++++++++++- source/base/base_os.h | 10 +---- source/base/base_string.h | 15 ++++--- source/base/base_test.h | 2 +- source/csv_parser.c | 32 ++++++++++++++ source/csv_parser.h | 40 ++++++++++++++++++ source/engine.c | 46 ++++++++++++++++++++ source/engine/engine.c | 37 ---------------- source/lexer.c | 94 +++++++++++++++++++++++++++++++++++++++++ source/lexer.h | 32 ++++++++++++++ source/lexer/lexer.c | 100 -------------------------------------------- source/lexer/lexer.h | 23 ---------- source/parser/parser.c | 16 ------- source/parser/parser.h | 16 ------- source/query.c | 0 source/query.h | 0 source/query/query.c | 0 source/query/query.h | 0 source/repl/repl.c | 12 ------ source/repl/repl.h | 16 ------- source/storage/b_tree.c | 45 -------------------- source/storage/b_tree.h | 44 ------------------- source/storage/csv_reader.c | 7 ---- source/storage/csv_reader.h | 43 ------------------- source/storage/table.c | 0 source/storage/table.h | 0 32 files changed, 405 insertions(+), 385 deletions(-) create mode 100644 source/b_tree.c create mode 100644 source/b_tree.h create mode 100644 source/csv_parser.c create mode 100644 source/csv_parser.h create mode 100644 source/engine.c delete mode 100644 source/engine/engine.c create mode 100644 source/lexer.c create mode 100644 source/lexer.h delete mode 100644 source/lexer/lexer.c delete mode 100644 source/lexer/lexer.h delete mode 100644 source/parser/parser.c delete mode 100644 source/parser/parser.h create mode 100644 source/query.c create mode 100644 source/query.h delete mode 100644 source/query/query.c delete mode 100644 source/query/query.h delete mode 100644 source/repl/repl.c delete mode 100644 source/repl/repl.h delete mode 100644 source/storage/b_tree.c delete mode 100644 source/storage/b_tree.h delete mode 100644 source/storage/csv_reader.c delete mode 100644 source/storage/csv_reader.h delete mode 100644 source/storage/table.c delete mode 100644 source/storage/table.h diff --git a/.gitignore b/.gitignore index 00fe5ef..05a7b85 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ idea/* /helper.sh /tests /tags +/.cache/clangd/index diff --git a/Makefile b/Makefile index f9c5906..0224a42 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ BIN = build/engine -SRC = source/engine/engine.c +SRC = source/engine.c CC = clang CFLAGS = -Wall -Wextra -Wfloat-equal -Wswitch-default -Wswitch-enum \ -Wno-unused-parameter -Wno-implicit-fallthrough -Wno-unused-function -g -Werror diff --git a/source/b_tree.c b/source/b_tree.c new file mode 100644 index 0000000..e42f709 --- /dev/null +++ b/source/b_tree.c @@ -0,0 +1,66 @@ + +// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? + +internal b_tree_node * +node_alloc(mem_arena *arena) +{ + b_tree_node *node = PushStructZero(arena, type); + node->leaf = 1; + return node; +} + +// NOTE(nasr): @return the index of of the found element +internal s32 +node_find_pos(mem_arena *arena, string8 value) +{ + s32 i = 0; + while (i < n->key_count && str8_cmp(n->keys[i], k) < 0) + { + ++i; + } + + return i; +} + +interal void +b_tree_create(mem_arena *arena, b_tree *tree) +{ + tree->root = node_alloc(arena); + tree->root->leaf = 1; + tree->root->key_count = 0; +} + +// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory +internal void +b_tree_search(b_tree_node *node, string8 key) +{ + s32 found_index = node_find_pos(node, key); + + if (found_index < n->key_count && string_compare(n->keys[i], key) == 0) + { + return n->rows[i]; + } + if (n->leaf) + { + return NULL; + + } + + return b_tree_search(n->children[i], key); + + +} + +internal void +b_tree_insert() +{ + +} + +internal void +b_tree_write() +{ + // TODO(nasr): write the b_tree to disk +} + + diff --git a/source/b_tree.h b/source/b_tree.h new file mode 100644 index 0000000..0fc4d3c --- /dev/null +++ b/source/b_tree.h @@ -0,0 +1,44 @@ +#ifndef BTREE_H +#define BTREE_H + +// maximum height of the tree the lower the lower the lower amount +// of disk reads which translates into faster? +#define B_TREE_ORDER 4 + +typedef struct b_tree_node b_tree_node; +struct b_tree_node +{ + + // store the values + string8 keys[B_TREE_ORDER - 1]; + csv_row *rows[B_TREE_ORDER - 1]; + + b_tree_node *parent; + // handle to store children faster than linked list + // because now we can iteratate over the nodes instead of having cache misses + // when jumping through linked nodes + b_tree_node *children[B_TREE_ORDER]; + + // NOTE(nasr): reference count ::: check how many leaves are using this node + // also not needed for now because we don't free individual node because of arena allocator + // s32 *refc; + + s32 key_count; + + b32 leaf; + +}; + +typedef struct b_tree b_tree; +struct b_tree +{ + // NOTE(nasr): make sure this always stays in memory + // so that initial fetch never requires a disk read + b_tree_node *root; + +}; + + + + +#endif /* BTREE_H */ diff --git a/source/base/base.h b/source/base/base.h index ef23391..cf2a15f 100755 --- a/source/base/base.h +++ b/source/base/base.h @@ -52,22 +52,22 @@ typedef uint32_t u32; typedef uint16_t u16; typedef uint8_t u8; -typedef int8_t i8; -typedef int16_t i16; -typedef int32_t i32; -typedef int64_t i64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; typedef float f32; typedef double f64; -typedef i32 b32; -typedef i16 b16; +typedef s32 b32; +typedef s16 b16; typedef u8 b8; typedef uintptr_t umm; typedef intptr_t smm; -#define TRUE (0 == 0) -#define FALSE (0 != 0) +#define TRUE (1 == 1) +#define FALSE (1 != 1) #endif diff --git a/source/base/base_include.h b/source/base/base_include.h index 40ae5ea..07856d0 100755 --- a/source/base/base_include.h +++ b/source/base/base_include.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include "base_stack.h" #include "base_test.h" #include "base_string.h" +#include "base_io.h" #include "base_os.h" #ifdef BASE_UNITY diff --git a/source/base/base_io.h b/source/base/base_io.h index ece4d7c..85fedc7 100644 --- a/source/base/base_io.h +++ b/source/base/base_io.h @@ -1,11 +1,38 @@ #ifndef BASE_IO_H #define BASE_IO_H +#define STDIN_FD 0 +#define STDOUT_FD 1 +#define STDERR_FD 2 + +internal s64 +os_write(s32 fd, void const *buf, u64 count) +{ + return syscall(SYS_write, fd, buf, count); +} + +internal s64 +os_read(s32 fd, void *buf, u64 count) +{ + return syscall(SYS_read, fd, buf, count); +} + internal void -input_read() +print_s8(string8 s) { + os_write(STDOUT_FILENO, s.data, s.size); +} +internal void +print(const char *str) +{ + s32 len = 0; + while (str[len]) len++; + os_write(STDOUT_FILENO, str, len); } +#define Os_read(buffer, buffer_count) + + #endif /* BASE_IO_H */ diff --git a/source/base/base_os.h b/source/base/base_os.h index 23587c6..abe8628 100644 --- a/source/base/base_os.h +++ b/source/base/base_os.h @@ -1,21 +1,13 @@ #ifndef BASE_OS_H #define BASE_OS_H -internal void -print(const char *str) -{ - i32 len = 0; - while (str[len]) len++; - write(STDOUT_FILENO, str, len); -} - internal string8 load_file(const char *path) { string8 result = {0}; struct stat sbuf = {0}; - i32 file = open(path, O_RDONLY); + s32 file = open(path, O_RDONLY); if(file == -1) return result; if(fstat(file, &sbuf) == -1) diff --git a/source/base/base_string.h b/source/base/base_string.h index 189b38a..4e0d923 100644 --- a/source/base/base_string.h +++ b/source/base/base_string.h @@ -1,13 +1,18 @@ #ifndef BASE_STRING_H #define BASE_STRING_H -#include - #define StringLit(string) \ (string8){ .data = (u8 *)(string), .size = (sizeof(string) - 1) } - #define PushString(arena, size) \ - (string8){ (u8 *)PushArray((arena), u8, (size)), (u64)(size) } +#define PushString(arena, size) \ + ({ \ + string8 *result = PushStruct((arena), string8); \ + result->data = PushArray((arena), u8, (size)); \ + result->size = (u64)(size); \ + result; \ + }) + +#define String8Cast(literal, literal_count) ( string8 ){( u8 * )( literal ),( u64 )( literal_count ) } #define StringFmt "%.*s" #define ULongFmt "%lu" @@ -50,10 +55,8 @@ string8_append_char(string8 *buf, u8 c) read_only global_variable string8 nil_string = { - .data = NULL, .size = 0, - }; #endif /* BASE_STRING_H */ diff --git a/source/base/base_test.h b/source/base/base_test.h index 412797b..7619cfb 100644 --- a/source/base/base_test.h +++ b/source/base/base_test.h @@ -9,7 +9,7 @@ #define LEN(s) (sizeof(s) - 1) internal void -write_int(i32 num) +write_int(s32 num) { if (num < 0) diff --git a/source/csv_parser.c b/source/csv_parser.c new file mode 100644 index 0000000..048e089 --- /dev/null +++ b/source/csv_parser.c @@ -0,0 +1,32 @@ +internal void +strip_new_line(string8 buffer) +{ + + for (u64 index = 0; index < buffer.size; index++) + { + + } + + return; + +} + +internal void +read_csv(string8 buffer) +{ + printf("\nsize:%lu\ndata %s\n", buffer.size, buffer.data); + +} + +internal csv_table * +parse_csv(token *tokens, csv_table *table) +{ + + + + + + + + return NULL; +} diff --git a/source/csv_parser.h b/source/csv_parser.h new file mode 100644 index 0000000..1aa96b9 --- /dev/null +++ b/source/csv_parser.h @@ -0,0 +1,40 @@ +#ifndef CSV_PARSER_H +#define CSV_PARSER_H + +typedef struct csv_row csv_row; +struct csv_row +{ + // array of size col_count, points into mmap buffer + string8 *fields; + s32 count; +}; + +typedef struct csv_table csv_table; +struct csv_table +{ + // first row, col names + // all data rows + string8 *headers; + csv_row *rows; + s32 col_count; + s32 row_count; +}; + +read_only global_variable +csv_row nil_csv_row = +{ + .fields = &nil_string, + .count = 0, +}; + +read_only global_variable +csv_table nil_csv_table = +{ + .headers = &nil_string, + .rows = &nil_csv_row, + .col_count = 0, + .row_count = 0, +}; + + +#endif /* CSV_PARSER_H */ diff --git a/source/engine.c b/source/engine.c new file mode 100644 index 0000000..8f701e3 --- /dev/null +++ b/source/engine.c @@ -0,0 +1,46 @@ +#define BASE_UNITY +#include "base/base_include.h" + +#include + +#include "csv_parser.h" +#include "lexer.h" + +#include "lexer.c" +#include "csv_parser.c" + +int main(int c, char **v) +{ + if(c < 2) return -999; + + b32 running = 0; + + mem_arena *global_arena = arena_create(MiB(30)); + csv_table *global_table = PushStruct(global_arena, csv_table); + + string8 buffer = load_file(v[1]); + + print("database engine in nasr"); + + for(;;) + { + if (running) + { + // TODO(nasr): check for return key + { + u8 line_buffer[256] = {}; + + s64 codepoint = os_read(STDIN_FD, line_buffer, 256); + unused(codepoint); + + read_csv(buffer); + token *tokens = tokenize_csv(buffer, global_arena); + global_table = parse_csv(tokens, global_table); + } + + sleep(1); + } + } + + return 0; +} diff --git a/source/engine/engine.c b/source/engine/engine.c deleted file mode 100644 index 18b52be..0000000 --- a/source/engine/engine.c +++ /dev/null @@ -1,37 +0,0 @@ -#define BASE_UNITY -#include "../base/base_include.h" - -#include - - - -#include "../parser/parser.h" -#include "../parser/parser.c" - -#include "../repl/repl.h" -#include "../repl/repl.c" - -#include "../storage/csv_reader.h" -#include "../storage/csv_reader.c" - -#include "../lexer/lexer.h" -#include "../lexer/lexer.c" - - - -int main(int c, char **v) -{ - if(c < 2) return -999; - - mem_arena *global_arena = arena_create(MiB(30)); - csv_table *global_table = PushStruct(global_arena, csv_table); - - string8 buffer = load_file(v[1]); - // read_csv(buffer); - tokenize_csv(buffer, global_table, global_arena); - - - return 0; -} - - diff --git a/source/lexer.c b/source/lexer.c new file mode 100644 index 0000000..c84a831 --- /dev/null +++ b/source/lexer.c @@ -0,0 +1,94 @@ +// the lexer acts as a table builder from a csv file +// and parsing indivudal rows and columns +// the next step would be building a the b-tree +internal b32 +is_alpha(u8 point) +{ + return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); +} + +internal b32 +is_digit(u8 point) +{ + return (point >= '0' && point <= '9'); +} + +internal b32 +is_alpha_num(u8 point) +{ + return (is_alpha(point) || is_digit(point)); +} + +internal b32 +is_whitespace(u8 point) +{ + return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); +} + +internal b32 +is_delimiter(u8 point) +{ + return (point == ','); +} + +internal token * +tokenize_csv(string8 buffer, mem_arena *arena) +{ + s32 count = 0; + string8 **tokens = PushString(arena, buffer.size); + + b32 FL = TRUE; + + if(buffer.size < 0) return NULL; + for(s32 index = 0; + buffer.data[index] != '\0'; + ++index) + { + csv_row *row = PushStruct(arena, csv_row); + token *tok = PushStruct(arena, token); + + u8 point = buffer.data[index]; + + u8 *start = buffer.data; + u8 *end = NULL; + + unused(row); + + switch(point) + { + case('\n'): + { + + if(FL) + { + FL = FALSE; + tok->flags |= END_FL; + } + + break; + + } + + case(','): + { + end = start - 1; + break; + } + + default: + { + printf("point: %c\n", point); + count++; + break; + } + } + + token->lexeme = String8Cast(start, end - start); + + *tokens = token; + ++tokens; + + + return NULL; + } +} diff --git a/source/lexer.h b/source/lexer.h new file mode 100644 index 0000000..7bafc0d --- /dev/null +++ b/source/lexer.h @@ -0,0 +1,32 @@ +#ifndef ENGINE_LEXER_H +#define ENGINE_LEXER_H + +typedef enum token_flags token_flags; +enum token_flags +{ + START_FL = 1 << 1, + END_FL = 1 << 2, +}; + + +typedef enum token_type token_type; +enum token_type +{ + // first 255 tokens for ascii characters + TOKEN_UNDEFINED = 255, + TOKEN_IDENTIFIER, + TOKEN_VALUE, + +}; + +typedef struct token token; +struct token +{ + string8 lexeme; + token_type type; + token_flags flags; +}; + + + +#endif /* ENGINE_LEXER_H */ diff --git a/source/lexer/lexer.c b/source/lexer/lexer.c deleted file mode 100644 index 948afd0..0000000 --- a/source/lexer/lexer.c +++ /dev/null @@ -1,100 +0,0 @@ -// the lexer acts as a table builder from a csv file -// and parsing indivudal rows and columns -// the next step would be building a the b-tree -internal b32 -is_alpha(u8 point) -{ - return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); -} - -internal b32 -is_digit(u8 point) -{ - return (point >= '0' && point <= '9'); -} - -internal b32 -is_alpha_num(u8 point) -{ - return (is_alpha(point) || is_digit(point)); -} - -internal b32 -is_whitespace(u8 point) -{ - return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); -} - -internal b32 -is_delimiter(u8 point) -{ - return (point == ','); -} - -internal token * -tokenize_csv(string8 buffer, csv_table *global_table, mem_arena *arena) -{ - i32 count = 0; - string8 **tokens = PushArray(arena, string8 *, buffer.size / 10); - b32 first_line = 1; - - if(buffer.size < 0) return NULL; - for(i32 index = 0; - buffer.data[index] != '\0'; - ++index) - { - csv_row *row = PushStruct(arena, csv_row); - string8 token = {0}; - - u8 point = buffer.data[index]; - - u8 *start = buffer.data; - u8 *end = NULL; - - unused(row); - - switch (point) - { - case '\n': - { - first_line = -1; - break; - } - case ',': - { - end = start - 1; - - if (first_line) - { - global_table->headers = &token; - ++global_table->headers; - break; - } - else - { - - break; - } - } - - default: - { - printf("point: %c\n", point); - count++; - break; - } - } - - token = (string8){ - .data = start, - .size = end - start, - }; - - **tokens = token; - ++*tokens; - } - - printf("%d", count); - - return NULL; -} diff --git a/source/lexer/lexer.h b/source/lexer/lexer.h deleted file mode 100644 index 86f8427..0000000 --- a/source/lexer/lexer.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef ENGINE_LEXER_H -#define ENGINE_LEXER_H - -typedef enum token_type token_type; -enum token_type -{ - // first 255 tokens for ascii characters - TOKEN_UNDEFINED = 255, - TOKEN_IDENTIFIER, - TOKEN_VALUE, - -}; - -typedef struct token token; -struct token -{ - string8 lexeme; - token_type type; - -}; - - -#endif /* ENGINE_LEXER_H */ diff --git a/source/parser/parser.c b/source/parser/parser.c deleted file mode 100644 index 4c57345..0000000 --- a/source/parser/parser.c +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef ENGINE_REPL_H -#define ENGINE_REPL_H - -typedef struct node node; -struct node -{ - -}; - -typedef struct btree btree; -struct btree -{ - -}; - -#endif /* ENGINE_H */ diff --git a/source/parser/parser.h b/source/parser/parser.h deleted file mode 100644 index 4c57345..0000000 --- a/source/parser/parser.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef ENGINE_REPL_H -#define ENGINE_REPL_H - -typedef struct node node; -struct node -{ - -}; - -typedef struct btree btree; -struct btree -{ - -}; - -#endif /* ENGINE_H */ diff --git a/source/query.c b/source/query.c new file mode 100644 index 0000000..e69de29 diff --git a/source/query.h b/source/query.h new file mode 100644 index 0000000..e69de29 diff --git a/source/query/query.c b/source/query/query.c deleted file mode 100644 index e69de29..0000000 diff --git a/source/query/query.h b/source/query/query.h deleted file mode 100644 index e69de29..0000000 diff --git a/source/repl/repl.c b/source/repl/repl.c deleted file mode 100644 index dd289d8..0000000 --- a/source/repl/repl.c +++ /dev/null @@ -1,12 +0,0 @@ -internal void -init_repl() -{ - for(;;) - { - print("reading user input..."); - // TODO(nasr): design a repl system - - sleep(1); - } - -} diff --git a/source/repl/repl.h b/source/repl/repl.h deleted file mode 100644 index 4c57345..0000000 --- a/source/repl/repl.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef ENGINE_REPL_H -#define ENGINE_REPL_H - -typedef struct node node; -struct node -{ - -}; - -typedef struct btree btree; -struct btree -{ - -}; - -#endif /* ENGINE_H */ diff --git a/source/storage/b_tree.c b/source/storage/b_tree.c deleted file mode 100644 index 6a0e76d..0000000 --- a/source/storage/b_tree.c +++ /dev/null @@ -1,45 +0,0 @@ - -// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? - -internal b_tree_node * -node_alloc(mem_arena *arena) -{ - b_tree_node *node = PushStructZero(arena, type); - node->leaf = 1; - return node; -} - -// NOTE(nasr): @return the index of of the found element -internal i32 -node_find_pos(mem_arena *arena, string8 value) -{ - i32 i = 0; - while (i < n->key_count && str8_cmp(n->keys[i], k) < 0) - { - ++i; - } - - return i; -} - -// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory -internal void -b_tree_search(node *node) -{ - - -} - -internal void -b_tree_insert() -{ - -} - -internal void -b_tree_write() -{ - // TODO(nasr): write the b_tree to disk -} - - diff --git a/source/storage/b_tree.h b/source/storage/b_tree.h deleted file mode 100644 index 10ad00d..0000000 --- a/source/storage/b_tree.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef BTREE_H -#define BTREE_H - -// maximum height of the tree the lower the lower the lower amount -// of disk reads which translates into faster? -#define B_TREE_ORDER 4 - -typedef struct b_tree_node b_tree_node; -struct b_tree_node -{ - - // store the values - string8 keys[B_TREE_ORDER - 1]; - csv_row *rows[B_TREE_ORDER - 1]; - - b_tree_node *parent; - // handle to store children faster than linked list - // because now we can iteratate over the nodes instead of having cache misses - // when jumping through linked nodes - b_tree_node *children[B_TREE_ORDER]; - - // NOTE(nasr): reference count ::: check how many leaves are using this node - // also not needed for now because we don't free individual node because of arena allocator - // i32 *refc; - - i32 key_count; - - b32 leaf; - -}; - -typedef struct b_tree b_tree; -struct b_tree -{ - // NOTE(nasr): make sure this always stays in memory - // so that initial fetch never requires a disk read - b_tree_node *root; - -}; - - - - -#endif /* BTREE_H */ diff --git a/source/storage/csv_reader.c b/source/storage/csv_reader.c deleted file mode 100644 index 2fcbe04..0000000 --- a/source/storage/csv_reader.c +++ /dev/null @@ -1,7 +0,0 @@ -internal void -read_csv(string8 buffer) -{ - printf("\nsize:%lu\ndata %s\n", buffer.size, buffer.data); - -} - diff --git a/source/storage/csv_reader.h b/source/storage/csv_reader.h deleted file mode 100644 index 36e07a4..0000000 --- a/source/storage/csv_reader.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef CSV_READER_H -#define CSV_READER_H - -typedef struct csv_row csv_row; -struct csv_row -{ - // array of size col_count, points into mmap buffer - string8 *fields; - i32 count; -}; - -typedef struct csv_table csv_table; -struct csv_table -{ - // first row, col names - // all data rows - string8 *headers; - csv_row *rows; - i32 col_count; - i32 row_count; -}; - -read_only global_variable -csv_row nil_csv_row = -{ - .fields = &nil_string, - .count = 0, -}; - - - -read_only global_variable -csv_table nil_csv_table = -{ - .headers = &nil_string, - .rows = &nil_csv_row, - .col_count = 0, - .row_count = 0, -}; - - - -#endif /* CSV_READER_H */ diff --git a/source/storage/table.c b/source/storage/table.c deleted file mode 100644 index e69de29..0000000 diff --git a/source/storage/table.h b/source/storage/table.h deleted file mode 100644 index e69de29..0000000 -- cgit v1.3