summaryrefslogtreecommitdiff
path: root/source/tb_db
diff options
context:
space:
mode:
Diffstat (limited to 'source/tb_db')
-rw-r--r--source/tb_db/btree_impl.h209
-rw-r--r--source/tb_db/csv_decoder.h294
-rw-r--r--source/tb_db/tb_db.c198
3 files changed, 701 insertions, 0 deletions
diff --git a/source/tb_db/btree_impl.h b/source/tb_db/btree_impl.h
new file mode 100644
index 0000000..b7015bd
--- /dev/null
+++ b/source/tb_db/btree_impl.h
@@ -0,0 +1,209 @@
1/**
2 * >> api notes
3 * >> by Abdellah El Morabit
4 *
5 * - creation of a btree happens when doing an insertion on an
6 * empty btree where the root is null or nil
7 *
8 **/
9
10/* single header b-tree implementation by Abdellah El Morabit */
11#ifndef BTREE_H
12#define BTREE_H
13
// b-tree branching factor: a higher order means a shallower tree, so
// fewer disk reads per lookup — which should translate into faster queries
16#if 0
17global_variable read_only s16 btree_ORDER = 4;
18#endif
19#define BTREE_ORDER 4
20
//- NOTE(nasr): composite key used to order cells inside the b-tree.
// A key identifies a single CSV cell by its column (header) and row
// position — the pair, not their sum, is what makes the key unique.
typedef struct key key;
struct key
{
    s32 header_index; //- column number (index into the header row)
    s32 row_index;    //- zero-based row the cell belongs to
};
29
// One node of the b-tree: up to BTREE_ORDER-1 sorted keys, one payload
// pointer per key, and BTREE_ORDER child pointers (leaf nodes ignore them).
typedef struct btree_node btree_node;
struct btree_node
{
    // keys stored in this node, kept sorted; capacity BTREE_ORDER - 1
    key keys[BTREE_ORDER - 1];
    // TODO(nasr): replace with something more generic?
    // NOTE(nasr): cons of void * -> no type safety
    // is there a way to still have some sort of that? size not variable
    void *payload_per_key[BTREE_ORDER - 1];
    btree_node *parent;
    // children stored as a flat array rather than a linked list so
    // scans iterate contiguously instead of chasing pointers
    // (fewer cache misses when walking the node's children)
    btree_node *children[BTREE_ORDER];
    // this shouldn't change things but could be a performance improvement on a bigger scale??
    s32 *max_children;
    // NOTE(nasr): reference count ::: check how many leaves are using this node
    // also not needed for now because we don't free individual node because of arena allocator
    // s32 *refc;
    s32 key_count; // number of slots of `keys` currently in use
    b32 leaf;      // nonzero when this node has no children


    // NOTE(nasr): do we hold the reference to the arena? or do we pass is it as a reference?
    // this could solve memory location issues?
};
56
57
// Tree handle; the root is created lazily by the first btree_insert.
typedef struct btree btree;
struct btree
{
    // NOTE(nasr): make sure this always stays in memory
    // so that initial fetch never requires a disk read
    btree_node *root;
};
65
66
67
68#ifdef BTREE_IMPLEMENTATION
69///////////////////////////////////////////////////////////////////////////////
70//- btree.c
71
72
73internal b32
74is_nil_btree_node(btree_node *node)
75{
76 // NOTE(nasr): looks like a useless function but could be usefull when we implement a nil_btree_node
77 return (node == NULL);
78}
79
80// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees?
81
82internal b8
83key_compare(key destination_key, key source_key)
84{
85 s32 source_index = source_key.header_index + source_key.row_index;
86 s32 destination_index = destination_key.header_index + destination_key.row_index;
87
88 if(destination_index > source_index) return 1;
89 else if(destination_index < source_index) return -1;
90 else return 0;
91}
92
93// NOTE(nasr): @return the index of the found element
94//- @param: start node could be the root node for all we care. but this enables optimizations in the future if we need them
95internal s32
96btree_find_ipos(key k, btree_node *start_node)
97{
98 s32 index = 0;
99 // scan right while the node's key at [index] is strictly less than k
100 while (index < start_node->key_count && key_compare(start_node->keys[index], k) < 0)
101 {
102 ++index;
103 }
104 return index;
105}
106
107// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory
108internal void *
109btree_search(btree_node *node, key key)
110{
111 s32 index = btree_find_ipos(key, node);
112
113 if (index < node->key_count && key_compare(node->keys[index], key) == 0)
114 {
115 return node->payload_per_key[index];
116 }
117 if (node->leaf)
118 {
119 return NULL;
120 }
121 return btree_search(node->children[index], key);
122}
123
124// TODO(nasr): split node when key_count == btree_ORDER - 1 (node is full)
125// -- each node has max btree_ORDER - 1 keys
126//
127internal void
128btree_insert(mem_arena *arena, btree *tree, key key, void *payload)
129{
130 if(is_nil_btree_node(tree->root))
131 {
132 tree->root = PushStructZero(arena, btree_node);
133 tree->root->leaf = 1;
134 tree->root->key_count = 0;
135 }
136
137 btree_node *current_node = tree->root;
138
139 if (current_node->leaf)
140 {
141 // find the insertion position and shift keys right to make room
142 //- after: but what is this. we are expecting a new key.but we are looking for
143 //- its position in the tree is useless because it's non existent
144
145 s32 i = btree_find_ipos(key, current_node);
146
147 for (s32 j = current_node->key_count; j > i; --j)
148 {
149 current_node->keys[j] = current_node->keys[j - 1];
150 current_node->payload_per_key[j] = current_node->payload_per_key[j - 1];
151 }
152
153 current_node->keys[i] = key;
154 current_node->payload_per_key[i] = payload;
155
156 if(current_node->key_count < (BTREE_ORDER - 1))
157 {
158 current_node->key_count += 1;
159 }
160 else {
161 // TODO(nasr): creating a new branch / tree?
162 // make a seperate function for this
163 // NOTE(nasr): only split through the parent when one actually exists;
164 // root splits need to be handled separately (promote mid key, create two children)
165 if(current_node->parent != NULL)
166 {
167 s32 split_pos = current_node->parent->key_count / 2 - 1;
168 s32 updated_keycount_p = 0;
169 for(s32 index = 0; index <= split_pos; ++index)
170 {
171 ++updated_keycount_p;
172 }
173 current_node->parent->key_count = updated_keycount_p;
174 }
175 }
176 }
177 // TODO(nasr): internal node case walk down then split on the way back up
178}
179
180
// Persist the tree to disk — not implemented yet; currently a no-op
// placeholder so callers can already be wired up against the interface.
internal void
btree_write(btree *bt)
{
    unused(bt);
#if 0
    // sketch of a future serialization walk, kept for reference
    temp_arena *temp_arena = temp_arena_begin(arena);
    btree_node *root = bt->root;

    for(btree_node *bt = PushStruct(arena); root->next; next = root;)
    {

    }

    temp_arena_end(temp);
#endif
}
197
198// - load the entire db into memory.
199#if 0
200internal void
201btree_load(mem_arena *arena)
202{
203
204
205}
206#endif
207
208#endif /* BTREE_IMPLEMENTATION */
209#endif /* BTREE_H */
diff --git a/source/tb_db/csv_decoder.h b/source/tb_db/csv_decoder.h
new file mode 100644
index 0000000..3d09dc6
--- /dev/null
+++ b/source/tb_db/csv_decoder.h
@@ -0,0 +1,294 @@
1#ifndef ENGINE_LEXER_H
2#define ENGINE_LEXER_H
3
enum csv_token_flags
{
    // set on the token that closes a row (the field terminated by '\n');
    // parse_csv uses it to advance row_index and reset the column
    FL = 1 << 2,
};

enum csv_token_type
{
    // first 255 tokens for ascii characters
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    TOKEN_VALUE,
};

// One lexed CSV cell. `lexeme` aliases the caller's file buffer (no
// copy); tokens form a singly linked list terminated by nil_csv_token.
typedef struct csv_token csv_token;
struct csv_token
{
    string8 lexeme;
    csv_token *next_token;
    enum csv_token_type type;
    enum csv_token_flags flags;
};
25
// NOTE(nasr): i dont think im going to use this.
typedef struct csv_row csv_row;
struct csv_row
{
    // array of `count` fields; each string8 points into the mmap buffer
    string8 *fields;
    s32 count;
};

#if 0
typedef struct csv_lntity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif

// One column name from the first CSV row; headers form a singly linked
// list terminated by the nil_csv_header sentinel.
typedef struct csv_header csv_header;
struct csv_header
{
    string8 payload;
    csv_header *next_header;
};

// Bookkeeping shared between tokenizer and parser: the header list, row
// and column counts, and a flag telling the tokenizer it is still
// scanning the first (header) row.
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names
    // all data rows
    csv_header *header;
    s32 row_count;
    s32 header_count;
    b32 finding_headers; // NOTE(review): callers must set this to TRUE themselves — PushStructZero leaves it false, so header counting never runs; confirm intended
};


// Token stream head plus tail; the tail makes appends O(1).
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
    csv_token *start_token;
    csv_token *end_token;
};
68
//- nil sentinels: read-only terminator objects so list walks can stop on
//- identity instead of chasing NULL. The next pointers self-reference so
//- an accidental walk past the end stays parked on the sentinel.
//- NOTE(review): these live in read_only storage — writing through a
//- sentinel (e.g. appending to an uninitialized list) will fault.
read_only global_variable
csv_token nil_csv_token=
{
    .lexeme = {.data = NULL, .size = 0},
    .type = 0,
    .flags = 0,
    .next_token = &nil_csv_token,
};

read_only global_variable
csv_header nil_csv_header =
{
    .payload = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};

read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token = &nil_csv_token,
};

read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count = 0,
};

read_only global_variable
csv_table nil_csv_table =
{
    .header = &nil_csv_header,
    .row_count = 0,
};
106#endif /* ENGINE_LEXER_H */
107
108internal b32
109is_nil_csv_token(csv_token *token)
110{
111 return ((token == NULL) || (token == &nil_csv_token));
112}
113
114// TODO(nasr): segfaulting because end_token not allocated
115internal void
116csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
117{
118 source_token_list->end_token->next_token = source_token;
119 source_token_list->end_token = source_token;
120}
121
122//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother
123internal void
124csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
125{
126 if(is_nil_csv_token(source->start_token)) return;
127
128 csv_token *source_ct = source->start_token;
129 csv_token *destination_et = destination->end_token;
130
131 // walk source and stitch each node onto destination's tail
132 for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
133 {
134 destination_et->next_token = source_ct;
135 destination_et = source_ct;
136 }
137
138 // destination_et now points at the last real source node (not the nil sentinel)
139 destination->end_token = destination_et;
140}
141
142#if 0
143internal csv_token_list *
144parse_csv_row(string8 row_buffer)
145{
146 // csv_token_list *
147
148}
149#endif
150
151
152// the lexer acts as a table builder from a csv file
153// and parsing indivudal rows and columns
154// the next step would be building a the b-tree
155internal csv_token *
156tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
157{
158 unused(token_list);
159
160 if(buffer.size == 0) return NULL;
161
162 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
163 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
164 // forgot what the solution was
165 // TODO(nasr): check what the problem here was
166
167 // string size tracking across the loop not inside it
168 s32 start = 0;
169
170 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
171 {
172 u8 point = buffer.data[index];
173
174#if 0
175 if(is_whitespace(point))
176 {
177 warn("csv file is invalid, detected whitespace");
178 return NULL;
179 }
180#endif
181
182 if(point == ',')
183 {
184 // emit a token for the field that ended before this comma
185 csv_token *token = PushStructZero(arena, csv_token);
186
187 assert_msg(token != NULL, "did the push struct fail??");
188 assert_msg(arena->current_position < arena->capacity, "no more arena size");
189
190 token->lexeme = StringCast(&buffer.data[start], index - start);
191 token->type = TOKEN_VALUE;
192 token->next_token = &nil_csv_token;
193 csv_token_list_append_token(token_list, token);
194
195 start = index + 1;
196
197 if(table->finding_headers)
198 {
199 table->header_count++;
200 }
201 }
202 else if(point == '\n')
203 {
204 // emit a token for the field that ended at this newline
205 csv_token *token = PushStructZero(arena, csv_token);
206 token->lexeme = StringCast(&buffer.data[start], index - start);
207 token->type = TOKEN_VALUE;
208 token->flags |= FL;
209 token->next_token = &nil_csv_token;
210
211 assert_msg(token_list, "token list invalid");
212 assert_msg(token, "you're tring to append an invalid token");
213
214 csv_token_list_append_token(token_list, token);
215
216 start = index + 1;
217
218 if(table->finding_headers)
219 {
220 {
221 //- map new header token list to table headers
222 }
223 table->finding_headers = FALSE;
224 }
225
226 table->row_count++;
227 }
228 }
229
230 // NOTE(nasr): return the first token the caller can walk the list from token_list
231 return token_list->start_token;
232}
233
234//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
235internal btree *
236parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
237{
238 btree *tree = PushStructZero(arena, btree);
239
240 s32 col_index = 0;
241 s32 row_index = 0;
242
243 // iterate over the token list while the token is not nil
244 for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
245 {
246 {
247 //- are we parsing the first line tokens?
248 //- if so, do something :))
249 if(ct->flags & FL)
250 {
251 // NOTE(nasr): FL marks end-of-line; advance row, reset col
252 row_index++;
253 col_index = 0;
254
255 // TODO(nasr): replace with nil header check function
256 // NOTE(nasr): == nil means header hasn't been set yet
257 if(table->header == &nil_csv_header || table->header == NULL)
258 {
259#if 0
260 // - no this should happen in the tokenization
261 table->headers->next =
262#endif
263 }
264 else
265 {
266
267 }
268
269 // FL tokens are structural, no value to index
270 continue;
271 }
272 }
273
274 // skip non-value tokens, only index actual cell values
275 if (ct->type != TOKEN_VALUE)
276 {
277 col_index++;
278 continue;
279 }
280
281 // NOTE(nasr): payload is the cten itself so the caller can reach
282 // row/col metadata without us having to copy it
283 key k = {
284 .header_index = col_index,
285 .row_index = row_index,
286 };
287
288 btree_insert(arena, tree, k, (void *)ct);
289
290 col_index++;
291 }
292
293 return tree;
294}
diff --git a/source/tb_db/tb_db.c b/source/tb_db/tb_db.c
new file mode 100644
index 0000000..4ae247d
--- /dev/null
+++ b/source/tb_db/tb_db.c
@@ -0,0 +1,198 @@
1#define BTREE_IMPLEMENTATION
2#define BASE_UNITY
3#include "../base/base_include.h"
4
5internal b32
6is_alpha(u8 point)
7{
8 return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_'));
9}
10
11internal b32
12is_digit(u8 point)
13{
14 return (point >= '0' && point <= '9');
15}
16
17internal b32
18is_alpha_num(u8 point)
19{
20 return (is_alpha(point) || is_digit(point));
21}
22
23internal b32
24is_whitespace(u8 point)
25{
26 return (point == '\n' || point == '\r' || point == ' ' || point == '\t');
27}
28
29internal b32
30is_delimiter(u8 point)
31{
32 return (point == ',');
33}
34
35#include "btree_impl.h"
36#include "csv_decoder.h"
37
// One whitespace-delimited word of a repl query line; tokens form a
// singly linked list terminated by the nil_query_token sentinel.
typedef struct query_token query_token;
struct query_token
{
    string8 lexeme;
    query_token *next;
};

// Token stream head, plus the tail (`current_token`) used for appends.
typedef struct query_token_list query_token_list;
struct query_token_list
{
    query_token *start_token;
    query_token *current_token;
};

// read-only terminator: next self-references so a walk past the end
// stays parked on the sentinel instead of dereferencing NULL
read_only global_variable
query_token nil_query_token =
{
    .lexeme = {.data = NULL, .size = 0},
    .next = &nil_query_token
};


read_only global_variable
query_token_list nil_query_token_list =
{
    .start_token = &nil_query_token,
    .current_token = &nil_query_token,
};
66
67internal b32
68is_nil_query_token(query_token *token)
69{
70 return (token == &nil_query_token) || (token == NULL);
71}
72
73internal b32
74is_nil_query_token_list(query_token *token)
75{
76 return (token == &nil_query_token) || (token == NULL);
77}
78
79// takes on line of the repl input
80// return a reference to the passed list
81internal query_token_list *
82query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list)
83{
84 b32 initialized = 0;
85 unused(initialized);
86
87 for (u64 index = 0; index < buffer->size; ++index)
88 {
89 u8 codepoint = buffer->data[index];
90
91 if(codepoint == '\n' || codepoint == '\r') break;
92
93 s32 start = 0;
94 s32 end = 0;
95
96 if(is_whitespace(codepoint)) end = index;
97
98 // save the token
99 // TODO(nasr): work on the string macros cuz no work
100 {
101 query_token *new_token = PushStruct(arena, query_token);
102
103 //- initialize list
104 {
105 if(is_nil_query_token(list->start_token))
106 {
107 list->start_token = new_token;
108 list->current_token = new_token;
109 }
110 else
111 {
112 //- all we need to do - we dont track parents or what ever. this is a token stream not a tree
113 list->current_token->next = new_token;
114 }
115 }
116
117 s32 new_token_size = end - start;
118
119 new_token->lexeme = PushString(arena, new_token_size);
120 new_token->lexeme.data = &buffer->data[index];
121 new_token->lexeme.size = new_token_size;
122
123 list->current_token->next = new_token;
124
125 start = index + 1;
126 }
127 }
128
129 return list;
130}
131
132int main(int count, char **value)
133{
134#if 1
135 unused(nil_query_token_list);
136#endif
137
138 if(count < 2) value[1] = "./test/data.csv";
139
140 local_persist b32 running = 1;
141
142 mem_arena *global_arena = arena_create(GiB(1));
143
144 string8 buffer = load_file(global_arena, value[1]);
145
146 // NOTE(nasr): the use of tables is required for tracking headers etc.
147 // i think we can optimize this away in the future but for now its fine
148 csv_table *table = PushStructZero(global_arena, csv_table);
149 csv_token_list *token_list = PushStructZero(global_arena, csv_token_list);
150
151 token_list->start_token = &nil_csv_token;
152 token_list->end_token = &nil_csv_token;
153
154 csv_token *tokens = tokenize_csv(buffer, global_arena, table, token_list);
155 assert_msg(tokens != NULL, "tokens are null");
156
157 // NOTE(nasr): token_list is now populated — pass it directly, not a fresh empty list
158 btree *bt = parse_csv(global_arena, token_list, table);
159
160 print("\nDatabase Engine\n");
161
162 for(;;)
163 {
164 if(running)
165 {
166 u8 *lbuf = PushArray(global_arena, u8, 256);
167 s32 err = os_read(STDIN_FD, lbuf, 256);
168
169 if(err < 0)
170 {
171 print("error reading from stdin");
172 }
173
174 // TODO(nasr): extract this later in the future and make a string copy function/macro
175 // @params (s32 lbuf_size, string8 lbuf_stringified)
176 // NOTE(nasr): use err (bytes read) not sizeof(lbuf*) — sizeof a pointer is always 8
177 s32 lbuf_size = err;
178 string8 lbuf_stringified = PushString(global_arena, lbuf_size);
179 {
180 memcpy(lbuf_stringified.data, lbuf, lbuf_size);
181 lbuf_stringified.size = lbuf_size;
182 }
183
184 query_token_list *qtl = PushStructZero(global_arena, query_token_list);
185
186
187 query_tokenizer(global_arena, &lbuf_stringified, qtl);
188
189 // TODO(nasr): dispatch qtl against bt here
190
191 unused(bt);
192
193 sleep(1);
194 }
195 }
196
197 return 0;
198}