From a9cb228861a6b0fad4d508c05c0614757a7f0a34 Mon Sep 17 00:00:00 2001
From: nasr
Date: Mon, 13 Apr 2026 14:58:49 +0200
Subject: refactor(main): refactor directory structure
---
 source/tb_db/csv_decoder.h | 294 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 source/tb_db/csv_decoder.h

(limited to 'source/tb_db/csv_decoder.h')

diff --git a/source/tb_db/csv_decoder.h b/source/tb_db/csv_decoder.h
new file mode 100644
index 0000000..3d09dc6
--- /dev/null
+++ b/source/tb_db/csv_decoder.h
@@ -0,0 +1,294 @@
+#ifndef ENGINE_LEXER_H // NOTE(review): guard name is stale after the move to csv_decoder.h; consider TB_DB_CSV_DECODER_H
+#define ENGINE_LEXER_H
+// Flags carried by a csv_token alongside its type.
+enum csv_token_flags
+{
+  FL = 1 << 2, // set on the token whose field was terminated by '\n' (end of row)
+};
+// Token codes start above 255 so a raw byte value can double as its own token code.
+enum csv_token_type
+{
+  // values 0..254 are reserved for single byte/ascii tokens
+  TOKEN_UNDEFINED = 255,
+  TOKEN_IDENTIFIER,
+  TOKEN_VALUE, // a CSV cell; the only type tokenize_csv currently emits
+};
+// One lexed CSV field; the lexeme is a slice into the input buffer (no copy made).
+typedef struct csv_token csv_token;
+struct csv_token
+{
+  string8 lexeme; // points into the caller's buffer, not NUL-terminated
+  csv_token *next_token; // singly linked, terminated by the &nil_csv_token sentinel
+  enum csv_token_type type;
+  enum csv_token_flags flags;
+};
+
+// NOTE(nasr): i dont think im going to use this.
+typedef struct csv_row csv_row;
+struct csv_row
+{
+  // array of size col_count, points into mmap buffer
+  string8 *fields;
+  s32 count;
+};
+
+#if 0
+typedef struct csv_lntity csv_entity; // NOTE(review): tag typo, presumably meant csv_entity (dead code, harmless)
+struct csv_entity
+{
+  //- not needed because we use key header mapping i think
+};
+#endif
+
+// One column-name cell; headers form a singly linked list terminated by &nil_csv_header.
+typedef struct csv_header csv_header;
+struct csv_header
+{
+  string8 payload;
+  csv_header *next_header;
+};
+
+// Bookkeeping built while tokenizing: column names from the first row plus counts.
+typedef struct csv_table csv_table;
+struct csv_table
+{
+  // header: list of column names (first row); rows themselves are only counted
+  csv_header *header;
+  s32 row_count;
+  s32 header_count;
+  b32 finding_headers; // TRUE while the first (header) row is still being lexed
+};
+
+// Token list with O(1) append via a tail pointer; empty list uses the nil sentinels.
+typedef struct csv_token_list csv_token_list;
+struct csv_token_list
+{
+  csv_token *start_token;
+  csv_token *end_token;
+};
+
+// Self-referential nil sentinels: traversal can always dereference next without NULL checks.
+read_only global_variable
+csv_token nil_csv_token=
+{
+  .lexeme = {.data = NULL, .size = 0},
+  .type = 0, // NOTE(review): 0 is a raw-byte token code here, not TOKEN_UNDEFINED (255)
+  .flags = 0,
+  .next_token = &nil_csv_token,
+};
+
+read_only global_variable
+csv_header nil_csv_header =
+{
+  .payload = {.data = NULL, .size = 0},
+  .next_header = &nil_csv_header,
+};
+
+read_only global_variable
+csv_token_list nil_csv_token_list =
+{
+  .start_token = &nil_csv_token,
+  .end_token = &nil_csv_token,
+};
+
+read_only global_variable
+csv_row nil_csv_row =
+{
+  .fields = &nil_string,
+  .count = 0,
+};
+
+read_only global_variable
+csv_table nil_csv_table =
+{
+  .header = &nil_csv_header,
+  .row_count = 0,
+};
+
+#endif /* ENGINE_LEXER_H */ // BUG(review): guard closes here, so every function below is OUTSIDE the include guard and will be redefined on multiple inclusion
+
+// Returns TRUE when token is NULL or the shared nil sentinel (list terminator).
+internal b32
+is_nil_csv_token(csv_token *token)
+{
+  return ((token == NULL) || (token == &nil_csv_token));
+}
+
+// Appends source_token at the tail of source_token_list.
+// TODO(nasr): segfaulting because end_token not allocated
+// NOTE(review): if end_token still points at the read_only nil sentinel, the first
+// store below writes into read-only data -- a list needs a real writable tail first.
+internal void
+csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
+{
+  source_token_list->end_token->next_token = source_token;
+  source_token_list->end_token = source_token;
+}
+
+//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to each other
+// NOTE(review): the per-node walk is O(n); linking destination->end_token->next_token to
+// source->start_token and copying source->end_token would be O(1). source is left untouched,
+// so both lists alias the same nodes afterwards -- TODO confirm that is intended.
+internal void
+csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
+{
+  if(is_nil_csv_token(source->start_token)) return;
+
+  csv_token *source_ct = source->start_token;
+  csv_token *destination_et = destination->end_token;
+
+  // walk source and stitch each node onto destination's tail
+  for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
+  {
+    destination_et->next_token = source_ct;
+    destination_et = source_ct;
+  }
+
+  // destination_et now points at the last real source node (not the nil sentinel)
+  destination->end_token = destination_et;
+}
+
+#if 0
+internal csv_token_list *
+parse_csv_row(string8 row_buffer)
+{
+  // csv_token_list *
+
+}
+#endif
+
+// The lexer acts as a table builder from a csv file, parsing individual rows and
+// columns; the next step would be building the b-tree (see parse_csv below).
+// Emits one TOKEN_VALUE token per cell into token_list; the token that ends a row
+// also carries the FL flag. Returns the first token, or NULL on empty input.
+internal csv_token *
+tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
+{
+  unused(token_list); // NOTE(review): stale -- token_list IS used below; this marker can go
+
+  if(buffer.size == 0) return NULL;
+
+  // URGENT(nasr): segfaulting because memcpy of string value doesnt work dammit
+  // NOPE ITS BECAUSE WE DONT LOAD CSV OR SOMETHING???
+  // forgot what the solution was
+  // TODO(nasr): check what the problem here was
+
+  // start index of the current field; tracked across loop iterations
+  s32 start = 0;
+
+  // BUG(review): terminates on '\0' instead of bounding by buffer.size, so the buffer
+  // must be NUL-terminated; a final field with no trailing '\n' is silently dropped.
+  for(s32 index = 0; buffer.data[index] != '\0'; ++index)
+  {
+    u8 point = buffer.data[index];
+
+#if 0
+    if(is_whitespace(point))
+    {
+      warn("csv file is invalid, detected whitespace");
+      return NULL;
+    }
+#endif
+
+    if(point == ',')
+    {
+      // emit a token for the field that ended before this comma
+      csv_token *token = PushStructZero(arena, csv_token);
+
+      assert_msg(token != NULL, "did the push struct fail??");
+      assert_msg(arena->current_position < arena->capacity, "no more arena size");
+
+      token->lexeme = StringCast(&buffer.data[start], index - start);
+      token->type = TOKEN_VALUE;
+      token->next_token = &nil_csv_token;
+      csv_token_list_append_token(token_list, token);
+
+      start = index + 1;
+
+      if(table->finding_headers)
+      {
+        table->header_count++; // BUG(review): only counted on ',' -- the last column ends at '\n', so header_count is columns-1
+      }
+    }
+    else if(point == '\n')
+    {
+      // emit a token for the field that ended at this newline; FL marks end-of-row
+      csv_token *token = PushStructZero(arena, csv_token);
+      token->lexeme = StringCast(&buffer.data[start], index - start);
+      token->type = TOKEN_VALUE;
+      token->flags |= FL;
+      token->next_token = &nil_csv_token;
+
+      assert_msg(token_list, "token list invalid");
+      assert_msg(token, "you're tring to append an invalid token");
+
+      csv_token_list_append_token(token_list, token);
+
+      start = index + 1;
+
+      if(table->finding_headers)
+      {
+        {
+          //- map new header token list to table headers (TODO(review): not implemented)
+        }
+        table->finding_headers = FALSE;
+      }
+
+      table->row_count++;
+    }
+  }
+
+  // NOTE(nasr): return the first token; the caller can walk the list from token_list
+  return token_list->start_token;
+}
+
+//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
+// Walks the token list and inserts every cell token into a fresh btree keyed by
+// (header_index = column, row_index = row). Returns the tree.
+internal btree *
+parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
+{
+  btree *tree = PushStructZero(arena, btree);
+
+  s32 col_index = 0;
+  s32 row_index = 0;
+
+  // iterate over the token list while the token is not nil
+  for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
+  {
+    {
+      //- are we parsing the first line tokens?
+      //- if so, do something :))
+      if(ct->flags & FL)
+      {
+        // NOTE(nasr): FL marks end-of-line; advance row, reset col
+        row_index++;
+        col_index = 0;
+
+        // TODO(nasr): replace with nil header check function
+        // NOTE(nasr): == nil means header hasn't been set yet
+        if(table->header == &nil_csv_header || table->header == NULL)
+        {
+#if 0
+          // - no this should happen in the tokenization
+          table->headers->next =
+#endif
+        }
+        else
+        {
+
+        }
+
+        // BUG(review): tokenize_csv emits the row's LAST field as a TOKEN_VALUE with FL
+        // set, so this continue drops the final column of every row from the tree
+        // instead of indexing its value.
+        continue;
+      }
+    }
+
+    // skip non-value tokens, only index actual cell values
+    if (ct->type != TOKEN_VALUE)
+    {
+      col_index++;
+      continue;
+    }
+
+    // NOTE(nasr): payload is the token itself (presumably "cten" meant token -- verify)
+    // so the caller can reach row/col metadata without us having to copy it
+    key k = {
+      .header_index = col_index,
+      .row_index = row_index,
+    };
+
+    btree_insert(arena, tree, k, (void *)ct);
+
+    col_index++;
+  }
+
+  return tree;
+}
-- 
cgit v1.3