summaryrefslogtreecommitdiff
path: root/source/tb_db/csv_decoder.h
diff options
context:
space:
mode:
Diffstat (limited to 'source/tb_db/csv_decoder.h')
-rw-r--r--  source/tb_db/csv_decoder.h  |  294
1 files changed, 294 insertions, 0 deletions
diff --git a/source/tb_db/csv_decoder.h b/source/tb_db/csv_decoder.h
new file mode 100644
index 0000000..3d09dc6
--- /dev/null
+++ b/source/tb_db/csv_decoder.h
@@ -0,0 +1,294 @@
1#ifndef ENGINE_LEXER_H
2#define ENGINE_LEXER_H
3
enum csv_token_flags
{
    // set by the tokenizer on a field token whose field was terminated by '\n',
    // i.e. the last field of a row (presumably "Field Last" / line-end marker
    // — TODO confirm intended name)
    FL = 1 << 2,
};
8
enum csv_token_type
{
    // first 255 tokens for ascii characters
    // NOTE(review): byte values span 0..255, so TOKEN_UNDEFINED = 255 overlaps
    // the last raw byte value — confirm whether 256 was intended
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    TOKEN_VALUE,    // a csv field; the only type the tokenizer currently emits
};
16
// one csv field, stored as a singly linked list node; the lexeme points into
// the source buffer (no copy), so the buffer must outlive the tokens
typedef struct csv_token csv_token;
struct csv_token
{
    string8 lexeme;              // field text, a view into the csv buffer
    csv_token *next_token;       // next field; &nil_csv_token terminates the list
    enum csv_token_type type;
    enum csv_token_flags flags;  // FL set on the last field of a row
};
25
// NOTE(nasr): i dont think im going to use this.
typedef struct csv_row csv_row;
struct csv_row
{
    // array of size col_count, points into mmap buffer
    string8 *fields;
    s32 count;  // number of entries in fields
};
34
// NOTE(review): disabled draft, kept for reference; tag name "csv_lntity"
// looks like a typo for "csv_entity" if this is ever re-enabled
#if 0
typedef struct csv_lntity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif
42
// one column name from the first csv row, as a singly linked list node;
// &nil_csv_header terminates the list
typedef struct csv_header csv_header;
struct csv_header
{
    string8 payload;          // header text, a view into the csv buffer
    csv_header *next_header;
};
49
// bookkeeping for one csv file: header list plus counters filled in while
// tokenizing (see tokenize_csv)
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names
    // all data rows
    csv_header *header;    // linked list of column names; &nil_csv_header if unset
    s32 row_count;         // incremented once per '\n' during tokenization
    s32 header_count;      // number of fields seen while finding_headers is set
    b32 finding_headers;   // TRUE while the first (header) row is being tokenized
};
60
61
// head/tail handles for a token list; both point at &nil_csv_token when empty
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
    csv_token *start_token;
    csv_token *end_token;
};
68
// sentinel terminator for token lists; next_token points at itself so walking
// past the end is safe.
// NOTE(review): this lives in read_only storage — any code that writes through
// a pointer still aimed at this sentinel (e.g. an append through an empty
// list's end_token) faults; see the TODO at csv_token_list_append_token
read_only global_variable
csv_token nil_csv_token=
{
    .lexeme = {.data = NULL, .size = 0},
    .type = 0,
    .flags = 0,
    .next_token = &nil_csv_token,
};
77
// sentinel terminator for header lists; self-referencing next_header makes
// over-walking safe, same pattern as nil_csv_token
read_only global_variable
csv_header nil_csv_header =
{
    .payload = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};
84
// canonical empty token list: both ends aimed at the nil token sentinel
read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token = &nil_csv_token,
};
91
// canonical empty row
// NOTE(review): assumes nil_string is a string8 declared elsewhere in the
// project — not visible in this file, confirm
read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count = 0,
};
98
// canonical empty table; unlisted fields (header_count, finding_headers)
// zero-initialize per C designated-initializer rules
read_only global_variable
csv_table nil_csv_table =
{
    .header = &nil_csv_header,
    .row_count = 0,
};
105
// FIXME(review): this #endif closes the include guard BEFORE the function
// definitions below, so they are unguarded against multiple inclusion; the
// #endif should move to the end of the file. Also the guard name
// ENGINE_LEXER_H does not match this file (csv_decoder.h).
#endif /* ENGINE_LEXER_H */

108internal b32
109is_nil_csv_token(csv_token *token)
110{
111 return ((token == NULL) || (token == &nil_csv_token));
112}
113
114// TODO(nasr): segfaulting because end_token not allocated
115internal void
116csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
117{
118 source_token_list->end_token->next_token = source_token;
119 source_token_list->end_token = source_token;
120}
121
122//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother
123internal void
124csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
125{
126 if(is_nil_csv_token(source->start_token)) return;
127
128 csv_token *source_ct = source->start_token;
129 csv_token *destination_et = destination->end_token;
130
131 // walk source and stitch each node onto destination's tail
132 for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
133 {
134 destination_et->next_token = source_ct;
135 destination_et = source_ct;
136 }
137
138 // destination_et now points at the last real source node (not the nil sentinel)
139 destination->end_token = destination_et;
140}
141
// NOTE(review): disabled stub for per-row parsing, never implemented;
// row handling currently lives in tokenize_csv/parse_csv instead
#if 0
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
    // csv_token_list *

}
#endif
150
151
152// the lexer acts as a table builder from a csv file
153// and parsing indivudal rows and columns
154// the next step would be building a the b-tree
155internal csv_token *
156tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
157{
158 unused(token_list);
159
160 if(buffer.size == 0) return NULL;
161
162 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
163 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
164 // forgot what the solution was
165 // TODO(nasr): check what the problem here was
166
167 // string size tracking across the loop not inside it
168 s32 start = 0;
169
170 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
171 {
172 u8 point = buffer.data[index];
173
174#if 0
175 if(is_whitespace(point))
176 {
177 warn("csv file is invalid, detected whitespace");
178 return NULL;
179 }
180#endif
181
182 if(point == ',')
183 {
184 // emit a token for the field that ended before this comma
185 csv_token *token = PushStructZero(arena, csv_token);
186
187 assert_msg(token != NULL, "did the push struct fail??");
188 assert_msg(arena->current_position < arena->capacity, "no more arena size");
189
190 token->lexeme = StringCast(&buffer.data[start], index - start);
191 token->type = TOKEN_VALUE;
192 token->next_token = &nil_csv_token;
193 csv_token_list_append_token(token_list, token);
194
195 start = index + 1;
196
197 if(table->finding_headers)
198 {
199 table->header_count++;
200 }
201 }
202 else if(point == '\n')
203 {
204 // emit a token for the field that ended at this newline
205 csv_token *token = PushStructZero(arena, csv_token);
206 token->lexeme = StringCast(&buffer.data[start], index - start);
207 token->type = TOKEN_VALUE;
208 token->flags |= FL;
209 token->next_token = &nil_csv_token;
210
211 assert_msg(token_list, "token list invalid");
212 assert_msg(token, "you're tring to append an invalid token");
213
214 csv_token_list_append_token(token_list, token);
215
216 start = index + 1;
217
218 if(table->finding_headers)
219 {
220 {
221 //- map new header token list to table headers
222 }
223 table->finding_headers = FALSE;
224 }
225
226 table->row_count++;
227 }
228 }
229
230 // NOTE(nasr): return the first token the caller can walk the list from token_list
231 return token_list->start_token;
232}
233
234//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
235internal btree *
236parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
237{
238 btree *tree = PushStructZero(arena, btree);
239
240 s32 col_index = 0;
241 s32 row_index = 0;
242
243 // iterate over the token list while the token is not nil
244 for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
245 {
246 {
247 //- are we parsing the first line tokens?
248 //- if so, do something :))
249 if(ct->flags & FL)
250 {
251 // NOTE(nasr): FL marks end-of-line; advance row, reset col
252 row_index++;
253 col_index = 0;
254
255 // TODO(nasr): replace with nil header check function
256 // NOTE(nasr): == nil means header hasn't been set yet
257 if(table->header == &nil_csv_header || table->header == NULL)
258 {
259#if 0
260 // - no this should happen in the tokenization
261 table->headers->next =
262#endif
263 }
264 else
265 {
266
267 }
268
269 // FL tokens are structural, no value to index
270 continue;
271 }
272 }
273
274 // skip non-value tokens, only index actual cell values
275 if (ct->type != TOKEN_VALUE)
276 {
277 col_index++;
278 continue;
279 }
280
281 // NOTE(nasr): payload is the cten itself so the caller can reach
282 // row/col metadata without us having to copy it
283 key k = {
284 .header_index = col_index,
285 .row_index = row_index,
286 };
287
288 btree_insert(arena, tree, k, (void *)ct);
289
290 col_index++;
291 }
292
293 return tree;
294}