summaryrefslogtreecommitdiff
path: root/source/csv_decoder.h
diff options
context:
space:
mode:
authornasr <nsrddyn@gmail.com>2026-03-16 19:20:23 +0000
committernasr <nsrddyn@gmail.com>2026-03-16 19:20:23 +0000
commit180ccc84aac07c7bee2b09a6e07f7406908409b9 (patch)
treeefa39665e41c3132626f2c08b2f3ae0d18adc17a /source/csv_decoder.h
parent2e258673171c2e4663a8b5d58e2ad174bb0ecd96 (diff)
feature(main): lots of stuff see description
1. increased compile-time warnings to help with some optimizations. 2. implemented csv lexing helper functions that operate on token lists, such as appending tokens and concatenating lists with each other 3. realized that the btree design is faulty, so disabled it and will refactor it in the next approach
Diffstat (limited to 'source/csv_decoder.h')
-rw-r--r--source/csv_decoder.h289
1 files changed, 289 insertions, 0 deletions
diff --git a/source/csv_decoder.h b/source/csv_decoder.h
new file mode 100644
index 0000000..b754ef5
--- /dev/null
+++ b/source/csv_decoder.h
@@ -0,0 +1,289 @@
#ifndef ENGINE_LEXER_H
#define ENGINE_LEXER_H
// NOTE(review): guard name ENGINE_LEXER_H does not match the file name
// (csv_decoder.h) — confirm no other header reuses this guard.

// Bit flags attached to individual tokens.
enum csv_token_flags
{
    // FL marks a line-terminating token (set on '\n' in tokenize_csv below).
    // Why bit 2 rather than bit 0 is not clear from this file —
    // TODO(review): confirm whether the lower bits are reserved.
    FL = 1 << 2,
};
8
// Token categories. Values below the named range are reserved so a raw
// byte can double as its own token type.
enum csv_token_type
{
    // first 255 tokens for ascii characters
    // NOTE(review): ASCII only occupies 0-127, and starting the named
    // range at 255 still collides with byte value 255 — confirm whether
    // 256 was intended.
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    TOKEN_VALUE,
};
16
// One lexed CSV field, linked into a singly linked token list terminated
// by &nil_csv_token.
typedef struct csv_token csv_token;
struct csv_token
{
    string8 lexeme;               // field text; points into the source buffer (see tokenize_csv), not a copy
    csv_token *next_token;        // next token in the list; &nil_csv_token terminates
    enum csv_token_type type;
    enum csv_token_flags flags;   // FL set when this token closes a row
};
25
// NOTE(nasr): i dont think im going to use this.
// A single parsed row as an array of field slices.
typedef struct csv_row csv_row;
struct csv_row
{
    // array of size col_count, points into mmap buffer
    // NOTE(review): the comment says "col_count" but the member is named
    // `count` — presumably the same thing; confirm.
    string8 *fields;
    s32 count;
};
34
// Disabled: entity abstraction judged unnecessary while headers are
// mapped directly by key.
#if 0
// NOTE(review): the tag "csv_lntity" looks like a typo for csv_entity.
typedef struct csv_lntity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif
42
// One column name, linked into the table's singly linked header list
// terminated by &nil_csv_header.
typedef struct csv_header csv_header;
struct csv_header
{
    string8 payload;           // header (column-name) text
    csv_header *next_header;   // &nil_csv_header terminates the list
};
49
// Aggregate metadata for one CSV file: the header list plus row/column
// counts, both filled in by tokenize_csv.
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names
    // all data rows
    csv_header *header;   // linked list of column names
    s32 row_count;        // number of '\n'-terminated rows seen
    s32 header_count;     // number of fields on the first line
};
59
60
61typedef struct csv_token_list csv_token_list;
62struct csv_token_list
63{
64 csv_token *start_token;
65 csv_token *end_token;
66
67};
68
// Shared read-only nil sentinel for csv_token. Lists terminate with a
// pointer to this object instead of NULL so traversal code can always
// dereference next_token.
read_only global_variable
csv_token nil_csv_token =
{
    .lexeme = {.data = NULL, .size = 0},
    .type = 0,   // NOTE(review): 0 falls in the raw-byte range, not TOKEN_UNDEFINED — confirm intended
    .flags = 0,
    .next_token = &nil_csv_token,   // self-referential: following next_token from nil stays at nil

};
78
// Read-only nil sentinel for csv_header; self-referential like
// nil_csv_token so header traversal never dereferences NULL.
read_only global_variable
csv_header nil_csv_header =
{
    .payload = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};
85
// Nil token list: both ends point at the token nil sentinel.
// NOTE(review): a list initialized from this value has end_token ==
// &nil_csv_token, which is read_only — append code must special-case the
// empty list rather than write through end_token->next_token.
read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token = &nil_csv_token,
};
92
93
// Nil row sentinel.
// NOTE(review): relies on a global nil_string (string8) declared
// elsewhere in the project — confirm it exists and is read_only.
read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count = 0,
};
100
// Nil table sentinel. header_count is not listed explicitly, but
// designated initialization zero-fills the remaining members.
read_only global_variable
csv_table nil_csv_table =
{
    .header = &nil_csv_header,
    .row_count = 0,
};
107
#endif /* ENGINE_LEXER_H */
// NOTE(review): the include guard closes HERE, but function definitions
// follow below — including this header twice would redefine them. The
// #endif probably belongs at the very end of the file. Assumes `internal`
// expands to `static`, which is what makes header-defined functions
// tolerable at all — confirm.
110internal b32
111is_nil_csv_token(csv_token *token)
112{
113 return ((token == NULL) || (token == &nil_csv_token));
114}
115
116internal void
117csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
118{
119 source_token_list->end_token->next_token = source_token;
120 source_token_list->end_token = source_token;
121
122}
123
124//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother
125internal void
126csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
127{
128
129 csv_token *source_ct = source->start_token;
130 csv_token *destination_end_ct = destination->end_token;
131
132 for(;!is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
133 {
134 destination_end_ct->next_token = source_ct;
135 }
136
137 destination->end_token = source_ct;
138}
139
// Disabled stub: per-row parsing was never implemented.
#if 0
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
    // csv_token_list *

}
#endif
148
149
150// the lexer acts as a table builder from a csv file
151// and parsing indivudal rows and columns
152// the next step would be building a the b-tree
153internal csv_token *
154tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
155{
156
157 unused(token_list);
158 b32 finding_headers = TRUE;
159
160 if(buffer.size < 0) return NULL;
161
162 csv_token *tok = PushStruct(arena, csv_token);
163
164 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
165 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
166 // forgot what the solution was
167 // TODO(nasr): check what the problem here was
168 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
169 {
170 u8 point = buffer.data[index];
171
172 s32 start = 0;
173 s32 end = 0;
174
175 if(is_whitespace(point))
176 {
177 warn("csv file is invalid, detected whitespace");
178 return NULL;
179 }
180
181
182 if(point == '\n')
183 {
184 if(finding_headers)
185 {
186#if 0
187 string8 headers_buffer = {.data = &buffer.data[start], .size = end - start};
188#endif
189 finding_headers = FALSE;
190
191 {
192 //- map new header token list to table headers
193 }
194 }
195#if 0
196 else
197 {
198
199 }
200#endif
201
202
203 table->row_count++;
204 }
205 else if(point == ',')
206 {
207 if (finding_headers)
208 {
209 table->header_count++;
210 }
211 }
212
213 switch(point)
214 {
215 case('\n'):
216 {
217 tok->flags |= FL;
218 break;
219 }
220
221 case(','):
222 {
223 end = index - 1;
224 start = index + 1;
225 break;
226 }
227 default:
228 {
229 break;
230 }
231 }
232
233 tok->lexeme = StringCast(&buffer.data[start], end - start);
234 tok->next_token = tok;
235 }
236
237 return tok;
238}
239
240//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
241internal b_tree *
242parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
243{
244 b_tree *tree = PushStructZero(arena, b_tree);
245 b_tree_create(arena, tree);
246
247 // iterate over the token list while the token is not nil
248 for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
249 {
250
251 //- TODO(nasr): check initizalization or something tomorrow
252 {
253 //- are we parsing the first line tokens?
254 //- if so, do something :))
255 if(ct->flags & FL)
256 {
257 // TODO(nasr): replace with nil header check function
258 if(table->header != &nil_csv_header || table->header == NULL)
259 {
260#if 0
261 // - no this should happen in the tokenization
262 table->headers->next =
263#endif
264 }
265 else
266 {
267
268 }
269
270 }
271 }
272
273 // TODO(nasr): fix this logic tomorrow
274 csv_token *ct = PushStruct(arena, csv_token);
275 // skip structural ctens, only index values
276 if (ct->type != TOKEN_VALUE)
277 {
278 continue;
279 }
280
281 // NOTE(nasr): payload is the cten itself so the caller can reach
282 // row/col metadata without us having to copy it
283 // NOTE(nasr): heh why do we void cast again?
284 b_tree_insert(tree, ct->lexeme, (void *)ct);
285 }
286
287 return tree;
288}
289