diff options
Diffstat (limited to 'source/csv_decoder.h')
| -rw-r--r-- | source/csv_decoder.h | 289 |
1 files changed, 289 insertions, 0 deletions
diff --git a/source/csv_decoder.h b/source/csv_decoder.h new file mode 100644 index 0000000..b754ef5 --- /dev/null +++ b/source/csv_decoder.h | |||
| @@ -0,0 +1,289 @@ | |||
| 1 | #ifndef ENGINE_LEXER_H | ||
| 2 | #define ENGINE_LEXER_H | ||
| 3 | |||
// per-token flag bits stored in csv_token.flags
enum csv_token_flags
{
    // set on the token that terminates a row (emitted at '\n' in tokenize_csv)
    // NOTE(review): name "FL" is opaque — presumably "final/line flag"; confirm
    FL = 1 << 2,
};
| 8 | |||
// token kinds produced by the csv lexer; values 0-254 are reserved so a raw
// ascii character can double as its own token type
enum csv_token_type
{
    // first 255 tokens for ascii characters
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    TOKEN_VALUE,
};
| 16 | |||
// one lexed csv field, singly linked in file order and terminated by the
// nil_csv_token sentinel
typedef struct csv_token csv_token;
struct csv_token
{
    string8 lexeme;              // slice of the source buffer — not owned, no copy
    csv_token *next_token;       // next token; &nil_csv_token at end of chain
    enum csv_token_type type;
    enum csv_token_flags flags;  // FL set when this token ends its row
};
| 25 | |||
// NOTE(nasr): i dont think im going to use this.
// one parsed row as a view over its fields (no copies)
typedef struct csv_row csv_row;
struct csv_row
{
    // array of size col_count, points into mmap buffer
    string8 *fields;
    s32 count;  // number of entries in fields
};
| 34 | |||
#if 0
// NOTE(review): struct tag fixed — the typedef previously named
// `struct csv_lntity` (typo) while the definition declared `struct csv_entity`,
// so the two would never refer to the same type if this block were enabled.
typedef struct csv_entity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif
| 42 | |||
// one column name from the csv's first row, singly linked and terminated by
// the nil_csv_header sentinel
typedef struct csv_header csv_header;
struct csv_header
{
    string8 payload;          // header text — presumably a slice of the source buffer; confirm
    csv_header *next_header;  // next column; &nil_csv_header at end
};
| 49 | |||
// summary of a lexed csv file: the column-name list plus counts gathered
// while tokenizing
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names
    // all data rows
    csv_header *header;  // linked list of column names (the csv's first row)
    s32 row_count;       // rows seen by tokenize_csv (incremented per newline)
    s32 header_count;    // columns counted while lexing the first row
};
| 59 | |||
| 60 | |||
// singly linked token list tracked by both ends so appends are O(1);
// empty lists hold the nil_csv_token sentinel at both ends
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
    csv_token *start_token;  // first token, or &nil_csv_token when empty
    csv_token *end_token;    // last token, or &nil_csv_token when empty

};
| 68 | |||
| 69 | read_only global_variable | ||
| 70 | csv_token nil_csv_token= | ||
| 71 | { | ||
| 72 | .lexeme = {.data = NULL, .size =0}, | ||
| 73 | .type = 0, | ||
| 74 | .flags = 0, | ||
| 75 | .next_token = &nil_csv_token, | ||
| 76 | |||
| 77 | }; | ||
| 78 | |||
// shared immutable sentinel terminating header chains; self-linked like
// nil_csv_token so iteration past the end is safe
read_only global_variable
csv_header nil_csv_header =
{
    .payload = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};
| 85 | |||
// canonical empty token list: both ends hold the nil token sentinel
read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token = &nil_csv_token,
};
| 92 | |||
| 93 | |||
// canonical empty row; fields points at the shared nil string so it is
// never NULL
read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count = 0,
};
| 100 | |||
| 101 | read_only global_variable | ||
| 102 | csv_table nil_csv_table = | ||
| 103 | { | ||
| 104 | .header = &nil_csv_header, | ||
| 105 | .row_count = 0, | ||
| 106 | }; | ||
| 107 | |||
| 108 | #endif /* ENGINE_LEXER_H */ | ||
| 109 | |||
// true when token is NULL or the shared nil sentinel — both mean "no token"
// NOTE(review): this definition (and everything below it) sits AFTER the
// `#endif` that closes the ENGINE_LEXER_H include guard, so it is not
// protected against double inclusion; the guard's #endif should move to the
// end of the file. Also the guard name doesn't match csv_decoder.h — confirm.
internal b32
is_nil_csv_token(csv_token *token)
{
    return ((token == NULL) || (token == &nil_csv_token));
}
| 115 | |||
| 116 | internal void | ||
| 117 | csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token) | ||
| 118 | { | ||
| 119 | source_token_list->end_token->next_token = source_token; | ||
| 120 | source_token_list->end_token = source_token; | ||
| 121 | |||
| 122 | } | ||
| 123 | |||
//- concatenate 2 token lists so we can parse individual rows and then splice them together
| 125 | internal void | ||
| 126 | csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source) | ||
| 127 | { | ||
| 128 | |||
| 129 | csv_token *source_ct = source->start_token; | ||
| 130 | csv_token *destination_end_ct = destination->end_token; | ||
| 131 | |||
| 132 | for(;!is_nil_csv_token(source_ct); source_ct = source_ct->next_token) | ||
| 133 | { | ||
| 134 | destination_end_ct->next_token = source_ct; | ||
| 135 | } | ||
| 136 | |||
| 137 | destination->end_token = source_ct; | ||
| 138 | } | ||
| 139 | |||
#if 0
// stub: will lex a single row into its own token list so rows can be parsed
// independently and spliced together via csv_token_list_concat_list
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
    // csv_token_list *

}
#endif
| 148 | |||
| 149 | |||
// the lexer acts as a table builder from a csv file,
// parsing individual rows and columns;
// the next step would be building the b-tree
| 153 | internal csv_token * | ||
| 154 | tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list) | ||
| 155 | { | ||
| 156 | |||
| 157 | unused(token_list); | ||
| 158 | b32 finding_headers = TRUE; | ||
| 159 | |||
| 160 | if(buffer.size < 0) return NULL; | ||
| 161 | |||
| 162 | csv_token *tok = PushStruct(arena, csv_token); | ||
| 163 | |||
| 164 | // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit | ||
| 165 | // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? | ||
| 166 | // forgot what the solution was | ||
| 167 | // TODO(nasr): check what the problem here was | ||
| 168 | for(s32 index = 0; buffer.data[index] != '\0'; ++index) | ||
| 169 | { | ||
| 170 | u8 point = buffer.data[index]; | ||
| 171 | |||
| 172 | s32 start = 0; | ||
| 173 | s32 end = 0; | ||
| 174 | |||
| 175 | if(is_whitespace(point)) | ||
| 176 | { | ||
| 177 | warn("csv file is invalid, detected whitespace"); | ||
| 178 | return NULL; | ||
| 179 | } | ||
| 180 | |||
| 181 | |||
| 182 | if(point == '\n') | ||
| 183 | { | ||
| 184 | if(finding_headers) | ||
| 185 | { | ||
| 186 | #if 0 | ||
| 187 | string8 headers_buffer = {.data = &buffer.data[start], .size = end - start}; | ||
| 188 | #endif | ||
| 189 | finding_headers = FALSE; | ||
| 190 | |||
| 191 | { | ||
| 192 | //- map new header token list to table headers | ||
| 193 | } | ||
| 194 | } | ||
| 195 | #if 0 | ||
| 196 | else | ||
| 197 | { | ||
| 198 | |||
| 199 | } | ||
| 200 | #endif | ||
| 201 | |||
| 202 | |||
| 203 | table->row_count++; | ||
| 204 | } | ||
| 205 | else if(point == ',') | ||
| 206 | { | ||
| 207 | if (finding_headers) | ||
| 208 | { | ||
| 209 | table->header_count++; | ||
| 210 | } | ||
| 211 | } | ||
| 212 | |||
| 213 | switch(point) | ||
| 214 | { | ||
| 215 | case('\n'): | ||
| 216 | { | ||
| 217 | tok->flags |= FL; | ||
| 218 | break; | ||
| 219 | } | ||
| 220 | |||
| 221 | case(','): | ||
| 222 | { | ||
| 223 | end = index - 1; | ||
| 224 | start = index + 1; | ||
| 225 | break; | ||
| 226 | } | ||
| 227 | default: | ||
| 228 | { | ||
| 229 | break; | ||
| 230 | } | ||
| 231 | } | ||
| 232 | |||
| 233 | tok->lexeme = StringCast(&buffer.data[start], end - start); | ||
| 234 | tok->next_token = tok; | ||
| 235 | } | ||
| 236 | |||
| 237 | return tok; | ||
| 238 | } | ||
| 239 | |||
| 240 | //- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future | ||
| 241 | internal b_tree * | ||
| 242 | parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table) | ||
| 243 | { | ||
| 244 | b_tree *tree = PushStructZero(arena, b_tree); | ||
| 245 | b_tree_create(arena, tree); | ||
| 246 | |||
| 247 | // iterate over the token list while the token is not nil | ||
| 248 | for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token) | ||
| 249 | { | ||
| 250 | |||
| 251 | //- TODO(nasr): check initizalization or something tomorrow | ||
| 252 | { | ||
| 253 | //- are we parsing the first line tokens? | ||
| 254 | //- if so, do something :)) | ||
| 255 | if(ct->flags & FL) | ||
| 256 | { | ||
| 257 | // TODO(nasr): replace with nil header check function | ||
| 258 | if(table->header != &nil_csv_header || table->header == NULL) | ||
| 259 | { | ||
| 260 | #if 0 | ||
| 261 | // - no this should happen in the tokenization | ||
| 262 | table->headers->next = | ||
| 263 | #endif | ||
| 264 | } | ||
| 265 | else | ||
| 266 | { | ||
| 267 | |||
| 268 | } | ||
| 269 | |||
| 270 | } | ||
| 271 | } | ||
| 272 | |||
| 273 | // TODO(nasr): fix this logic tomorrow | ||
| 274 | csv_token *ct = PushStruct(arena, csv_token); | ||
| 275 | // skip structural ctens, only index values | ||
| 276 | if (ct->type != TOKEN_VALUE) | ||
| 277 | { | ||
| 278 | continue; | ||
| 279 | } | ||
| 280 | |||
| 281 | // NOTE(nasr): payload is the cten itself so the caller can reach | ||
| 282 | // row/col metadata without us having to copy it | ||
| 283 | // NOTE(nasr): heh why do we void cast again? | ||
| 284 | b_tree_insert(tree, ct->lexeme, (void *)ct); | ||
| 285 | } | ||
| 286 | |||
| 287 | return tree; | ||
| 288 | } | ||
| 289 | |||
