summaryrefslogtreecommitdiff
path: root/source/csv_decoder.h
diff options
context:
space:
mode:
authornasr <nsrddyn@gmail.com>2026-03-16 19:20:23 +0000
committernasr <nsrddyn@gmail.com>2026-03-16 19:20:23 +0000
commit180ccc84aac07c7bee2b09a6e07f7406908409b9 (patch)
treeefa39665e41c3132626f2c08b2f3ae0d18adc17a /source/csv_decoder.h
parent2e258673171c2e4663a8b5d58e2ad174bb0ecd96 (diff)
feature(main): lots of stuff see description
1. increased compile-time warnings to help with some optimizations. 2. implemented csv lexing helper functions that operate on token lists, such as appending tokens and concatenating lists with each other 3. realized that the btree design is faulty, so disabled it and will refactor it in the next approach
Diffstat (limited to 'source/csv_decoder.h')
-rw-r--r--source/csv_decoder.h289
1 files changed, 289 insertions, 0 deletions
diff --git a/source/csv_decoder.h b/source/csv_decoder.h
new file mode 100644
index 0000000..b754ef5
--- /dev/null
+++ b/source/csv_decoder.h
@@ -0,0 +1,289 @@
#ifndef ENGINE_LEXER_H
#define ENGINE_LEXER_H
// NOTE(review): guard name ENGINE_LEXER_H does not match the file name
// (csv_decoder.h) — confirm no other header reuses this guard.

// Bit flags attached to individual tokens.
enum csv_token_flags
{
    // FL marks a line-terminating token (set on '\n' in tokenize_csv below).
    // Why bit 2 rather than bit 0 is not clear from this file —
    // TODO(review): confirm whether the lower bits are reserved.
    FL = 1 << 2,
};
8
// Token categories. Values below the named range are reserved so a raw
// byte can double as its own token type.
enum csv_token_type
{
    // first 255 tokens for ascii characters
    // NOTE(review): ASCII only occupies 0-127, and starting the named
    // range at 255 still collides with byte value 255 — confirm whether
    // 256 was intended.
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    TOKEN_VALUE,
};
16
// One lexed CSV field, linked into a singly linked token list terminated
// by &nil_csv_token.
typedef struct csv_token csv_token;
struct csv_token
{
    string8 lexeme;               // field text; points into the source buffer (see tokenize_csv), not a copy
    csv_token *next_token;        // next token in the list; &nil_csv_token terminates
    enum csv_token_type type;
    enum csv_token_flags flags;   // FL set when this token closes a row
};
25
// NOTE(nasr): i dont think im going to use this.
// A single parsed row as an array of field slices.
typedef struct csv_row csv_row;
struct csv_row
{
    // array of size col_count, points into mmap buffer
    // NOTE(review): the comment says "col_count" but the member is named
    // `count` — presumably the same thing; confirm.
    string8 *fields;
    s32 count;
};
34
// Disabled: entity abstraction judged unnecessary while headers are
// mapped directly by key.
#if 0
// NOTE(review): the tag "csv_lntity" looks like a typo for csv_entity.
typedef struct csv_lntity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif
42
// One column name, linked into the table's singly linked header list
// terminated by &nil_csv_header.
typedef struct csv_header csv_header;
struct csv_header
{
    string8 payload;           // header (column-name) text
    csv_header *next_header;   // &nil_csv_header terminates the list
};
49
// Aggregate metadata for one CSV file: the header list plus row/column
// counts, both filled in by tokenize_csv.
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names
    // all data rows
    csv_header *header;   // linked list of column names
    s32 row_count;        // number of '\n'-terminated rows seen
    s32 header_count;     // number of fields on the first line
};
59
60
61typedef struct csv_token_list csv_token_list;
62struct csv_token_list
63{
64 csv_token *start_token;
65 csv_token *end_token;
66
67};
68
// Shared read-only nil sentinel for csv_token. Lists terminate with a
// pointer to this object instead of NULL so traversal code can always
// dereference next_token.
read_only global_variable
csv_token nil_csv_token =
{
    .lexeme = {.data = NULL, .size = 0},
    .type = 0,   // NOTE(review): 0 falls in the raw-byte range, not TOKEN_UNDEFINED — confirm intended
    .flags = 0,
    .next_token = &nil_csv_token,   // self-referential: following next_token from nil stays at nil

};
78
// Read-only nil sentinel for csv_header; self-referential like
// nil_csv_token so header traversal never dereferences NULL.
read_only global_variable
csv_header nil_csv_header =
{
    .payload = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};
85
// Nil token list: both ends point at the token nil sentinel.
// NOTE(review): a list initialized from this value has end_token ==
// &nil_csv_token, which is read_only — append code must special-case the
// empty list rather than write through end_token->next_token.
read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token = &nil_csv_token,
};
92
93
// Nil row sentinel.
// NOTE(review): relies on a global nil_string (string8) declared
// elsewhere in the project — confirm it exists and is read_only.
read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count = 0,
};
100
// Nil table sentinel. header_count is not listed explicitly, but
// designated initialization zero-fills the remaining members.
read_only global_variable
csv_table nil_csv_table =
{
    .header = &nil_csv_header,
    .row_count = 0,
};
107
#endif /* ENGINE_LEXER_H */
// NOTE(review): the include guard closes HERE, but function definitions
// follow below — including this header twice would redefine them. The
// #endif probably belongs at the very end of the file. Assumes `internal`
// expands to `static`, which is what makes header-defined functions
// tolerable at all — confirm.
110internal b32
111is_nil_csv_token(csv_token *token)
112{
113 return ((token == NULL) || (token == &nil_csv_token));
114}
115
116internal void
117csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
118{
119 source_token_list->end_token->next_token = source_token;
120 source_token_list->end_token = source_token;
121
122}
123
124//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother
125internal void
126csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
127{
128
129 csv_token *source_ct = source->start_token;
130 csv_token *destination_end_ct = destination->end_token;
131
132 for(;!is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
133 {
134 destination_end_ct->next_token = source_ct;
135 }
136
137 destination->end_token = source_ct;
138}
139
// Disabled stub: per-row parsing was never implemented.
#if 0
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
    // csv_token_list *

}
#endif
148
149
150// the lexer acts as a table builder from a csv file
151// and parsing indivudal rows and columns
152// the next step would be building a the b-tree
153internal csv_token *
154tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
155{
156
157 unused(token_list);
158 b32 finding_headers = TRUE;
159
160 if(buffer.size < 0) return NULL;
161
162 csv_token *tok = PushStruct(arena, csv_token);
163
164 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
165 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
166 // forgot what the solution was
167 // TODO(nasr): check what the problem here was
168 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
169 {
170 u8 point = buffer.data[index];
171
172 s32 start = 0;
173 s32 end = 0;
174
175 if(is_whitespace(point))
176 {
177 warn("csv file is invalid, detected whitespace");
178 return NULL;
179 }
180
181
182 if(point == '\n')
183 {
184 if(finding_headers)
185 {
186#if 0
187 string8 headers_buffer = {.data = &buffer.data[start], .size = end - start};
188#endif
189 finding_headers = FALSE;
190
191 {
192 //- map new header token list to table headers
193 }
194 }
195#if 0
196 else
197 {
198
199 }
200#endif
201
202
203 table->row_count++;
204 }
205 else if(point == ',')
206 {
207 if (finding_headers)
208 {
209 table->header_count++;
210 }
211 }
212
213 switch(point)
214 {
215 case('\n'):
216 {
217 tok->flags |= FL;
218 break;
219 }
220
221 case(','):
222 {
223 end = index - 1;
224 start = index + 1;
225 break;
226 }
227 default:
228 {
229 break;
230 }
231 }
232
233 tok->lexeme = StringCast(&buffer.data[start], end - start);
234 tok->next_token = tok;
235 }
236
237 return tok;
238}
239
240//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
241internal b_tree *
242parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
243{
244 b_tree *tree = PushStructZero(arena, b_tree);
245 b_tree_create(arena, tree);
246
247 // iterate over the token list while the token is not nil
248 for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
249 {
250
251 //- TODO(nasr): check initizalization or something tomorrow
252 {
253 //- are we parsing the first line tokens?
254 //- if so, do something :))
255 if(ct->flags & FL)
256 {
257 // TODO(nasr): replace with nil header check function
258 if(table->header != &nil_csv_header || table->header == NULL)
259 {
260#if 0
261 // - no this should happen in the tokenization
262 table->headers->next =
263#endif
264 }
265 else
266 {
267
268 }
269
270 }
271 }
272
273 // TODO(nasr): fix this logic tomorrow
274 csv_token *ct = PushStruct(arena, csv_token);
275 // skip structural ctens, only index values
276 if (ct->type != TOKEN_VALUE)
277 {
278 continue;
279 }
280
281 // NOTE(nasr): payload is the cten itself so the caller can reach
282 // row/col metadata without us having to copy it
283 // NOTE(nasr): heh why do we void cast again?
284 b_tree_insert(tree, ct->lexeme, (void *)ct);
285 }
286
287 return tree;
288}
289