summaryrefslogtreecommitdiff
path: root/source/csv_decoder.h
diff options
context:
space:
mode:
Diffstat (limited to 'source/csv_decoder.h')
-rw-r--r--source/csv_decoder.h289
1 files changed, 289 insertions, 0 deletions
diff --git a/source/csv_decoder.h b/source/csv_decoder.h
new file mode 100644
index 0000000..b754ef5
--- /dev/null
+++ b/source/csv_decoder.h
@@ -0,0 +1,289 @@
// Include guard for the CSV decoder type and sentinel declarations.
// NOTE(review): the macro name refers to an engine lexer, not to csv_decoder.h;
// if another header defines the same macro, whichever is included second is
// silently skipped — confirm and consider renaming.
#ifndef ENGINE_LEXER_H
#define ENGINE_LEXER_H
3
// Bit flags stored in csv_token.flags.
enum csv_token_flags
{
 // Set by tokenize_csv when it reaches a '\n' and tested by parse_csv.
 // NOTE(review): parse_csv's comments read this as "first line" while the
 // tokenizer sets it at line breaks — confirm which meaning is intended.
 FL = 1 << 2,
};
8
// Kind tag stored in csv_token.type.
enum csv_token_type
{
 // values below 255 are left free so a single-character token can use its
 // own character code as its type; the named kinds start at 255
 TOKEN_UNDEFINED = 255,
 TOKEN_IDENTIFIER,
 // the only kind parse_csv inserts into the tree
 TOKEN_VALUE,
};
16
// One node in a singly linked chain of tokens.
typedef struct csv_token csv_token;
struct csv_token
{
 // text of the token
 string8 lexeme;
 // following token; the shared nil_csv_token sentinel points back at itself
 csv_token *next_token;
 // kind tag, see enum csv_token_type
 enum csv_token_type type;
 // bit set, see enum csv_token_flags
 enum csv_token_flags flags;
};
25
// NOTE(nasr): I don't think I'm going to use this.
// One row as an array of field strings. Apart from the nil_csv_row sentinel,
// nothing in this file reads or writes it.
typedef struct csv_row csv_row;
struct csv_row
{
 // one string8 per column, meant to point into the mapped file buffer
 // NOTE(review): nothing in this file allocates or fills this array — confirm the intended owner
 string8 *fields;
 // number of entries in fields
 s32 count;
};
34
#if 0
// Disabled placeholder for a per-record type. The typedef now names the same
// tag as the struct below, so csv_entity is a complete type if this is ever
// switched back on (an empty struct still needs at least one member in ISO C).
typedef struct csv_entity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif
42
// One column name in a singly linked chain of headers.
typedef struct csv_header csv_header;
struct csv_header
{
 // header text
 string8 payload;
 // following header; the shared nil_csv_header sentinel points back at itself
 csv_header *next_header;
};
49
// Summary of a CSV file: the header chain plus two counters.
typedef struct csv_table csv_table;
struct csv_table
{
 // head of the csv_header chain meant to hold the first row's column names
 // (nothing in this file fills it yet)
 // NOTE: the data rows themselves are not stored in this struct, only counted
 csv_header *header;
 // bumped by tokenize_csv as it passes line breaks
 s32 row_count;
 // bumped by tokenize_csv while it is still scanning the first line
 s32 header_count;
};
59
60
// Head and tail pointers of a csv_token chain.
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
 // first token of the chain; &nil_csv_token in nil_csv_token_list
 csv_token *start_token;
 // last token of the chain, the one appends attach to
 csv_token *end_token;

};
68
69read_only global_variable
70csv_token nil_csv_token=
71{
72 .lexeme = {.data = NULL, .size =0},
73 .type = 0,
74 .flags = 0,
75 .next_token = &nil_csv_token,
76
77};
78
79read_only global_variable
80csv_header nil_csv_header =
81{
82 .payload = {.data = NULL, .size = 0},
83 .next_header = &nil_csv_header,
84};
85
86read_only global_variable
87csv_token_list nil_csv_token_list =
88{
89 .start_token = &nil_csv_token,
90 .end_token = &nil_csv_token,
91};
92
93
94read_only global_variable
95csv_row nil_csv_row =
96{
97 .fields = &nil_string,
98 .count = 0,
99};
100
101read_only global_variable
102csv_table nil_csv_table =
103{
104 .header = &nil_csv_header,
105 .row_count = 0,
106};
107
// NOTE(review): the include guard closes here, so the function definitions
// that follow are not protected if this header is included twice in one
// translation unit — consider moving this #endif to the end of the file.
#endif /* ENGINE_LEXER_H */
109
110internal b32
111is_nil_csv_token(csv_token *token)
112{
113 return ((token == NULL) || (token == &nil_csv_token));
114}
115
116internal void
117csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
118{
119 source_token_list->end_token->next_token = source_token;
120 source_token_list->end_token = source_token;
121
122}
123
124//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother
125internal void
126csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
127{
128
129 csv_token *source_ct = source->start_token;
130 csv_token *destination_end_ct = destination->end_token;
131
132 for(;!is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
133 {
134 destination_end_ct->next_token = source_ct;
135 }
136
137 destination->end_token = source_ct;
138}
139
#if 0
// Disabled stub with no body yet.
// NOTE(review): presumably meant to tokenize one row into its own token list
// so rows can be joined with csv_token_list_concat_list — confirm before
// enabling; as written it returns nothing from a non-void function.
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
 // csv_token_list *

}
#endif
148
149
150// the lexer acts as a table builder from a csv file
151// and parsing indivudal rows and columns
152// the next step would be building a the b-tree
153internal csv_token *
154tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
155{
156
157 unused(token_list);
158 b32 finding_headers = TRUE;
159
160 if(buffer.size < 0) return NULL;
161
162 csv_token *tok = PushStruct(arena, csv_token);
163
164 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
165 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
166 // forgot what the solution was
167 // TODO(nasr): check what the problem here was
168 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
169 {
170 u8 point = buffer.data[index];
171
172 s32 start = 0;
173 s32 end = 0;
174
175 if(is_whitespace(point))
176 {
177 warn("csv file is invalid, detected whitespace");
178 return NULL;
179 }
180
181
182 if(point == '\n')
183 {
184 if(finding_headers)
185 {
186#if 0
187 string8 headers_buffer = {.data = &buffer.data[start], .size = end - start};
188#endif
189 finding_headers = FALSE;
190
191 {
192 //- map new header token list to table headers
193 }
194 }
195#if 0
196 else
197 {
198
199 }
200#endif
201
202
203 table->row_count++;
204 }
205 else if(point == ',')
206 {
207 if (finding_headers)
208 {
209 table->header_count++;
210 }
211 }
212
213 switch(point)
214 {
215 case('\n'):
216 {
217 tok->flags |= FL;
218 break;
219 }
220
221 case(','):
222 {
223 end = index - 1;
224 start = index + 1;
225 break;
226 }
227 default:
228 {
229 break;
230 }
231 }
232
233 tok->lexeme = StringCast(&buffer.data[start], end - start);
234 tok->next_token = tok;
235 }
236
237 return tok;
238}
239
240//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
241internal b_tree *
242parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
243{
244 b_tree *tree = PushStructZero(arena, b_tree);
245 b_tree_create(arena, tree);
246
247 // iterate over the token list while the token is not nil
248 for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
249 {
250
251 //- TODO(nasr): check initizalization or something tomorrow
252 {
253 //- are we parsing the first line tokens?
254 //- if so, do something :))
255 if(ct->flags & FL)
256 {
257 // TODO(nasr): replace with nil header check function
258 if(table->header != &nil_csv_header || table->header == NULL)
259 {
260#if 0
261 // - no this should happen in the tokenization
262 table->headers->next =
263#endif
264 }
265 else
266 {
267
268 }
269
270 }
271 }
272
273 // TODO(nasr): fix this logic tomorrow
274 csv_token *ct = PushStruct(arena, csv_token);
275 // skip structural ctens, only index values
276 if (ct->type != TOKEN_VALUE)
277 {
278 continue;
279 }
280
281 // NOTE(nasr): payload is the cten itself so the caller can reach
282 // row/col metadata without us having to copy it
283 // NOTE(nasr): heh why do we void cast again?
284 b_tree_insert(tree, ct->lexeme, (void *)ct);
285 }
286
287 return tree;
288}
289