summaryrefslogtreecommitdiff
path: root/source/tb_db/csv_decoder.h
diff options
context:
space:
mode:
Diffstat (limited to 'source/tb_db/csv_decoder.h')
-rw-r--r--  source/tb_db/csv_decoder.h  |  294
1 files changed, 294 insertions, 0 deletions
diff --git a/source/tb_db/csv_decoder.h b/source/tb_db/csv_decoder.h
new file mode 100644
index 0000000..3d09dc6
--- /dev/null
+++ b/source/tb_db/csv_decoder.h
@@ -0,0 +1,294 @@
1#ifndef ENGINE_LEXER_H
2#define ENGINE_LEXER_H
3
enum csv_token_flags
{
    // set by the tokenizer on a field token whose field was terminated by '\n',
    // i.e. the last field of a row (presumably "Field Last" / line-end marker
    // — TODO confirm intended name)
    FL = 1 << 2,
};
8
enum csv_token_type
{
    // first 255 tokens for ascii characters
    // NOTE(review): byte values span 0..255, so TOKEN_UNDEFINED = 255 overlaps
    // the last raw byte value — confirm whether 256 was intended
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    TOKEN_VALUE,    // a csv field; the only type the tokenizer currently emits
};
16
// one csv field, stored as a singly linked list node; the lexeme points into
// the source buffer (no copy), so the buffer must outlive the tokens
typedef struct csv_token csv_token;
struct csv_token
{
    string8 lexeme;              // field text, a view into the csv buffer
    csv_token *next_token;       // next field; &nil_csv_token terminates the list
    enum csv_token_type type;
    enum csv_token_flags flags;  // FL set on the last field of a row
};
25
// NOTE(nasr): i dont think im going to use this.
typedef struct csv_row csv_row;
struct csv_row
{
    // array of size col_count, points into mmap buffer
    string8 *fields;
    s32 count;  // number of entries in fields
};
34
// NOTE(review): disabled draft, kept for reference; tag name "csv_lntity"
// looks like a typo for "csv_entity" if this is ever re-enabled
#if 0
typedef struct csv_lntity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif
42
// one column name from the first csv row, as a singly linked list node;
// &nil_csv_header terminates the list
typedef struct csv_header csv_header;
struct csv_header
{
    string8 payload;          // header text, a view into the csv buffer
    csv_header *next_header;
};
49
// bookkeeping for one csv file: header list plus counters filled in while
// tokenizing (see tokenize_csv)
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names
    // all data rows
    csv_header *header;    // linked list of column names; &nil_csv_header if unset
    s32 row_count;         // incremented once per '\n' during tokenization
    s32 header_count;      // number of fields seen while finding_headers is set
    b32 finding_headers;   // TRUE while the first (header) row is being tokenized
};
60
61
// head/tail handles for a token list; both point at &nil_csv_token when empty
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
    csv_token *start_token;
    csv_token *end_token;
};
68
// sentinel terminator for token lists; next_token points at itself so walking
// past the end is safe.
// NOTE(review): this lives in read_only storage — any code that writes through
// a pointer still aimed at this sentinel (e.g. an append through an empty
// list's end_token) faults; see the TODO at csv_token_list_append_token
read_only global_variable
csv_token nil_csv_token=
{
    .lexeme = {.data = NULL, .size = 0},
    .type = 0,
    .flags = 0,
    .next_token = &nil_csv_token,
};
77
// sentinel terminator for header lists; self-referencing next_header makes
// over-walking safe, same pattern as nil_csv_token
read_only global_variable
csv_header nil_csv_header =
{
    .payload = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};
84
// canonical empty token list: both ends aimed at the nil token sentinel
read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token = &nil_csv_token,
};
91
// canonical empty row
// NOTE(review): assumes nil_string is a string8 declared elsewhere in the
// project — not visible in this file, confirm
read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count = 0,
};
98
// canonical empty table; unlisted fields (header_count, finding_headers)
// zero-initialize per C designated-initializer rules
read_only global_variable
csv_table nil_csv_table =
{
    .header = &nil_csv_header,
    .row_count = 0,
};
105
// FIXME(review): this #endif closes the include guard BEFORE the function
// definitions below, so they are unguarded against multiple inclusion; the
// #endif should move to the end of the file. Also the guard name
// ENGINE_LEXER_H does not match this file (csv_decoder.h).
#endif /* ENGINE_LEXER_H */

108internal b32
109is_nil_csv_token(csv_token *token)
110{
111 return ((token == NULL) || (token == &nil_csv_token));
112}
113
114// TODO(nasr): segfaulting because end_token not allocated
115internal void
116csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
117{
118 source_token_list->end_token->next_token = source_token;
119 source_token_list->end_token = source_token;
120}
121
122//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother
123internal void
124csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
125{
126 if(is_nil_csv_token(source->start_token)) return;
127
128 csv_token *source_ct = source->start_token;
129 csv_token *destination_et = destination->end_token;
130
131 // walk source and stitch each node onto destination's tail
132 for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
133 {
134 destination_et->next_token = source_ct;
135 destination_et = source_ct;
136 }
137
138 // destination_et now points at the last real source node (not the nil sentinel)
139 destination->end_token = destination_et;
140}
141
// NOTE(review): disabled stub for per-row parsing, never implemented;
// row handling currently lives in tokenize_csv/parse_csv instead
#if 0
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
    // csv_token_list *

}
#endif
150
151
152// the lexer acts as a table builder from a csv file
153// and parsing indivudal rows and columns
154// the next step would be building a the b-tree
155internal csv_token *
156tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
157{
158 unused(token_list);
159
160 if(buffer.size == 0) return NULL;
161
162 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
163 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
164 // forgot what the solution was
165 // TODO(nasr): check what the problem here was
166
167 // string size tracking across the loop not inside it
168 s32 start = 0;
169
170 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
171 {
172 u8 point = buffer.data[index];
173
174#if 0
175 if(is_whitespace(point))
176 {
177 warn("csv file is invalid, detected whitespace");
178 return NULL;
179 }
180#endif
181
182 if(point == ',')
183 {
184 // emit a token for the field that ended before this comma
185 csv_token *token = PushStructZero(arena, csv_token);
186
187 assert_msg(token != NULL, "did the push struct fail??");
188 assert_msg(arena->current_position < arena->capacity, "no more arena size");
189
190 token->lexeme = StringCast(&buffer.data[start], index - start);
191 token->type = TOKEN_VALUE;
192 token->next_token = &nil_csv_token;
193 csv_token_list_append_token(token_list, token);
194
195 start = index + 1;
196
197 if(table->finding_headers)
198 {
199 table->header_count++;
200 }
201 }
202 else if(point == '\n')
203 {
204 // emit a token for the field that ended at this newline
205 csv_token *token = PushStructZero(arena, csv_token);
206 token->lexeme = StringCast(&buffer.data[start], index - start);
207 token->type = TOKEN_VALUE;
208 token->flags |= FL;
209 token->next_token = &nil_csv_token;
210
211 assert_msg(token_list, "token list invalid");
212 assert_msg(token, "you're tring to append an invalid token");
213
214 csv_token_list_append_token(token_list, token);
215
216 start = index + 1;
217
218 if(table->finding_headers)
219 {
220 {
221 //- map new header token list to table headers
222 }
223 table->finding_headers = FALSE;
224 }
225
226 table->row_count++;
227 }
228 }
229
230 // NOTE(nasr): return the first token the caller can walk the list from token_list
231 return token_list->start_token;
232}
233
234//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
235internal btree *
236parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
237{
238 btree *tree = PushStructZero(arena, btree);
239
240 s32 col_index = 0;
241 s32 row_index = 0;
242
243 // iterate over the token list while the token is not nil
244 for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
245 {
246 {
247 //- are we parsing the first line tokens?
248 //- if so, do something :))
249 if(ct->flags & FL)
250 {
251 // NOTE(nasr): FL marks end-of-line; advance row, reset col
252 row_index++;
253 col_index = 0;
254
255 // TODO(nasr): replace with nil header check function
256 // NOTE(nasr): == nil means header hasn't been set yet
257 if(table->header == &nil_csv_header || table->header == NULL)
258 {
259#if 0
260 // - no this should happen in the tokenization
261 table->headers->next =
262#endif
263 }
264 else
265 {
266
267 }
268
269 // FL tokens are structural, no value to index
270 continue;
271 }
272 }
273
274 // skip non-value tokens, only index actual cell values
275 if (ct->type != TOKEN_VALUE)
276 {
277 col_index++;
278 continue;
279 }
280
281 // NOTE(nasr): payload is the cten itself so the caller can reach
282 // row/col metadata without us having to copy it
283 key k = {
284 .header_index = col_index,
285 .row_index = row_index,
286 };
287
288 btree_insert(arena, tree, k, (void *)ct);
289
290 col_index++;
291 }
292
293 return tree;
294}