1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
|
#ifndef ENGINE_LEXER_H
#define ENGINE_LEXER_H
// Bit flags stored in csv_token.flags.
enum csv_token_flags
{
    // Bit 2 (value 4). tokenize_csv sets it on the last field token of a
    // line and parse_csv reads it as the end-of-line marker.
    FL = 0x4,
};
// Kinds of token the CSV lexer can produce.
enum csv_token_type
{
    // the values below 255 are kept free for single ascii characters
    TOKEN_UNDEFINED  = 255,
    TOKEN_IDENTIFIER = 256,
    TOKEN_VALUE      = 257,
};
// One lexed CSV field, stored as a node of a singly linked list.
typedef struct csv_token csv_token;
struct csv_token
{
    // text of the field; tokenize_csv builds it with StringCast from a
    // position and length inside the input buffer
    string8 lexeme;

    // following node in the list; tokenize_csv initialises it to
    // &nil_csv_token until another token is appended
    csv_token *next_token;

    enum csv_token_type type;

    // bitwise combination of csv_token_flags values
    enum csv_token_flags flags;
};
// NOTE(nasr): I don't think I'm going to use this.
// A single CSV row viewed as an array of field strings.
typedef struct csv_row csv_row;
struct csv_row
{
    // array of size col_count, points into mmap buffer
    string8 *fields;

    // number of entries in fields
    s32 count;
};
#if 0
// Disabled placeholder for a per-entity record; compiled out.
// The typedef previously named the misspelled tag `csv_lntity`, which would
// have left csv_entity as an incomplete type unrelated to the struct below.
// NOTE(review): a struct with no members is not valid ISO C; give it at least
// one field before enabling this block.
typedef struct csv_entity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif
// One column header, stored as a node of a singly linked list.
typedef struct csv_header csv_header;
struct csv_header
{
    // header text
    string8 payload;

    // following header node; the nil_csv_header sentinel links to itself
    csv_header *next_header;
};
// Bookkeeping that tokenize_csv fills in while it lexes a CSV buffer.
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names: head of the header list
    csv_header *header;

    // rows counted by tokenize_csv; the header line is counted as well
    s32 row_count;

    // column count gathered by tokenize_csv while finding_headers is set
    s32 header_count;

    // non-zero while tokenize_csv is still on the first (header) line
    b32 finding_headers;
};
// Singly linked list of csv_token nodes, tracked by its first and last node.
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
    // first node; this is what tokenize_csv returns and parse_csv walks from
    csv_token *start_token;

    // last node; new tokens are linked behind it
    csv_token *end_token;
};
// Nil sentinel for csv_token. Its next_token points back at itself, so
// following the link from the sentinel never yields NULL.
read_only global_variable
csv_token nil_csv_token =
{
    .lexeme     = {.data = NULL, .size = 0},
    .next_token = &nil_csv_token,
    .type       = 0,
    .flags      = 0,
};
// Nil sentinel for csv_header: empty payload, next_header links to itself.
read_only global_variable
csv_header nil_csv_header =
{
    .payload     = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};
// Nil (empty) token list: both ends refer to the nil token sentinel.
read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token   = &nil_csv_token,
};
// Nil row: no fields; the fields pointer refers to nil_string rather than NULL.
read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count  = 0,
};
// Nil table: nil header sentinel and every counter/flag at zero.
read_only global_variable
csv_table nil_csv_table =
{
    .header          = &nil_csv_header,
    .row_count       = 0,
    // the two members below were zero-initialised implicitly before
    .header_count    = 0,
    .finding_headers = 0,
};
#endif /* ENGINE_LEXER_H */
// Tells whether `token` marks "no token": either a NULL pointer or the
// address of the shared nil_csv_token sentinel.
// Returns 1 for a nil token and 0 otherwise.
internal b32
is_nil_csv_token(csv_token *token)
{
    if(token == NULL)
    {
        return 1;
    }

    return (token == &nil_csv_token);
}
// Append `source_token` to the tail of `source_token_list`.
//
// An empty list (end_token is NULL or the nil sentinel) is started with the
// token instead of being written through. The previous version stored into
// end_token->next_token unconditionally, which dereferenced NULL or wrote to
// nil_csv_token (declared read_only) and never set start_token.
//
// A NULL list or a nil `source_token` is ignored.
// The token's own next_token is left untouched; the caller sets it up.
internal void
csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
{
    if(source_token_list == NULL || is_nil_csv_token(source_token))
    {
        return;
    }

    if(is_nil_csv_token(source_token_list->end_token))
    {
        // first token of the list: it becomes the head as well as the tail
        source_token_list->start_token = source_token;
    }
    else
    {
        source_token_list->end_token->next_token = source_token;
    }

    source_token_list->end_token = source_token;
}
//- concatenate 2 token lists so we can handle parsing individual rows and
//- concatenating them to each other
//
// Links every node of `source` behind the tail of `destination` and moves
// destination->end_token to the last real (non-nil) node of `source`.
// `source` itself is not modified; its nodes are shared, not copied.
//
// Nothing happens when either list pointer is NULL, when `source` is empty,
// or when both arguments are the same list (linking a list to itself would
// create a cycle).
// An empty `destination` (end_token NULL or the nil sentinel) takes over the
// head of `source` instead of being written through its nil tail.
internal void
csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
{
    if(destination == NULL || source == NULL || destination == source)
    {
        return;
    }
    if(is_nil_csv_token(source->start_token))
    {
        return;
    }

    // walk to the last real node of source; source->end_token is not trusted
    csv_token *source_tail = source->start_token;
    while(!is_nil_csv_token(source_tail->next_token))
    {
        source_tail = source_tail->next_token;
    }

    if(is_nil_csv_token(destination->end_token))
    {
        destination->start_token = source->start_token;
    }
    else
    {
        // the nodes of source are already chained, one link is enough
        destination->end_token->next_token = source->start_token;
    }

    destination->end_token = source_tail;
}
#if 0
// Disabled stub with no implementation yet.
// NOTE(review): presumably meant to lex a single row buffer into its own
// token list, to be merged with csv_token_list_concat_list -- confirm before
// enabling. As written the function has no return statement.
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
// csv_token_list *
}
#endif
// the lexer acts as a table builder from a csv file
// and parsing indivudal rows and columns
// the next step would be building a the b-tree
internal csv_token *
tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
{
unused(token_list);
if(buffer.size == 0) return NULL;
// URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
// NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
// forgot what the solution was
// TODO(nasr): check what the problem here was
// string size tracking across the loop not inside it
s32 start = 0;
for(s32 index = 0; buffer.data[index] != '\0'; ++index)
{
u8 point = buffer.data[index];
#if 0
if(is_whitespace(point))
{
warn("csv file is invalid, detected whitespace");
return NULL;
}
#endif
if(point == ',')
{
// emit a token for the field that ended before this comma
csv_token *token = PushStructZero(arena, csv_token);
assert_msg(token != NULL, "did the push struct fail??");
assert_msg(arena->current_position < arena->capacity, "no more arena size");
token->lexeme = StringCast(&buffer.data[start], index - start);
token->type = TOKEN_VALUE;
token->next_token = &nil_csv_token;
csv_token_list_append_token(token_list, token);
start = index + 1;
if(table->finding_headers)
{
table->header_count++;
}
}
else if(point == '\n')
{
// emit a token for the field that ended at this newline
csv_token *token = PushStructZero(arena, csv_token);
token->lexeme = StringCast(&buffer.data[start], index - start);
token->type = TOKEN_VALUE;
token->flags |= FL;
token->next_token = &nil_csv_token;
assert_msg(token_list, "token list invalid");
assert_msg(token, "you're tring to append an invalid token");
csv_token_list_append_token(token_list, token);
start = index + 1;
if(table->finding_headers)
{
{
//- map new header token list to table headers
}
table->finding_headers = FALSE;
}
table->row_count++;
}
}
// NOTE(nasr): return the first token the caller can walk the list from token_list
return token_list->start_token;
}
//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
//
// Builds a btree that indexes every TOKEN_VALUE token of `ctl` under the key
// (header_index = column, row_index = row). The first line of the input is
// row 0.
//
// arena - storage for the tree and whatever btree_insert allocates from it
// ctl   - token list to walk, starting at start_token
// table - currently not consulted
//         TODO(nasr): header mapping is meant to happen during tokenization
//
// Returns the tree, or NULL if it could not be allocated.
//
// The previous version skipped every token carrying FL as "structural".
// tokenize_csv however puts FL on the value token of the last field of a
// line, so the last column of each row never reached the tree. Such a token
// is now indexed first, and only then closes the row.
internal btree *
parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
{
    unused(table);

    btree *tree = PushStructZero(arena, btree);
    if(tree == NULL)
    {
        return NULL;
    }

    s32 col_index = 0;
    s32 row_index = 0;

    // iterate over the token list while the token is not nil
    for(csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
    {
        // only actual cell values are indexed; other tokens still take up a column
        if(ct->type == TOKEN_VALUE)
        {
            // NOTE(nasr): payload is the token itself so the caller can reach
            // row/col metadata without us having to copy it
            key k = {
                .header_index = col_index,
                .row_index = row_index,
            };
            btree_insert(arena, tree, k, (void *)ct);
        }
        col_index++;

        // NOTE(nasr): FL marks end-of-line; advance row, reset col
        if(ct->flags & FL)
        {
            row_index++;
            col_index = 0;
        }
    }

    return tree;
}
|