diff options
| author | nasr <nsrddyn@gmail.com> | 2026-04-13 14:58:49 +0200 |
|---|---|---|
| committer | nasr <nsrddyn@gmail.com> | 2026-04-13 14:59:10 +0200 |
| commit | a9cb228861a6b0fad4d508c05c0614757a7f0a34 (patch) | |
| tree | 281ae48c7248413cae727b403a1cd802741b061d /source/tb_db/csv_decoder.h | |
| parent | 65907835d9835d85cff31269db19e18045cb3392 (diff) | |
refactor(main): refactor directory structure (branch: main)
Diffstat (limited to 'source/tb_db/csv_decoder.h')
| -rw-r--r-- | source/tb_db/csv_decoder.h | 294 |
1 files changed, 294 insertions, 0 deletions
diff --git a/source/tb_db/csv_decoder.h b/source/tb_db/csv_decoder.h new file mode 100644 index 0000000..3d09dc6 --- /dev/null +++ b/source/tb_db/csv_decoder.h | |||
| @@ -0,0 +1,294 @@ | |||
// NOTE(review): guard macro name ENGINE_LEXER_H looks stale for a file named
// csv_decoder.h — consider renaming to match. See also the matching #endif,
// which appears before the function definitions further down this file.
#ifndef ENGINE_LEXER_H
#define ENGINE_LEXER_H
| 3 | |||
// Bit flags attached to a csv_token.
enum csv_token_flags
{
    // FL marks the token as the last field of its line (set on the token
    // emitted at a '\n' in tokenize_csv)
    FL = 1 << 2,
};
| 8 | |||
// Token kinds produced by the CSV tokenizer. Values start above the byte
// range so a raw character can double as its own token type.
enum csv_token_type
{
    // first 255 tokens for ascii characters
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    TOKEN_VALUE,
};
| 16 | |||
// One lexed CSV field, linked into a singly linked token list terminated by
// the nil_csv_token sentinel.
typedef struct csv_token csv_token;
struct csv_token
{
    // slice into the source buffer (not a copy) — see StringCast usage in
    // tokenize_csv; the buffer must outlive the tokens
    string8 lexeme;
    csv_token *next_token;
    enum csv_token_type type;
    enum csv_token_flags flags;
};
| 25 | |||
// NOTE(nasr): i dont think im going to use this.
// One decoded row as a flat array of field slices.
typedef struct csv_row csv_row;
struct csv_row
{
    // array of size col_count, points into mmap buffer
    string8 *fields;
    s32 count;
};
| 34 | |||
#if 0
// disabled: header-keyed mapping makes a dedicated entity struct unnecessary.
// NOTE: struct tag fixed from the typo "csv_lntity" so the tag matches the
// typedef name if this is ever re-enabled.
typedef struct csv_entity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif
| 42 | |||
// One column name, linked into the table's header list (terminated by the
// nil_csv_header sentinel).
typedef struct csv_header csv_header;
struct csv_header
{
    // header text; presumably a slice of the source buffer like token
    // lexemes — TODO confirm once header mapping is implemented
    string8 payload;
    csv_header *next_header;
};
| 49 | |||
// Aggregate metadata for one CSV file, filled in by tokenize_csv.
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names
    // all data rows
    csv_header *header;
    s32 row_count;
    s32 header_count;
    // set by the caller while the header line is being scanned; the
    // tokenizer clears it at the first newline
    b32 finding_headers;
};
| 60 | |||
| 61 | |||
// Singly linked token list tracked by head and tail pointers; an empty list
// has both ends pointing at the nil_csv_token sentinel.
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
    csv_token *start_token;
    csv_token *end_token;
};
| 68 | |||
// Shared nil sentinel terminating csv_token lists; points at itself.
// read_only: it lives in read-only storage, so list code must never write
// through it (e.g. when a list's end_token still points here).
read_only global_variable
csv_token nil_csv_token=
{
    .lexeme = {.data = NULL, .size = 0},
    .type = 0,
    .flags = 0,
    .next_token = &nil_csv_token,
};
| 77 | |||
// Nil sentinel terminating csv_header chains; points at itself. Read-only:
// never written through.
read_only global_variable
csv_header nil_csv_header =
{
    .payload = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};
| 84 | |||
// Canonical empty token list: both ends point at the nil token sentinel.
read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token = &nil_csv_token,
};
| 91 | |||
// Empty row sentinel; fields points at the shared nil string8 so it is
// never NULL.
read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count = 0,
};
| 98 | |||
// Empty table sentinel; unnamed members (header_count, finding_headers)
// zero-initialize per C initializer rules.
read_only global_variable
csv_table nil_csv_table =
{
    .header = &nil_csv_header,
    .row_count = 0,
};
| 105 | |||
// NOTE(review): the include guard closes HERE, but function definitions
// follow below — including this header twice would redefine them. The
// #endif most likely belongs at the end of the file; confirm before moving.
#endif /* ENGINE_LEXER_H */

| 108 | internal b32 | ||
| 109 | is_nil_csv_token(csv_token *token) | ||
| 110 | { | ||
| 111 | return ((token == NULL) || (token == &nil_csv_token)); | ||
| 112 | } | ||
| 113 | |||
| 114 | // TODO(nasr): segfaulting because end_token not allocated | ||
| 115 | internal void | ||
| 116 | csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token) | ||
| 117 | { | ||
| 118 | source_token_list->end_token->next_token = source_token; | ||
| 119 | source_token_list->end_token = source_token; | ||
| 120 | } | ||
| 121 | |||
| 122 | //- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother | ||
| 123 | internal void | ||
| 124 | csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source) | ||
| 125 | { | ||
| 126 | if(is_nil_csv_token(source->start_token)) return; | ||
| 127 | |||
| 128 | csv_token *source_ct = source->start_token; | ||
| 129 | csv_token *destination_et = destination->end_token; | ||
| 130 | |||
| 131 | // walk source and stitch each node onto destination's tail | ||
| 132 | for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token) | ||
| 133 | { | ||
| 134 | destination_et->next_token = source_ct; | ||
| 135 | destination_et = source_ct; | ||
| 136 | } | ||
| 137 | |||
| 138 | // destination_et now points at the last real source node (not the nil sentinel) | ||
| 139 | destination->end_token = destination_et; | ||
| 140 | } | ||
| 141 | |||
#if 0
// disabled sketch: per-row tokenization entry point, not implemented yet
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
    // csv_token_list *

}
#endif
| 150 | |||
| 151 | |||
| 152 | // the lexer acts as a table builder from a csv file | ||
| 153 | // and parsing indivudal rows and columns | ||
| 154 | // the next step would be building a the b-tree | ||
| 155 | internal csv_token * | ||
| 156 | tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list) | ||
| 157 | { | ||
| 158 | unused(token_list); | ||
| 159 | |||
| 160 | if(buffer.size == 0) return NULL; | ||
| 161 | |||
| 162 | // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit | ||
| 163 | // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? | ||
| 164 | // forgot what the solution was | ||
| 165 | // TODO(nasr): check what the problem here was | ||
| 166 | |||
| 167 | // string size tracking across the loop not inside it | ||
| 168 | s32 start = 0; | ||
| 169 | |||
| 170 | for(s32 index = 0; buffer.data[index] != '\0'; ++index) | ||
| 171 | { | ||
| 172 | u8 point = buffer.data[index]; | ||
| 173 | |||
| 174 | #if 0 | ||
| 175 | if(is_whitespace(point)) | ||
| 176 | { | ||
| 177 | warn("csv file is invalid, detected whitespace"); | ||
| 178 | return NULL; | ||
| 179 | } | ||
| 180 | #endif | ||
| 181 | |||
| 182 | if(point == ',') | ||
| 183 | { | ||
| 184 | // emit a token for the field that ended before this comma | ||
| 185 | csv_token *token = PushStructZero(arena, csv_token); | ||
| 186 | |||
| 187 | assert_msg(token != NULL, "did the push struct fail??"); | ||
| 188 | assert_msg(arena->current_position < arena->capacity, "no more arena size"); | ||
| 189 | |||
| 190 | token->lexeme = StringCast(&buffer.data[start], index - start); | ||
| 191 | token->type = TOKEN_VALUE; | ||
| 192 | token->next_token = &nil_csv_token; | ||
| 193 | csv_token_list_append_token(token_list, token); | ||
| 194 | |||
| 195 | start = index + 1; | ||
| 196 | |||
| 197 | if(table->finding_headers) | ||
| 198 | { | ||
| 199 | table->header_count++; | ||
| 200 | } | ||
| 201 | } | ||
| 202 | else if(point == '\n') | ||
| 203 | { | ||
| 204 | // emit a token for the field that ended at this newline | ||
| 205 | csv_token *token = PushStructZero(arena, csv_token); | ||
| 206 | token->lexeme = StringCast(&buffer.data[start], index - start); | ||
| 207 | token->type = TOKEN_VALUE; | ||
| 208 | token->flags |= FL; | ||
| 209 | token->next_token = &nil_csv_token; | ||
| 210 | |||
| 211 | assert_msg(token_list, "token list invalid"); | ||
| 212 | assert_msg(token, "you're tring to append an invalid token"); | ||
| 213 | |||
| 214 | csv_token_list_append_token(token_list, token); | ||
| 215 | |||
| 216 | start = index + 1; | ||
| 217 | |||
| 218 | if(table->finding_headers) | ||
| 219 | { | ||
| 220 | { | ||
| 221 | //- map new header token list to table headers | ||
| 222 | } | ||
| 223 | table->finding_headers = FALSE; | ||
| 224 | } | ||
| 225 | |||
| 226 | table->row_count++; | ||
| 227 | } | ||
| 228 | } | ||
| 229 | |||
| 230 | // NOTE(nasr): return the first token the caller can walk the list from token_list | ||
| 231 | return token_list->start_token; | ||
| 232 | } | ||
| 233 | |||
//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
// Builds a btree index over the token list: each TOKEN_VALUE token is
// inserted under a (header_index=col, row_index=row) key with the token
// pointer itself as the payload.
//
// NOTE(review): tokenize_csv gives FL-flagged tokens the lexeme of the LAST
// field of their row, yet they are skipped here as "structural" — so the
// final column of every row is never inserted. Confirm this is intended.
internal btree *
parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
{
    btree *tree = PushStructZero(arena, btree);

    // current cell coordinates; row 0 is the header line
    s32 col_index = 0;
    s32 row_index = 0;

    // iterate over the token list while the token is not nil
    for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
    {
        {
            //- are we parsing the first line tokens?
            //- if so, do something :))
            if(ct->flags & FL)
            {
                // NOTE(nasr): FL marks end-of-line; advance row, reset col
                row_index++;
                col_index = 0;

                // TODO(nasr): replace with nil header check function
                // NOTE(nasr): == nil means header hasn't been set yet
                // NOTE(review): both branches below are currently empty —
                // header mapping is still unimplemented
                if(table->header == &nil_csv_header || table->header == NULL)
                {
#if 0
                    // - no this should happen in the tokenization
                    table->headers->next =
#endif
                }
                else
                {

                }

                // FL tokens are structural, no value to index
                continue;
            }
        }

        // skip non-value tokens, only index actual cell values; col_index
        // still advances so later columns keep their positions
        if (ct->type != TOKEN_VALUE)
        {
            col_index++;
            continue;
        }

        // NOTE(nasr): payload is the token itself so the caller can reach
        // row/col metadata without us having to copy it
        key k = {
            .header_index = col_index,
            .row_index = row_index,
        };

        btree_insert(arena, tree, k, (void *)ct);

        col_index++;
    }

    return tree;
}
