diff options
Diffstat (limited to 'source/csv_decoder.h')
| -rw-r--r-- | source/csv_decoder.h | 289 |
1 files changed, 289 insertions, 0 deletions
diff --git a/source/csv_decoder.h b/source/csv_decoder.h new file mode 100644 index 0000000..b754ef5 --- /dev/null +++ b/source/csv_decoder.h | |||
| @@ -0,0 +1,289 @@ | |||
| 1 | #ifndef ENGINE_LEXER_H | ||
| 2 | #define ENGINE_LEXER_H | ||
| 3 | |||
// per-token flag bits stored in csv_token.flags
enum csv_token_flags
{
    // set on the token that terminates a row (emitted at '\n' in tokenize_csv)
    // NOTE(review): name "FL" is opaque — presumably "final/line flag"; confirm
    FL = 1 << 2,
};
| 8 | |||
// token kinds produced by the csv lexer; values 0-254 are reserved so a raw
// ascii character can double as its own token type
enum csv_token_type
{
    // first 255 tokens for ascii characters
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    TOKEN_VALUE,
};
| 16 | |||
// one lexed csv field, singly linked in file order and terminated by the
// nil_csv_token sentinel
typedef struct csv_token csv_token;
struct csv_token
{
    string8 lexeme;              // slice of the source buffer — not owned, no copy
    csv_token *next_token;       // next token; &nil_csv_token at end of chain
    enum csv_token_type type;
    enum csv_token_flags flags;  // FL set when this token ends its row
};
| 25 | |||
// NOTE(nasr): i dont think im going to use this.
// one parsed row as a view over its fields (no copies)
typedef struct csv_row csv_row;
struct csv_row
{
    // array of size col_count, points into mmap buffer
    string8 *fields;
    s32 count;  // number of entries in fields
};
| 34 | |||
#if 0
// NOTE(review): struct tag fixed — the typedef previously named
// `struct csv_lntity` (typo) while the definition declared `struct csv_entity`,
// so the two would never refer to the same type if this block were enabled.
typedef struct csv_entity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif
| 42 | |||
// one column name from the csv's first row, singly linked and terminated by
// the nil_csv_header sentinel
typedef struct csv_header csv_header;
struct csv_header
{
    string8 payload;          // header text — presumably a slice of the source buffer; confirm
    csv_header *next_header;  // next column; &nil_csv_header at end
};
| 49 | |||
// summary of a lexed csv file: the column-name list plus counts gathered
// while tokenizing
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names
    // all data rows
    csv_header *header;  // linked list of column names (the csv's first row)
    s32 row_count;       // rows seen by tokenize_csv (incremented per newline)
    s32 header_count;    // columns counted while lexing the first row
};
| 59 | |||
| 60 | |||
// singly linked token list tracked by both ends so appends are O(1);
// empty lists hold the nil_csv_token sentinel at both ends
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
    csv_token *start_token;  // first token, or &nil_csv_token when empty
    csv_token *end_token;    // last token, or &nil_csv_token when empty

};
| 68 | |||
| 69 | read_only global_variable | ||
| 70 | csv_token nil_csv_token= | ||
| 71 | { | ||
| 72 | .lexeme = {.data = NULL, .size =0}, | ||
| 73 | .type = 0, | ||
| 74 | .flags = 0, | ||
| 75 | .next_token = &nil_csv_token, | ||
| 76 | |||
| 77 | }; | ||
| 78 | |||
// shared immutable sentinel terminating header chains; self-linked like
// nil_csv_token so iteration past the end is safe
read_only global_variable
csv_header nil_csv_header =
{
    .payload = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};
| 85 | |||
// canonical empty token list: both ends hold the nil token sentinel
read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token = &nil_csv_token,
};
| 92 | |||
| 93 | |||
// canonical empty row; fields points at the shared nil string so it is
// never NULL
read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count = 0,
};
| 100 | |||
| 101 | read_only global_variable | ||
| 102 | csv_table nil_csv_table = | ||
| 103 | { | ||
| 104 | .header = &nil_csv_header, | ||
| 105 | .row_count = 0, | ||
| 106 | }; | ||
| 107 | |||
| 108 | #endif /* ENGINE_LEXER_H */ | ||
| 109 | |||
// true when token is NULL or the shared nil sentinel — both mean "no token"
// NOTE(review): this definition (and everything below it) sits AFTER the
// `#endif` that closes the ENGINE_LEXER_H include guard, so it is not
// protected against double inclusion; the guard's #endif should move to the
// end of the file. Also the guard name doesn't match csv_decoder.h — confirm.
internal b32
is_nil_csv_token(csv_token *token)
{
    return ((token == NULL) || (token == &nil_csv_token));
}
| 115 | |||
| 116 | internal void | ||
| 117 | csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token) | ||
| 118 | { | ||
| 119 | source_token_list->end_token->next_token = source_token; | ||
| 120 | source_token_list->end_token = source_token; | ||
| 121 | |||
| 122 | } | ||
| 123 | |||
//- concatenate 2 token lists so we can parse individual rows and then splice them together
| 125 | internal void | ||
| 126 | csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source) | ||
| 127 | { | ||
| 128 | |||
| 129 | csv_token *source_ct = source->start_token; | ||
| 130 | csv_token *destination_end_ct = destination->end_token; | ||
| 131 | |||
| 132 | for(;!is_nil_csv_token(source_ct); source_ct = source_ct->next_token) | ||
| 133 | { | ||
| 134 | destination_end_ct->next_token = source_ct; | ||
| 135 | } | ||
| 136 | |||
| 137 | destination->end_token = source_ct; | ||
| 138 | } | ||
| 139 | |||
#if 0
// stub: will lex a single row into its own token list so rows can be parsed
// independently and spliced together via csv_token_list_concat_list
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
    // csv_token_list *

}
#endif
| 148 | |||
| 149 | |||
// the lexer acts as a table builder from a csv file,
// parsing individual rows and columns;
// the next step would be building the b-tree
| 153 | internal csv_token * | ||
| 154 | tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list) | ||
| 155 | { | ||
| 156 | |||
| 157 | unused(token_list); | ||
| 158 | b32 finding_headers = TRUE; | ||
| 159 | |||
| 160 | if(buffer.size < 0) return NULL; | ||
| 161 | |||
| 162 | csv_token *tok = PushStruct(arena, csv_token); | ||
| 163 | |||
| 164 | // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit | ||
| 165 | // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? | ||
| 166 | // forgot what the solution was | ||
| 167 | // TODO(nasr): check what the problem here was | ||
| 168 | for(s32 index = 0; buffer.data[index] != '\0'; ++index) | ||
| 169 | { | ||
| 170 | u8 point = buffer.data[index]; | ||
| 171 | |||
| 172 | s32 start = 0; | ||
| 173 | s32 end = 0; | ||
| 174 | |||
| 175 | if(is_whitespace(point)) | ||
| 176 | { | ||
| 177 | warn("csv file is invalid, detected whitespace"); | ||
| 178 | return NULL; | ||
| 179 | } | ||
| 180 | |||
| 181 | |||
| 182 | if(point == '\n') | ||
| 183 | { | ||
| 184 | if(finding_headers) | ||
| 185 | { | ||
| 186 | #if 0 | ||
| 187 | string8 headers_buffer = {.data = &buffer.data[start], .size = end - start}; | ||
| 188 | #endif | ||
| 189 | finding_headers = FALSE; | ||
| 190 | |||
| 191 | { | ||
| 192 | //- map new header token list to table headers | ||
| 193 | } | ||
| 194 | } | ||
| 195 | #if 0 | ||
| 196 | else | ||
| 197 | { | ||
| 198 | |||
| 199 | } | ||
| 200 | #endif | ||
| 201 | |||
| 202 | |||
| 203 | table->row_count++; | ||
| 204 | } | ||
| 205 | else if(point == ',') | ||
| 206 | { | ||
| 207 | if (finding_headers) | ||
| 208 | { | ||
| 209 | table->header_count++; | ||
| 210 | } | ||
| 211 | } | ||
| 212 | |||
| 213 | switch(point) | ||
| 214 | { | ||
| 215 | case('\n'): | ||
| 216 | { | ||
| 217 | tok->flags |= FL; | ||
| 218 | break; | ||
| 219 | } | ||
| 220 | |||
| 221 | case(','): | ||
| 222 | { | ||
| 223 | end = index - 1; | ||
| 224 | start = index + 1; | ||
| 225 | break; | ||
| 226 | } | ||
| 227 | default: | ||
| 228 | { | ||
| 229 | break; | ||
| 230 | } | ||
| 231 | } | ||
| 232 | |||
| 233 | tok->lexeme = StringCast(&buffer.data[start], end - start); | ||
| 234 | tok->next_token = tok; | ||
| 235 | } | ||
| 236 | |||
| 237 | return tok; | ||
| 238 | } | ||
| 239 | |||
| 240 | //- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future | ||
| 241 | internal b_tree * | ||
| 242 | parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table) | ||
| 243 | { | ||
| 244 | b_tree *tree = PushStructZero(arena, b_tree); | ||
| 245 | b_tree_create(arena, tree); | ||
| 246 | |||
| 247 | // iterate over the token list while the token is not nil | ||
| 248 | for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token) | ||
| 249 | { | ||
| 250 | |||
| 251 | //- TODO(nasr): check initizalization or something tomorrow | ||
| 252 | { | ||
| 253 | //- are we parsing the first line tokens? | ||
| 254 | //- if so, do something :)) | ||
| 255 | if(ct->flags & FL) | ||
| 256 | { | ||
| 257 | // TODO(nasr): replace with nil header check function | ||
| 258 | if(table->header != &nil_csv_header || table->header == NULL) | ||
| 259 | { | ||
| 260 | #if 0 | ||
| 261 | // - no this should happen in the tokenization | ||
| 262 | table->headers->next = | ||
| 263 | #endif | ||
| 264 | } | ||
| 265 | else | ||
| 266 | { | ||
| 267 | |||
| 268 | } | ||
| 269 | |||
| 270 | } | ||
| 271 | } | ||
| 272 | |||
| 273 | // TODO(nasr): fix this logic tomorrow | ||
| 274 | csv_token *ct = PushStruct(arena, csv_token); | ||
| 275 | // skip structural ctens, only index values | ||
| 276 | if (ct->type != TOKEN_VALUE) | ||
| 277 | { | ||
| 278 | continue; | ||
| 279 | } | ||
| 280 | |||
| 281 | // NOTE(nasr): payload is the cten itself so the caller can reach | ||
| 282 | // row/col metadata without us having to copy it | ||
| 283 | // NOTE(nasr): heh why do we void cast again? | ||
| 284 | b_tree_insert(tree, ct->lexeme, (void *)ct); | ||
| 285 | } | ||
| 286 | |||
| 287 | return tree; | ||
| 288 | } | ||
| 289 | |||
