diff options
Diffstat (limited to 'source/csv_decoder.h')
| -rw-r--r-- | source/csv_decoder.h | 144 |
1 files changed, 72 insertions, 72 deletions
diff --git a/source/csv_decoder.h b/source/csv_decoder.h index 446f1da..3d09dc6 100644 --- a/source/csv_decoder.h +++ b/source/csv_decoder.h | |||
| @@ -55,6 +55,7 @@ struct csv_table | |||
| 55 | csv_header *header; | 55 | csv_header *header; |
| 56 | s32 row_count; | 56 | s32 row_count; |
| 57 | s32 header_count; | 57 | s32 header_count; |
| 58 | b32 finding_headers; | ||
| 58 | }; | 59 | }; |
| 59 | 60 | ||
| 60 | 61 | ||
| @@ -63,17 +64,15 @@ struct csv_token_list | |||
| 63 | { | 64 | { |
| 64 | csv_token *start_token; | 65 | csv_token *start_token; |
| 65 | csv_token *end_token; | 66 | csv_token *end_token; |
| 66 | |||
| 67 | }; | 67 | }; |
| 68 | 68 | ||
| 69 | read_only global_variable | 69 | read_only global_variable |
| 70 | csv_token nil_csv_token= | 70 | csv_token nil_csv_token= |
| 71 | { | 71 | { |
| 72 | .lexeme = {.data = NULL, .size =0}, | 72 | .lexeme = {.data = NULL, .size = 0}, |
| 73 | .type = 0, | 73 | .type = 0, |
| 74 | .flags = 0, | 74 | .flags = 0, |
| 75 | .next_token = &nil_csv_token, | 75 | .next_token = &nil_csv_token, |
| 76 | |||
| 77 | }; | 76 | }; |
| 78 | 77 | ||
| 79 | read_only global_variable | 78 | read_only global_variable |
| @@ -90,7 +89,6 @@ csv_token_list nil_csv_token_list = | |||
| 90 | .end_token = &nil_csv_token, | 89 | .end_token = &nil_csv_token, |
| 91 | }; | 90 | }; |
| 92 | 91 | ||
| 93 | |||
| 94 | read_only global_variable | 92 | read_only global_variable |
| 95 | csv_row nil_csv_row = | 93 | csv_row nil_csv_row = |
| 96 | { | 94 | { |
| @@ -113,28 +111,32 @@ is_nil_csv_token(csv_token *token) | |||
| 113 | return ((token == NULL) || (token == &nil_csv_token)); | 111 | return ((token == NULL) || (token == &nil_csv_token)); |
| 114 | } | 112 | } |
| 115 | 113 | ||
| 114 | // TODO(nasr): segfaulting because end_token not allocated | ||
| 116 | internal void | 115 | internal void |
| 117 | csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token) | 116 | csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token) |
| 118 | { | 117 | { |
| 119 | source_token_list->end_token->next_token = source_token; | 118 | source_token_list->end_token->next_token = source_token; |
| 120 | source_token_list->end_token = source_token; | 119 | source_token_list->end_token = source_token; |
| 121 | |||
| 122 | } | 120 | } |
| 123 | 121 | ||
| 124 | //- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother | 122 | //- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother |
| 125 | internal void | 123 | internal void |
| 126 | csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source) | 124 | csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source) |
| 127 | { | 125 | { |
| 126 | if(is_nil_csv_token(source->start_token)) return; | ||
| 128 | 127 | ||
| 129 | csv_token *source_ct = source->start_token; | 128 | csv_token *source_ct = source->start_token; |
| 130 | csv_token *destination_end_ct = destination->end_token; | 129 | csv_token *destination_et = destination->end_token; |
| 131 | 130 | ||
| 132 | for(;!is_nil_csv_token(source_ct); source_ct = source_ct->next_token) | 131 | // walk source and stitch each node onto destination's tail |
| 133 | { | 132 | for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token) |
| 134 | destination_end_ct->next_token = source_ct; | 133 | { |
| 135 | } | 134 | destination_et->next_token = source_ct; |
| 135 | destination_et = source_ct; | ||
| 136 | } | ||
| 136 | 137 | ||
| 137 | destination->end_token = source_ct; | 138 | // destination_et now points at the last real source node (not the nil sentinel) |
| 139 | destination->end_token = destination_et; | ||
| 138 | } | 140 | } |
| 139 | 141 | ||
| 140 | #if 0 | 142 | #if 0 |
| @@ -153,88 +155,80 @@ parse_csv_row(string8 row_buffer) | |||
| 153 | internal csv_token * | 155 | internal csv_token * |
| 154 | tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list) | 156 | tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list) |
| 155 | { | 157 | { |
| 156 | |||
| 157 | unused(token_list); | 158 | unused(token_list); |
| 158 | b32 finding_headers = TRUE; | ||
| 159 | 159 | ||
| 160 | if(buffer.size == 0) return NULL; | 160 | if(buffer.size == 0) return NULL; |
| 161 | 161 | ||
| 162 | csv_token *tok = PushStruct(arena, csv_token); | ||
| 163 | |||
| 164 | // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit | 162 | // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit |
| 165 | // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? | 163 | // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING??? |
| 166 | // forgot what the solution was | 164 | // forgot what the solution was |
| 167 | // TODO(nasr): check what the problem here was | 165 | // TODO(nasr): check what the problem here was |
| 166 | |||
| 167 | // string size tracking across the loop not inside it | ||
| 168 | s32 start = 0; | ||
| 169 | |||
| 168 | for(s32 index = 0; buffer.data[index] != '\0'; ++index) | 170 | for(s32 index = 0; buffer.data[index] != '\0'; ++index) |
| 169 | { | 171 | { |
| 170 | u8 point = buffer.data[index]; | 172 | u8 point = buffer.data[index]; |
| 171 | 173 | ||
| 172 | s32 start = 0; | 174 | #if 0 |
| 173 | s32 end = 0; | ||
| 174 | |||
| 175 | if(is_whitespace(point)) | 175 | if(is_whitespace(point)) |
| 176 | { | 176 | { |
| 177 | warn("csv file is invalid, detected whitespace"); | 177 | warn("csv file is invalid, detected whitespace"); |
| 178 | return NULL; | 178 | return NULL; |
| 179 | } | 179 | } |
| 180 | #endif | ||
| 180 | 181 | ||
| 181 | 182 | if(point == ',') | |
| 182 | if(point == '\n') | ||
| 183 | { | 183 | { |
| 184 | if(finding_headers) | 184 | // emit a token for the field that ended before this comma |
| 185 | { | 185 | csv_token *token = PushStructZero(arena, csv_token); |
| 186 | #if 0 | ||
| 187 | string8 headers_buffer = {.data = &buffer.data[start], .size = end - start}; | ||
| 188 | #endif | ||
| 189 | finding_headers = FALSE; | ||
| 190 | 186 | ||
| 191 | { | 187 | assert_msg(token != NULL, "did the push struct fail??"); |
| 192 | //- map new header token list to table headers | 188 | assert_msg(arena->current_position < arena->capacity, "no more arena size"); |
| 193 | } | ||
| 194 | } | ||
| 195 | #if 0 | ||
| 196 | else | ||
| 197 | { | ||
| 198 | 189 | ||
| 199 | } | 190 | token->lexeme = StringCast(&buffer.data[start], index - start); |
| 200 | #endif | 191 | token->type = TOKEN_VALUE; |
| 192 | token->next_token = &nil_csv_token; | ||
| 193 | csv_token_list_append_token(token_list, token); | ||
| 201 | 194 | ||
| 195 | start = index + 1; | ||
| 202 | 196 | ||
| 203 | table->row_count++; | 197 | if(table->finding_headers) |
| 204 | } | ||
| 205 | else if(point == ',') | ||
| 206 | { | ||
| 207 | if (finding_headers) | ||
| 208 | { | 198 | { |
| 209 | table->header_count++; | 199 | table->header_count++; |
| 210 | } | 200 | } |
| 211 | } | 201 | } |
| 212 | 202 | else if(point == '\n') | |
| 213 | switch(point) | ||
| 214 | { | 203 | { |
| 215 | case('\n'): | 204 | // emit a token for the field that ended at this newline |
| 216 | { | 205 | csv_token *token = PushStructZero(arena, csv_token); |
| 217 | tok->flags |= FL; | 206 | token->lexeme = StringCast(&buffer.data[start], index - start); |
| 218 | break; | 207 | token->type = TOKEN_VALUE; |
| 219 | } | 208 | token->flags |= FL; |
| 209 | token->next_token = &nil_csv_token; | ||
| 220 | 210 | ||
| 221 | case(','): | 211 | assert_msg(token_list, "token list invalid"); |
| 222 | { | 212 | assert_msg(token, "you're tring to append an invalid token"); |
| 223 | end = index - 1; | 213 | |
| 224 | start = index + 1; | 214 | csv_token_list_append_token(token_list, token); |
| 225 | break; | 215 | |
| 226 | } | 216 | start = index + 1; |
| 227 | default: | 217 | |
| 218 | if(table->finding_headers) | ||
| 219 | { | ||
| 228 | { | 220 | { |
| 229 | break; | 221 | //- map new header token list to table headers |
| 230 | } | 222 | } |
| 231 | } | 223 | table->finding_headers = FALSE; |
| 224 | } | ||
| 232 | 225 | ||
| 233 | tok->lexeme = StringCast(&buffer.data[start], end - start); | 226 | table->row_count++; |
| 234 | tok->next_token = tok; | 227 | } |
| 235 | } | 228 | } |
| 236 | 229 | ||
| 237 | return tok; | 230 | // NOTE(nasr): return the first token the caller can walk the list from token_list |
| 231 | return token_list->start_token; | ||
| 238 | } | 232 | } |
| 239 | 233 | ||
| 240 | //- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future | 234 | //- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future |
| @@ -243,18 +237,24 @@ parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table) | |||
| 243 | { | 237 | { |
| 244 | btree *tree = PushStructZero(arena, btree); | 238 | btree *tree = PushStructZero(arena, btree); |
| 245 | 239 | ||
| 240 | s32 col_index = 0; | ||
| 241 | s32 row_index = 0; | ||
| 242 | |||
| 246 | // iterate over the token list while the token is not nil | 243 | // iterate over the token list while the token is not nil |
| 247 | for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token) | 244 | for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token) |
| 248 | { | 245 | { |
| 249 | |||
| 250 | //- TODO(nasr): check initizalization or something tomorrow | ||
| 251 | { | 246 | { |
| 252 | //- are we parsing the first line tokens? | 247 | //- are we parsing the first line tokens? |
| 253 | //- if so, do something :)) | 248 | //- if so, do something :)) |
| 254 | if(ct->flags & FL) | 249 | if(ct->flags & FL) |
| 255 | { | 250 | { |
| 251 | // NOTE(nasr): FL marks end-of-line; advance row, reset col | ||
| 252 | row_index++; | ||
| 253 | col_index = 0; | ||
| 254 | |||
| 256 | // TODO(nasr): replace with nil header check function | 255 | // TODO(nasr): replace with nil header check function |
| 257 | if(table->header != &nil_csv_header || table->header == NULL) | 256 | // NOTE(nasr): == nil means header hasn't been set yet |
| 257 | if(table->header == &nil_csv_header || table->header == NULL) | ||
| 258 | { | 258 | { |
| 259 | #if 0 | 259 | #if 0 |
| 260 | // - no this should happen in the tokenization | 260 | // - no this should happen in the tokenization |
| @@ -265,30 +265,30 @@ parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table) | |||
| 265 | { | 265 | { |
| 266 | 266 | ||
| 267 | } | 267 | } |
| 268 | |||
| 269 | // FL tokens are structural, no value to index | ||
| 270 | continue; | ||
| 268 | } | 271 | } |
| 269 | } | 272 | } |
| 270 | 273 | ||
| 271 | // TODO(nasr): fix this logic tomorrow | 274 | // skip non-value tokens, only index actual cell values |
| 272 | csv_token *ct = PushStruct(arena, csv_token); | ||
| 273 | // skip structural ctens, only index values | ||
| 274 | if (ct->type != TOKEN_VALUE) | 275 | if (ct->type != TOKEN_VALUE) |
| 275 | { | 276 | { |
| 277 | col_index++; | ||
| 276 | continue; | 278 | continue; |
| 277 | } | 279 | } |
| 278 | 280 | ||
| 279 | // NOTE(nasr): payload is the cten itself so the caller can reach | 281 | // NOTE(nasr): payload is the cten itself so the caller can reach |
| 280 | // row/col metadata without us having to copy it | 282 | // row/col metadata without us having to copy it |
| 281 | // NOTE(nasr): heh why do we void cast again? | ||
| 282 | |||
| 283 | key k = { | 283 | key k = { |
| 284 | .header_index = 1, | 284 | .header_index = col_index, |
| 285 | .row_index = 1, | 285 | .row_index = row_index, |
| 286 | }; | 286 | }; |
| 287 | 287 | ||
| 288 | // btree_insert(arena, tree, (key)ct->lexeme, (void *)ct); | ||
| 289 | btree_insert(arena, tree, k, (void *)ct); | 288 | btree_insert(arena, tree, k, (void *)ct); |
| 289 | |||
| 290 | col_index++; | ||
| 290 | } | 291 | } |
| 291 | 292 | ||
| 292 | return tree; | 293 | return tree; |
| 293 | } | 294 | } |
| 294 | |||
