source/tb_db/csv_decoder.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294

#ifndef ENGINE_LEXER_H
#define ENGINE_LEXER_H

//- bit flags stored in csv_token.flags
enum csv_token_flags
{
    // set by tokenize_csv on the token it emits when it reaches a '\n',
    // i.e. the last field of a line; parse_csv uses it to advance to the next row
    FL      = 1 << 2,
};

//- kind tag stored in csv_token.type
enum csv_token_type
{
    // values below 255 are left free for single ascii characters;
    // the named kinds start counting at 255.
    // NOTE(review): 255 is itself a valid u8 value, so a raw byte 0xFF would
    // collide with TOKEN_UNDEFINED - confirm whether 256 was intended.
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    // the only kind tokenize_csv currently produces
    TOKEN_VALUE,
};

//- one lexed csv field, stored as a node of a singly linked list
typedef struct csv_token csv_token;
struct csv_token
{
    // text of the field; tokenize_csv builds it with StringCast from a pointer
    // into the input buffer plus a length (presumably a view, not a copy - confirm)
    string8 lexeme;
    // following node; tokenize_csv initialises it to &nil_csv_token
    csv_token *next_token;
    // token kind, see enum csv_token_type
    enum csv_token_type type;
    // bit set of enum csv_token_flags (FL marks the last field of a line)
    enum csv_token_flags flags;
};

// NOTE(nasr): I don't think I'm going to use this.
//- one csv row as an array of fields; among the code visible in this file
//- it is only referenced by nil_csv_row
typedef struct csv_row csv_row;
struct csv_row
{
    // array of size col_count, points into mmap buffer
    string8 *fields;
    // number of entries in fields
    s32      count;
};

#if 0
// NOTE: disabled placeholder, excluded from compilation.
// The typedef must name the same tag as the struct below, otherwise
// csv_entity would stay an incomplete type once this block is enabled.
typedef struct csv_entity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif

//- one column header, stored as a node of a singly linked list
typedef struct csv_header csv_header;
struct csv_header 
{
    // header text
    string8  payload;
    // following header; the nil_csv_header sentinel points back at itself
    csv_header *next_header;
};

//- bookkeeping that tokenize_csv updates while it scans a csv buffer
typedef struct csv_table csv_table;
struct csv_table
{
    // head of the column header list (names from the first csv line);
    // the functions visible in this file never fill it in yet
    csv_header  *header;
    // line counter maintained by tokenize_csv
    s32         row_count;
    // column header counter, only advanced while finding_headers is set
    s32         header_count;
    // set by the caller before tokenizing; tokenize_csv clears it once the
    // first line has been consumed
    b32         finding_headers;
};


//- head/tail pair describing a singly linked list of csv_token nodes
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
    // first node of the list (what tokenize_csv returns and parse_csv starts from)
    csv_token *start_token;
    // last node of the list; new tokens are linked after it
    csv_token *end_token;
};

//- shared sentinel token: empty lexeme, zeroed type and flags,
//- and a next pointer that refers back to the sentinel itself
read_only global_variable
csv_token nil_csv_token =
{
    .lexeme     = {.data = NULL, .size = 0},
    .next_token = &nil_csv_token,
    .type       = 0,
    .flags      = 0,
};

//- shared sentinel header: empty payload, next pointer refers back to itself
read_only global_variable
csv_header nil_csv_header =
{
    .payload     = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};

//- shared empty list: head and tail both refer to the nil token sentinel
read_only global_variable
csv_token_list nil_csv_token_list = {.start_token = &nil_csv_token, .end_token = &nil_csv_token};

//- shared empty row: zero fields, field pointer refers to nil_string
read_only global_variable
csv_row nil_csv_row = {.fields = &nil_string, .count = 0};

//- shared empty table: nil header list and every counter/flag zero
read_only global_variable
csv_table nil_csv_table =
{
    .header          = &nil_csv_header,
    .row_count       = 0,
    .header_count    = 0,
    .finding_headers = 0,
};

#endif /* ENGINE_LEXER_H */

//- returns non-zero when token is NULL or is the shared nil_csv_token sentinel,
//- zero for any other pointer
internal b32
is_nil_csv_token(csv_token *token)
{
    if(token == NULL)
    {
        return 1;
    }

    return (token == &nil_csv_token);
}

//- append source_token to the tail of source_token_list.
//-
//- An empty list has a NULL or nil-sentinel tail. Writing through that tail
//- either dereferences NULL or stores into the read-only nil_csv_token, which
//- was the segfault noted here before. The tail is therefore only linked when
//- it is a real node, and the head is set when the list had none.
//-
//- source_token_list: list to extend; ignored when NULL
//- source_token:      node to append; ignored when NULL or the nil sentinel.
//-                    Its next_token is left untouched, so the caller is
//-                    expected to have terminated it already.
internal void
csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
{
    if(source_token_list == NULL || is_nil_csv_token(source_token))
    {
        return;
    }

    // only link behind a real tail node, never behind NULL / the sentinel
    if(!is_nil_csv_token(source_token_list->end_token))
    {
        source_token_list->end_token->next_token = source_token;
    }

    // first real node also becomes the head so the list can be walked later
    if(is_nil_csv_token(source_token_list->start_token))
    {
        source_token_list->start_token = source_token;
    }

    source_token_list->end_token = source_token;
}

//- concatenate two token lists so rows can be tokenized individually and
//- stitched together afterwards.
//-
//- The nodes of source are linked behind the tail of destination; nothing is
//- copied, so both lists share those nodes afterwards. source itself is not
//- modified.
//-
//- destination: list that receives the nodes; ignored when NULL
//- source:      list whose nodes are appended; ignored when NULL, empty, or
//-              the same list as destination (linking a list to itself would
//-              create a cycle)
internal void
csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
{
    if(destination == NULL || source == NULL || destination == source)
    {
        return;
    }

    if(is_nil_csv_token(source->start_token))
    {
        return;
    }

    // walk to the last real node of source (the one right before the sentinel)
    csv_token *source_tail = source->start_token;
    while(!is_nil_csv_token(source_tail->next_token))
    {
        source_tail = source_tail->next_token;
    }

    // an empty destination has a NULL / sentinel tail that must not be written to
    if(!is_nil_csv_token(destination->end_token))
    {
        destination->end_token->next_token = source->start_token;
    }

    if(is_nil_csv_token(destination->start_token))
    {
        destination->start_token = source->start_token;
    }

    destination->end_token = source_tail;
}

//- NOTE: disabled stub, excluded from compilation by the #if 0 guard;
//- it has no body yet and would not return a value if enabled as is
#if 0
internal csv_token_list *
parse_csv_row(string8 row_buffer)
{
    // csv_token_list *

}
#endif


// the lexer acts as a table builder from a csv  file
// and parsing indivudal rows and columns
// the next step would be building a the b-tree
internal csv_token *
tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
{
    unused(token_list);

    if(buffer.size == 0) return NULL;

    // URGENT(nasr): segfaulting because memcpy of strring value doesnt  work dammit
    // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
    // forgot what the solution was
    // TODO(nasr): check what the problem here was

    // string size tracking across the loop not inside it
    s32 start = 0;

    for(s32 index = 0; buffer.data[index] != '\0'; ++index)
    {
        u8 point = buffer.data[index];

#if 0
        if(is_whitespace(point))
        {
            warn("csv file is invalid, detected whitespace");
            return NULL;
        }
#endif

        if(point == ',')
        {
            // emit a token for the field that ended before this comma
            csv_token *token  = PushStructZero(arena, csv_token);

            assert_msg(token != NULL, "did the push struct fail??");
            assert_msg(arena->current_position < arena->capacity, "no more arena size");

            token->lexeme     = StringCast(&buffer.data[start], index - start);
            token->type       = TOKEN_VALUE;
            token->next_token = &nil_csv_token;
            csv_token_list_append_token(token_list, token);

            start = index + 1;

            if(table->finding_headers)
            {
                table->header_count++;
            }
        }
        else if(point == '\n')
        {
            // emit a token for the field that ended at this newline
            csv_token *token  = PushStructZero(arena, csv_token);
            token->lexeme     = StringCast(&buffer.data[start], index - start);
            token->type       = TOKEN_VALUE;
            token->flags     |= FL;
            token->next_token = &nil_csv_token;

            assert_msg(token_list, "token list invalid");
            assert_msg(token, "you're tring to append an invalid token");

            csv_token_list_append_token(token_list, token);

            start = index + 1;

            if(table->finding_headers)
            {
                {
                    //- map new header token list to table headers
                }
                table->finding_headers = FALSE;
            }

            table->row_count++;
        }
    }

    // NOTE(nasr): return the first token the caller can walk the list from token_list
    return token_list->start_token;
}

//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
//-
//- Build a btree that indexes every value token of ctl by its position.
//-
//- Tokens are visited in list order. Each TOKEN_VALUE token is inserted under
//- the key {header_index = column, row_index = row}; the payload is the token
//- itself, so the caller can reach the lexeme without a copy. A token carrying
//- FL is the last field of its line: it is indexed like any other value and
//- only then does the row advance and the column reset. (Skipping it would
//- lose the last column of every row.)
//-
//- arena: allocator for the tree and its nodes
//- ctl:   token list to index
//- table: currently unused, kept for the existing call sites
//-
//- returns the new tree, or NULL when arena or ctl is NULL or the tree could
//- not be allocated
internal btree *
parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
{
    unused(table);

    if(arena == NULL || ctl == NULL)
    {
        return NULL;
    }

    btree *tree = PushStructZero(arena, btree);
    if(tree == NULL)
    {
        return NULL;
    }

    s32 col_index = 0;
    s32 row_index = 0;

    // iterate over the token list while the token is not nil
    for(csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
    {
        // only actual cell values are indexed
        if(ct->type == TOKEN_VALUE)
        {
            key k = {
                .header_index = col_index,
                .row_index    = row_index,
            };

            btree_insert(arena, tree, k, (void *)ct);
        }

        if(ct->flags & FL)
        {
            // FL marks end-of-line; advance row, reset col
            row_index++;
            col_index = 0;
        }
        else
        {
            col_index++;
        }
    }

    return tree;
}