summaryrefslogtreecommitdiff
path: root/source
diff options
context:
space:
mode:
Diffstat (limited to 'source')
-rw-r--r--source/b_tree_impl.h (renamed from source/b_tree.h)49
-rw-r--r--source/csv_decoder.h289
-rw-r--r--source/csv_reader.h183
-rw-r--r--source/tb_db.c (renamed from source/engine.c)28
4 files changed, 346 insertions, 203 deletions
diff --git a/source/b_tree.h b/source/b_tree_impl.h
index 0c4b1e1..8eb0723 100644
--- a/source/b_tree.h
+++ b/source/b_tree_impl.h
@@ -4,16 +4,29 @@
4 4
5// maximum height of the tree the lower the lower the lower amount 5// maximum height of the tree the lower the lower the lower amount
6// of disk reads which translates into faster? 6// of disk reads which translates into faster?
7#define B_TREE_ORDER 4 7#if 0
8global_variable read_only s16 B_TREE_ORDER = 4;
9#endif
10#define B_TREE_ORDER 4
11
12//- NOTE(nasr): defining a key to improve sorting
13// i think saying that a key is a combination of the column + row is a good way of appraoching this
14typedef struct key key;
15struct key
16{
17 string8 header;
18 s32 row;
19};
8 20
9typedef struct b_tree_node b_tree_node; 21typedef struct b_tree_node b_tree_node;
10struct b_tree_node 22struct b_tree_node
11{ 23{
12 // store the values 24 // store the key values of the sub nodes? if they are leaves?
13 string8 keys[B_TREE_ORDER - 1]; 25 key keys[B_TREE_ORDER - 1];
14 // TODO(nasr): replace with something more generic? 26 // TODO(nasr): replace with something more generic?
15 // NOTE(nasr): cons of void * -> no type safety 27 // NOTE(nasr): cons of void * -> no type safety
16 // is there a way to still have some sort of that? 28 // is there a way to still have some sort of that?
29 // size not variable
17 void *payload_per_key[B_TREE_ORDER - 1]; 30 void *payload_per_key[B_TREE_ORDER - 1];
18 b_tree_node *parent; 31 b_tree_node *parent;
19 // handle to store children faster than linked list 32 // handle to store children faster than linked list
@@ -27,6 +40,10 @@ struct b_tree_node
27 // s32 *refc; 40 // s32 *refc;
28 s32 key_count; 41 s32 key_count;
29 b32 leaf; 42 b32 leaf;
43
44
45 // NOTE(nasr): do we hold the reference to the arena? or do we pass is it as a reference?
46 // this could solve memory location issues?
30}; 47};
31 48
32typedef struct b_tree b_tree; 49typedef struct b_tree b_tree;
@@ -55,9 +72,17 @@ btree_node_alloc(mem_arena *arena)
55internal s32 72internal s32
56btree_node_find_pos(string8 value, b_tree_node *node) 73btree_node_find_pos(string8 value, b_tree_node *node)
57{ 74{
75 unused(value);
76 unused(node);
77
78#if 0
58 s32 i = 0; 79 s32 i = 0;
59 for (; i < node->key_count && string8_cmp(node->keys[i], value) < 0; ++i); 80 for (; i < node->key_count && string8_cmp(node->keys[i], value) < 0; ++i);
60 return i; 81 return i;
82#endif
83
84 return 0;
85
61} 86}
62 87
63internal void 88internal void
@@ -68,10 +93,15 @@ b_tree_create(mem_arena *arena, b_tree *tree)
68 tree->root->key_count = 0; 93 tree->root->key_count = 0;
69} 94}
70 95
96
71// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory 97// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory
72internal void * 98internal void *
73b_tree_search(b_tree_node *node, string8 key) 99b_tree_search(b_tree_node *node, string8 key)
74{ 100{
101 unused(node);
102 unused(key);
103
104#if 0
75 s32 i = btree_node_find_pos(key, node); 105 s32 i = btree_node_find_pos(key, node);
76 106
77 if (i < node->key_count && string8_cmp(node->keys[i], key) == 0) 107 if (i < node->key_count && string8_cmp(node->keys[i], key) == 0)
@@ -83,12 +113,20 @@ b_tree_search(b_tree_node *node, string8 key)
83 return NULL; 113 return NULL;
84 } 114 }
85 return b_tree_search(node->children[i], key); 115 return b_tree_search(node->children[i], key);
116#endif
117
118 return NULL;
86} 119}
87 120
121
88// TODO(nasr): split node when key_count == B_TREE_ORDER - 1 (node is full) 122// TODO(nasr): split node when key_count == B_TREE_ORDER - 1 (node is full)
89internal void 123internal void
90b_tree_insert(mem_arena *arena, b_tree *tree, string8 key, void *payload) 124b_tree_insert(b_tree *tree, string8 key, void *payload)
91{ 125{
126 unused(tree);
127 unused(key);
128 unused(payload);
129#if 0
92 b_tree_node *current_node = tree->root; 130 b_tree_node *current_node = tree->root;
93 131
94 if (current_node->leaf) 132 if (current_node->leaf)
@@ -109,15 +147,18 @@ b_tree_insert(mem_arena *arena, b_tree *tree, string8 key, void *payload)
109 } 147 }
110 else { 148 else {
111 // TODO(nasr): creating a new branch / tree? 149 // TODO(nasr): creating a new branch / tree?
150 // make a seperate function for this
112 } 151 }
113 } 152 }
114 // TODO(nasr): internal node case walk down then split on the way back up 153 // TODO(nasr): internal node case walk down then split on the way back up
154#endif
115} 155}
116 156
117internal void 157internal void
118b_tree_write(b_tree *bt) 158b_tree_write(b_tree *bt)
119{ 159{
120 // TODO(nasr): write the b_tree to disk 160 // TODO(nasr): write the b_tree to disk
161 unused(bt);
121} 162}
122 163
123#endif /* B_TREE_IMPLEMENTATION */ 164#endif /* B_TREE_IMPLEMENTATION */
diff --git a/source/csv_decoder.h b/source/csv_decoder.h
new file mode 100644
index 0000000..b754ef5
--- /dev/null
+++ b/source/csv_decoder.h
@@ -0,0 +1,289 @@
1#ifndef ENGINE_LEXER_H
2#define ENGINE_LEXER_H
3
4enum csv_token_flags
5{
6 FL = 1 << 2,
7};
8
9enum csv_token_type
10{
11 // first 255 tokens for ascii characters
12 TOKEN_UNDEFINED = 255,
13 TOKEN_IDENTIFIER,
14 TOKEN_VALUE,
15};
16
17typedef struct csv_token csv_token;
18struct csv_token
19{
20 string8 lexeme;
21 csv_token *next_token;
22 enum csv_token_type type;
23 enum csv_token_flags flags;
24};
25
26// NOTE(nasr): i dont think im going to use this.
27typedef struct csv_row csv_row;
28struct csv_row
29{
30 // array of size col_count, points into mmap buffer
31 string8 *fields;
32 s32 count;
33};
34
35#if 0
36typedef struct csv_lntity csv_entity;
37struct csv_entity
38{
39 //- not needed because we use key header mapping i think
40};
41#endif
42
43typedef struct csv_header csv_header;
44struct csv_header
45{
46 string8 payload;
47 csv_header *next_header;
48};
49
50typedef struct csv_table csv_table;
51struct csv_table
52{
53 // first row, col names
54 // all data rows
55 csv_header *header;
56 s32 row_count;
57 s32 header_count;
58};
59
60
61typedef struct csv_token_list csv_token_list;
62struct csv_token_list
63{
64 csv_token *start_token;
65 csv_token *end_token;
66
67};
68
69read_only global_variable
70csv_token nil_csv_token=
71{
72 .lexeme = {.data = NULL, .size =0},
73 .type = 0,
74 .flags = 0,
75 .next_token = &nil_csv_token,
76
77};
78
79read_only global_variable
80csv_header nil_csv_header =
81{
82 .payload = {.data = NULL, .size = 0},
83 .next_header = &nil_csv_header,
84};
85
86read_only global_variable
87csv_token_list nil_csv_token_list =
88{
89 .start_token = &nil_csv_token,
90 .end_token = &nil_csv_token,
91};
92
93
94read_only global_variable
95csv_row nil_csv_row =
96{
97 .fields = &nil_string,
98 .count = 0,
99};
100
101read_only global_variable
102csv_table nil_csv_table =
103{
104 .header = &nil_csv_header,
105 .row_count = 0,
106};
107
108#endif /* ENGINE_LEXER_H */
109
110internal b32
111is_nil_csv_token(csv_token *token)
112{
113 return ((token == NULL) || (token == &nil_csv_token));
114}
115
116internal void
117csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
118{
119 source_token_list->end_token->next_token = source_token;
120 source_token_list->end_token = source_token;
121
122}
123
124//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother
125internal void
126csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
127{
128
129 csv_token *source_ct = source->start_token;
130 csv_token *destination_end_ct = destination->end_token;
131
132 for(;!is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
133 {
134 destination_end_ct->next_token = source_ct;
135 }
136
137 destination->end_token = source_ct;
138}
139
140#if 0
141internal csv_token_list *
142parse_csv_row(string8 row_buffer)
143{
144 // csv_token_list *
145
146}
147#endif
148
149
150// the lexer acts as a table builder from a csv file
151// and parsing indivudal rows and columns
152// the next step would be building a the b-tree
153internal csv_token *
154tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
155{
156
157 unused(token_list);
158 b32 finding_headers = TRUE;
159
160 if(buffer.size < 0) return NULL;
161
162 csv_token *tok = PushStruct(arena, csv_token);
163
164 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
165 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
166 // forgot what the solution was
167 // TODO(nasr): check what the problem here was
168 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
169 {
170 u8 point = buffer.data[index];
171
172 s32 start = 0;
173 s32 end = 0;
174
175 if(is_whitespace(point))
176 {
177 warn("csv file is invalid, detected whitespace");
178 return NULL;
179 }
180
181
182 if(point == '\n')
183 {
184 if(finding_headers)
185 {
186#if 0
187 string8 headers_buffer = {.data = &buffer.data[start], .size = end - start};
188#endif
189 finding_headers = FALSE;
190
191 {
192 //- map new header token list to table headers
193 }
194 }
195#if 0
196 else
197 {
198
199 }
200#endif
201
202
203 table->row_count++;
204 }
205 else if(point == ',')
206 {
207 if (finding_headers)
208 {
209 table->header_count++;
210 }
211 }
212
213 switch(point)
214 {
215 case('\n'):
216 {
217 tok->flags |= FL;
218 break;
219 }
220
221 case(','):
222 {
223 end = index - 1;
224 start = index + 1;
225 break;
226 }
227 default:
228 {
229 break;
230 }
231 }
232
233 tok->lexeme = StringCast(&buffer.data[start], end - start);
234 tok->next_token = tok;
235 }
236
237 return tok;
238}
239
240//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
241internal b_tree *
242parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
243{
244 b_tree *tree = PushStructZero(arena, b_tree);
245 b_tree_create(arena, tree);
246
247 // iterate over the token list while the token is not nil
248 for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
249 {
250
251 //- TODO(nasr): check initizalization or something tomorrow
252 {
253 //- are we parsing the first line tokens?
254 //- if so, do something :))
255 if(ct->flags & FL)
256 {
257 // TODO(nasr): replace with nil header check function
258 if(table->header != &nil_csv_header || table->header == NULL)
259 {
260#if 0
261 // - no this should happen in the tokenization
262 table->headers->next =
263#endif
264 }
265 else
266 {
267
268 }
269
270 }
271 }
272
273 // TODO(nasr): fix this logic tomorrow
274 csv_token *ct = PushStruct(arena, csv_token);
275 // skip structural ctens, only index values
276 if (ct->type != TOKEN_VALUE)
277 {
278 continue;
279 }
280
281 // NOTE(nasr): payload is the cten itself so the caller can reach
282 // row/col metadata without us having to copy it
283 // NOTE(nasr): heh why do we void cast again?
284 b_tree_insert(tree, ct->lexeme, (void *)ct);
285 }
286
287 return tree;
288}
289
diff --git a/source/csv_reader.h b/source/csv_reader.h
deleted file mode 100644
index f5205bf..0000000
--- a/source/csv_reader.h
+++ /dev/null
@@ -1,183 +0,0 @@
1#ifndef ENGINE_LEXER_H
2#define ENGINE_LEXER_H
3
4typedef enum csv_token_flags csv_token_flags;
5enum csv_token_flags
6{
7 START_FL = 1 << 1,
8 END_FL = 1 << 2,
9};
10
11typedef enum csv_token_type csv_token_type;
12enum csv_token_type
13{
14 // first 255 tokens for ascii characters
15 TOKEN_UNDEFINED = 255,
16 TOKEN_IDENTIFIER,
17 TOKEN_VALUE,
18};
19
20typedef struct csv_token csv_token;
21struct csv_token
22{
23 string8 lexeme;
24 csv_token_type type;
25 csv_token_flags flags;
26 csv_token *next;
27};
28
29// NOTE(nasr): i dont think im going to use this.
30typedef struct csv_row csv_row;
31struct csv_row
32{
33 // array of size col_count, points into mmap buffer
34 string8 *fields;
35 s32 count;
36};
37
38typedef struct csv_table csv_table;
39struct csv_table
40{
41 // first row, col names
42 // all data rows
43 string8 *headers;
44 csv_row *rows;
45 s32 col_count;
46 s32 row_count;
47};
48
49
50typedef struct csv_token_list csv_token_list;
51struct csv_token_list
52{
53 csv_token *start_token;
54 csv_token *end_token;
55
56};
57
58read_only global_variable
59csv_token nil_csv_token=
60{
61 .lexeme = {.data = NULL, .size =0},
62 .type = (csv_token_type)0,
63 .flags = 0,
64 .next = &nil_csv_token,
65
66};
67
68read_only global_variable
69csv_token_list nil_csv_token_list =
70{
71 .start_token = &nil_csv_token,
72 .end_token = &nil_csv_token,
73};
74
75
76read_only global_variable
77csv_row nil_csv_row =
78{
79 .fields = &nil_string,
80 .count = 0,
81};
82
83read_only global_variable
84csv_table nil_csv_table =
85{
86 .headers = &nil_string,
87 .rows = &nil_csv_row,
88 .col_count = 0,
89 .row_count = 0,
90};
91
92#endif /* ENGINE_LEXER_H */
93
94// the lexer acts as a table builder from a csv file
95// and parsing indivudal rows and columns
96// the next step would be building a the b-tree
97internal csv_token *
98tokenize_csv(string8 buffer, mem_arena *arena)
99{
100 b32 FL = TRUE;
101
102 if(buffer.size < 0) return NULL;
103
104 csv_token *tok = PushStruct(arena, csv_token);
105
106 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
107 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
108 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
109 {
110 u8 point = buffer.data[index];
111
112 s32 start = 0;
113 s32 end = 0;
114
115 if(is_whitespace(point))
116 {
117 warn("csv file is invalid, detected whitespace");
118 return NULL;
119 }
120
121 switch(point)
122 {
123 case('\n'):
124 {
125 if(FL) tok->flags |= END_FL;
126 break;
127 }
128
129 case(','):
130 {
131 end = index - 1;
132 start = index + 1;
133 break;
134 }
135 default:
136 {
137 break;
138 }
139 }
140
141 tok->lexeme = StringCast(&buffer.data[start], end - start);
142 tok->next = tok;
143 }
144
145 return tok;
146}
147
148internal void
149read_csv(string8 buffer)
150{
151 // printf("\nsize:%lu\ndata %s\n", buffer.size, buffer.data);
152
153}
154
155internal b_tree *
156parse_csv(mem_arena *arena, csv_token_list *ctl)
157{
158 b_tree *tree = PushStructZero(arena, b_tree);
159 b_tree_create(arena, tree);
160
161 //- TODO(nasr): check initizalization or something tomorrow
162 {
163
164 }
165 // TODO(nasr): fix this logic tomorrow
166 csv_token *ct = PushStruct(arena, csv_token);
167
168 for (;ct != NULL; ct = ct->next)
169 {
170 // skip structural ctens, only index values
171 if (ct->type != TOKEN_VALUE)
172 {
173 continue;
174 }
175
176 // NOTE(nasr): payload is the cten itself so the caller can reach
177 // row/col metadata without us having to copy it
178 // NOTE(nasr): heh why do we void cast again?
179 b_tree_insert(arena, tree, ct->lexeme, (void *)ct);
180 }
181
182 return tree;
183}
diff --git a/source/engine.c b/source/tb_db.c
index 106f113..b992111 100644
--- a/source/engine.c
+++ b/source/tb_db.c
@@ -1,6 +1,3 @@
1
2
3
4#define B_TREE_IMPLEMENTATION 1#define B_TREE_IMPLEMENTATION
5#define BASE_UNITY 2#define BASE_UNITY
6#include "base/base_include.h" 3#include "base/base_include.h"
@@ -33,11 +30,10 @@ internal b32
33is_delimiter(u8 point) 30is_delimiter(u8 point)
34{ 31{
35 return (point == ','); 32 return (point == ',');
36
37} 33}
38 34
39#include "b_tree.h" 35#include "b_tree_impl.h"
40#include "csv_reader.h" 36#include "csv_decoder.h"
41 37
42typedef struct query_token query_token; 38typedef struct query_token query_token;
43struct query_token 39struct query_token
@@ -80,7 +76,6 @@ is_nil_query_token_list(query_token *token)
80 return (token == &nil_query_token) || (token == NULL); 76 return (token == &nil_query_token) || (token == NULL);
81} 77}
82 78
83
84// takes on line of the repl input 79// takes on line of the repl input
85// return a reference to the passed list 80// return a reference to the passed list
86internal query_token_list * 81internal query_token_list *
@@ -90,7 +85,7 @@ query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list)
90 unused(initialized); 85 unused(initialized);
91 86
92 for (u64 index = 0; index < buffer->size; ++index) 87 for (u64 index = 0; index < buffer->size; ++index)
93 { 88 {
94 u8 codepoint = buffer->data[index]; 89 u8 codepoint = buffer->data[index];
95 90
96 if(codepoint == '\n' || codepoint == '\r') break; 91 if(codepoint == '\n' || codepoint == '\r') break;
@@ -98,10 +93,7 @@ query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list)
98 s32 start = 0; 93 s32 start = 0;
99 s32 end = 0; 94 s32 end = 0;
100 95
101 if(is_whitespace(codepoint)) 96 if(is_whitespace(codepoint)) end = index;
102 {
103 end = index;
104 }
105 97
106 // save the token 98 // save the token
107 // TODO(nasr): work on the string macros cuz no work 99 // TODO(nasr): work on the string macros cuz no work
@@ -185,19 +177,23 @@ int main(int count, char **value)
185 } 177 }
186 178
187 { 179 {
188 read_csv(buffer);
189 180
190 csv_token *tokens = tokenize_csv(buffer, global_arena); 181 // NOTE(nasr): the use of tables is required for tracking headers etc.
182 // i think we can optimize this away in the future but for now its fine
183 csv_table *table = PushStruct(global_arena, csv_table);
184
185 csv_token_list *token_list = PushStruct(global_arena, csv_token_list);
186
187 csv_token *tokens = tokenize_csv(buffer, global_arena, table, token_list);
191 188
192 assert_msg(tokens != NULL, "Tokens are NULL."); 189 assert_msg(tokens != NULL, "Tokens are NULL.");
193 190
194 csv_token_list *ctl = PushStruct(global_arena, csv_token_list); 191 csv_token_list *ctl = PushStruct(global_arena, csv_token_list);
195 b_tree *bt = parse_csv(global_arena, ctl); 192 b_tree *bt = parse_csv(global_arena, ctl, table);
196 193
197 b_tree_write(bt); 194 b_tree_write(bt);
198 } 195 }
199 196
200
201 // NOTE(nasr): not sure on how to approach the b-tree and the table format thing 197 // NOTE(nasr): not sure on how to approach the b-tree and the table format thing
202 // we kind of want our table format i think? but i wouldnt be sure about the use case 198 // we kind of want our table format i think? but i wouldnt be sure about the use case
203 // so we stick to the regular b_tree for now. commenting out the tables. 199 // so we stick to the regular b_tree for now. commenting out the tables.