summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authornasr <nsrddyn@gmail.com>2026-03-16 19:20:23 +0000
committernasr <nsrddyn@gmail.com>2026-03-16 19:20:23 +0000
commit180ccc84aac07c7bee2b09a6e07f7406908409b9 (patch)
treeefa39665e41c3132626f2c08b2f3ae0d18adc17a
parent2e258673171c2e4663a8b5d58e2ad174bb0ecd96 (diff)
feature(main): lots of stuff see description
1. increased compile time warnings to help with some optimizations. 2. impelmeented csv lexing helper functions that do stuff on tokenlists like appending and concatenating lists with each other 3. realiszed that btree design in faulty so disabled it and will refactor it in the next approach
-rw-r--r--Makefile5
-rw-r--r--source/b_tree_impl.h (renamed from source/b_tree.h)49
-rw-r--r--source/csv_decoder.h289
-rw-r--r--source/csv_reader.h183
-rw-r--r--source/tb_db.c (renamed from source/engine.c)28
5 files changed, 348 insertions, 206 deletions
diff --git a/Makefile b/Makefile
index 0224a42..39e9b87 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,7 @@
1BIN = build/engine 1BIN = build/engine
2SRC = source/engine.c 2SRC = source/tb_db.c
3CC = clang 3CC = clang
4CFLAGS = -Wall -Wextra -Wfloat-equal -Wswitch-default -Wswitch-enum \ 4CFLAGS = -Wall -Wextra -Wpedantic -Wno-unused-function -g -Werror
5 -Wno-unused-parameter -Wno-implicit-fallthrough -Wno-unused-function -g -Werror
6 5
7$(BIN): $(SRC) 6$(BIN): $(SRC)
8 mkdir -p build 7 mkdir -p build
diff --git a/source/b_tree.h b/source/b_tree_impl.h
index 0c4b1e1..8eb0723 100644
--- a/source/b_tree.h
+++ b/source/b_tree_impl.h
@@ -4,16 +4,29 @@
4 4
5// maximum height of the tree the lower the lower the lower amount 5// maximum height of the tree the lower the lower the lower amount
6// of disk reads which translates into faster? 6// of disk reads which translates into faster?
7#define B_TREE_ORDER 4 7#if 0
8global_variable read_only s16 B_TREE_ORDER = 4;
9#endif
10#define B_TREE_ORDER 4
11
12//- NOTE(nasr): defining a key to improve sorting
13// i think saying that a key is a combination of the column + row is a good way of appraoching this
14typedef struct key key;
15struct key
16{
17 string8 header;
18 s32 row;
19};
8 20
9typedef struct b_tree_node b_tree_node; 21typedef struct b_tree_node b_tree_node;
10struct b_tree_node 22struct b_tree_node
11{ 23{
12 // store the values 24 // store the key values of the sub nodes? if they are leaves?
13 string8 keys[B_TREE_ORDER - 1]; 25 key keys[B_TREE_ORDER - 1];
14 // TODO(nasr): replace with something more generic? 26 // TODO(nasr): replace with something more generic?
15 // NOTE(nasr): cons of void * -> no type safety 27 // NOTE(nasr): cons of void * -> no type safety
16 // is there a way to still have some sort of that? 28 // is there a way to still have some sort of that?
29 // size not variable
17 void *payload_per_key[B_TREE_ORDER - 1]; 30 void *payload_per_key[B_TREE_ORDER - 1];
18 b_tree_node *parent; 31 b_tree_node *parent;
19 // handle to store children faster than linked list 32 // handle to store children faster than linked list
@@ -27,6 +40,10 @@ struct b_tree_node
27 // s32 *refc; 40 // s32 *refc;
28 s32 key_count; 41 s32 key_count;
29 b32 leaf; 42 b32 leaf;
43
44
45 // NOTE(nasr): do we hold the reference to the arena? or do we pass is it as a reference?
46 // this could solve memory location issues?
30}; 47};
31 48
32typedef struct b_tree b_tree; 49typedef struct b_tree b_tree;
@@ -55,9 +72,17 @@ btree_node_alloc(mem_arena *arena)
55internal s32 72internal s32
56btree_node_find_pos(string8 value, b_tree_node *node) 73btree_node_find_pos(string8 value, b_tree_node *node)
57{ 74{
75 unused(value);
76 unused(node);
77
78#if 0
58 s32 i = 0; 79 s32 i = 0;
59 for (; i < node->key_count && string8_cmp(node->keys[i], value) < 0; ++i); 80 for (; i < node->key_count && string8_cmp(node->keys[i], value) < 0; ++i);
60 return i; 81 return i;
82#endif
83
84 return 0;
85
61} 86}
62 87
63internal void 88internal void
@@ -68,10 +93,15 @@ b_tree_create(mem_arena *arena, b_tree *tree)
68 tree->root->key_count = 0; 93 tree->root->key_count = 0;
69} 94}
70 95
96
71// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory 97// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory
72internal void * 98internal void *
73b_tree_search(b_tree_node *node, string8 key) 99b_tree_search(b_tree_node *node, string8 key)
74{ 100{
101 unused(node);
102 unused(key);
103
104#if 0
75 s32 i = btree_node_find_pos(key, node); 105 s32 i = btree_node_find_pos(key, node);
76 106
77 if (i < node->key_count && string8_cmp(node->keys[i], key) == 0) 107 if (i < node->key_count && string8_cmp(node->keys[i], key) == 0)
@@ -83,12 +113,20 @@ b_tree_search(b_tree_node *node, string8 key)
83 return NULL; 113 return NULL;
84 } 114 }
85 return b_tree_search(node->children[i], key); 115 return b_tree_search(node->children[i], key);
116#endif
117
118 return NULL;
86} 119}
87 120
121
88// TODO(nasr): split node when key_count == B_TREE_ORDER - 1 (node is full) 122// TODO(nasr): split node when key_count == B_TREE_ORDER - 1 (node is full)
89internal void 123internal void
90b_tree_insert(mem_arena *arena, b_tree *tree, string8 key, void *payload) 124b_tree_insert(b_tree *tree, string8 key, void *payload)
91{ 125{
126 unused(tree);
127 unused(key);
128 unused(payload);
129#if 0
92 b_tree_node *current_node = tree->root; 130 b_tree_node *current_node = tree->root;
93 131
94 if (current_node->leaf) 132 if (current_node->leaf)
@@ -109,15 +147,18 @@ b_tree_insert(mem_arena *arena, b_tree *tree, string8 key, void *payload)
109 } 147 }
110 else { 148 else {
111 // TODO(nasr): creating a new branch / tree? 149 // TODO(nasr): creating a new branch / tree?
150 // make a seperate function for this
112 } 151 }
113 } 152 }
114 // TODO(nasr): internal node case walk down then split on the way back up 153 // TODO(nasr): internal node case walk down then split on the way back up
154#endif
115} 155}
116 156
117internal void 157internal void
118b_tree_write(b_tree *bt) 158b_tree_write(b_tree *bt)
119{ 159{
120 // TODO(nasr): write the b_tree to disk 160 // TODO(nasr): write the b_tree to disk
161 unused(bt);
121} 162}
122 163
123#endif /* B_TREE_IMPLEMENTATION */ 164#endif /* B_TREE_IMPLEMENTATION */
diff --git a/source/csv_decoder.h b/source/csv_decoder.h
new file mode 100644
index 0000000..b754ef5
--- /dev/null
+++ b/source/csv_decoder.h
@@ -0,0 +1,289 @@
1#ifndef ENGINE_LEXER_H
2#define ENGINE_LEXER_H
3
4enum csv_token_flags
5{
6 FL = 1 << 2,
7};
8
9enum csv_token_type
10{
11 // first 255 tokens for ascii characters
12 TOKEN_UNDEFINED = 255,
13 TOKEN_IDENTIFIER,
14 TOKEN_VALUE,
15};
16
17typedef struct csv_token csv_token;
18struct csv_token
19{
20 string8 lexeme;
21 csv_token *next_token;
22 enum csv_token_type type;
23 enum csv_token_flags flags;
24};
25
26// NOTE(nasr): i dont think im going to use this.
27typedef struct csv_row csv_row;
28struct csv_row
29{
30 // array of size col_count, points into mmap buffer
31 string8 *fields;
32 s32 count;
33};
34
35#if 0
36typedef struct csv_lntity csv_entity;
37struct csv_entity
38{
39 //- not needed because we use key header mapping i think
40};
41#endif
42
43typedef struct csv_header csv_header;
44struct csv_header
45{
46 string8 payload;
47 csv_header *next_header;
48};
49
50typedef struct csv_table csv_table;
51struct csv_table
52{
53 // first row, col names
54 // all data rows
55 csv_header *header;
56 s32 row_count;
57 s32 header_count;
58};
59
60
61typedef struct csv_token_list csv_token_list;
62struct csv_token_list
63{
64 csv_token *start_token;
65 csv_token *end_token;
66
67};
68
69read_only global_variable
70csv_token nil_csv_token=
71{
72 .lexeme = {.data = NULL, .size =0},
73 .type = 0,
74 .flags = 0,
75 .next_token = &nil_csv_token,
76
77};
78
79read_only global_variable
80csv_header nil_csv_header =
81{
82 .payload = {.data = NULL, .size = 0},
83 .next_header = &nil_csv_header,
84};
85
86read_only global_variable
87csv_token_list nil_csv_token_list =
88{
89 .start_token = &nil_csv_token,
90 .end_token = &nil_csv_token,
91};
92
93
94read_only global_variable
95csv_row nil_csv_row =
96{
97 .fields = &nil_string,
98 .count = 0,
99};
100
101read_only global_variable
102csv_table nil_csv_table =
103{
104 .header = &nil_csv_header,
105 .row_count = 0,
106};
107
108#endif /* ENGINE_LEXER_H */
109
110internal b32
111is_nil_csv_token(csv_token *token)
112{
113 return ((token == NULL) || (token == &nil_csv_token));
114}
115
116internal void
117csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
118{
119 source_token_list->end_token->next_token = source_token;
120 source_token_list->end_token = source_token;
121
122}
123
124//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother
125internal void
126csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
127{
128
129 csv_token *source_ct = source->start_token;
130 csv_token *destination_end_ct = destination->end_token;
131
132 for(;!is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
133 {
134 destination_end_ct->next_token = source_ct;
135 }
136
137 destination->end_token = source_ct;
138}
139
140#if 0
141internal csv_token_list *
142parse_csv_row(string8 row_buffer)
143{
144 // csv_token_list *
145
146}
147#endif
148
149
150// the lexer acts as a table builder from a csv file
151// and parsing indivudal rows and columns
152// the next step would be building a the b-tree
153internal csv_token *
154tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
155{
156
157 unused(token_list);
158 b32 finding_headers = TRUE;
159
160 if(buffer.size < 0) return NULL;
161
162 csv_token *tok = PushStruct(arena, csv_token);
163
164 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
165 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
166 // forgot what the solution was
167 // TODO(nasr): check what the problem here was
168 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
169 {
170 u8 point = buffer.data[index];
171
172 s32 start = 0;
173 s32 end = 0;
174
175 if(is_whitespace(point))
176 {
177 warn("csv file is invalid, detected whitespace");
178 return NULL;
179 }
180
181
182 if(point == '\n')
183 {
184 if(finding_headers)
185 {
186#if 0
187 string8 headers_buffer = {.data = &buffer.data[start], .size = end - start};
188#endif
189 finding_headers = FALSE;
190
191 {
192 //- map new header token list to table headers
193 }
194 }
195#if 0
196 else
197 {
198
199 }
200#endif
201
202
203 table->row_count++;
204 }
205 else if(point == ',')
206 {
207 if (finding_headers)
208 {
209 table->header_count++;
210 }
211 }
212
213 switch(point)
214 {
215 case('\n'):
216 {
217 tok->flags |= FL;
218 break;
219 }
220
221 case(','):
222 {
223 end = index - 1;
224 start = index + 1;
225 break;
226 }
227 default:
228 {
229 break;
230 }
231 }
232
233 tok->lexeme = StringCast(&buffer.data[start], end - start);
234 tok->next_token = tok;
235 }
236
237 return tok;
238}
239
240//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
241internal b_tree *
242parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
243{
244 b_tree *tree = PushStructZero(arena, b_tree);
245 b_tree_create(arena, tree);
246
247 // iterate over the token list while the token is not nil
248 for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
249 {
250
251 //- TODO(nasr): check initizalization or something tomorrow
252 {
253 //- are we parsing the first line tokens?
254 //- if so, do something :))
255 if(ct->flags & FL)
256 {
257 // TODO(nasr): replace with nil header check function
258 if(table->header != &nil_csv_header || table->header == NULL)
259 {
260#if 0
261 // - no this should happen in the tokenization
262 table->headers->next =
263#endif
264 }
265 else
266 {
267
268 }
269
270 }
271 }
272
273 // TODO(nasr): fix this logic tomorrow
274 csv_token *ct = PushStruct(arena, csv_token);
275 // skip structural ctens, only index values
276 if (ct->type != TOKEN_VALUE)
277 {
278 continue;
279 }
280
281 // NOTE(nasr): payload is the cten itself so the caller can reach
282 // row/col metadata without us having to copy it
283 // NOTE(nasr): heh why do we void cast again?
284 b_tree_insert(tree, ct->lexeme, (void *)ct);
285 }
286
287 return tree;
288}
289
diff --git a/source/csv_reader.h b/source/csv_reader.h
deleted file mode 100644
index f5205bf..0000000
--- a/source/csv_reader.h
+++ /dev/null
@@ -1,183 +0,0 @@
1#ifndef ENGINE_LEXER_H
2#define ENGINE_LEXER_H
3
4typedef enum csv_token_flags csv_token_flags;
5enum csv_token_flags
6{
7 START_FL = 1 << 1,
8 END_FL = 1 << 2,
9};
10
11typedef enum csv_token_type csv_token_type;
12enum csv_token_type
13{
14 // first 255 tokens for ascii characters
15 TOKEN_UNDEFINED = 255,
16 TOKEN_IDENTIFIER,
17 TOKEN_VALUE,
18};
19
20typedef struct csv_token csv_token;
21struct csv_token
22{
23 string8 lexeme;
24 csv_token_type type;
25 csv_token_flags flags;
26 csv_token *next;
27};
28
29// NOTE(nasr): i dont think im going to use this.
30typedef struct csv_row csv_row;
31struct csv_row
32{
33 // array of size col_count, points into mmap buffer
34 string8 *fields;
35 s32 count;
36};
37
38typedef struct csv_table csv_table;
39struct csv_table
40{
41 // first row, col names
42 // all data rows
43 string8 *headers;
44 csv_row *rows;
45 s32 col_count;
46 s32 row_count;
47};
48
49
50typedef struct csv_token_list csv_token_list;
51struct csv_token_list
52{
53 csv_token *start_token;
54 csv_token *end_token;
55
56};
57
58read_only global_variable
59csv_token nil_csv_token=
60{
61 .lexeme = {.data = NULL, .size =0},
62 .type = (csv_token_type)0,
63 .flags = 0,
64 .next = &nil_csv_token,
65
66};
67
68read_only global_variable
69csv_token_list nil_csv_token_list =
70{
71 .start_token = &nil_csv_token,
72 .end_token = &nil_csv_token,
73};
74
75
76read_only global_variable
77csv_row nil_csv_row =
78{
79 .fields = &nil_string,
80 .count = 0,
81};
82
83read_only global_variable
84csv_table nil_csv_table =
85{
86 .headers = &nil_string,
87 .rows = &nil_csv_row,
88 .col_count = 0,
89 .row_count = 0,
90};
91
92#endif /* ENGINE_LEXER_H */
93
94// the lexer acts as a table builder from a csv file
95// and parsing indivudal rows and columns
96// the next step would be building a the b-tree
97internal csv_token *
98tokenize_csv(string8 buffer, mem_arena *arena)
99{
100 b32 FL = TRUE;
101
102 if(buffer.size < 0) return NULL;
103
104 csv_token *tok = PushStruct(arena, csv_token);
105
106 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
107 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
108 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
109 {
110 u8 point = buffer.data[index];
111
112 s32 start = 0;
113 s32 end = 0;
114
115 if(is_whitespace(point))
116 {
117 warn("csv file is invalid, detected whitespace");
118 return NULL;
119 }
120
121 switch(point)
122 {
123 case('\n'):
124 {
125 if(FL) tok->flags |= END_FL;
126 break;
127 }
128
129 case(','):
130 {
131 end = index - 1;
132 start = index + 1;
133 break;
134 }
135 default:
136 {
137 break;
138 }
139 }
140
141 tok->lexeme = StringCast(&buffer.data[start], end - start);
142 tok->next = tok;
143 }
144
145 return tok;
146}
147
148internal void
149read_csv(string8 buffer)
150{
151 // printf("\nsize:%lu\ndata %s\n", buffer.size, buffer.data);
152
153}
154
155internal b_tree *
156parse_csv(mem_arena *arena, csv_token_list *ctl)
157{
158 b_tree *tree = PushStructZero(arena, b_tree);
159 b_tree_create(arena, tree);
160
161 //- TODO(nasr): check initizalization or something tomorrow
162 {
163
164 }
165 // TODO(nasr): fix this logic tomorrow
166 csv_token *ct = PushStruct(arena, csv_token);
167
168 for (;ct != NULL; ct = ct->next)
169 {
170 // skip structural ctens, only index values
171 if (ct->type != TOKEN_VALUE)
172 {
173 continue;
174 }
175
176 // NOTE(nasr): payload is the cten itself so the caller can reach
177 // row/col metadata without us having to copy it
178 // NOTE(nasr): heh why do we void cast again?
179 b_tree_insert(arena, tree, ct->lexeme, (void *)ct);
180 }
181
182 return tree;
183}
diff --git a/source/engine.c b/source/tb_db.c
index 106f113..b992111 100644
--- a/source/engine.c
+++ b/source/tb_db.c
@@ -1,6 +1,3 @@
1
2
3
4#define B_TREE_IMPLEMENTATION 1#define B_TREE_IMPLEMENTATION
5#define BASE_UNITY 2#define BASE_UNITY
6#include "base/base_include.h" 3#include "base/base_include.h"
@@ -33,11 +30,10 @@ internal b32
33is_delimiter(u8 point) 30is_delimiter(u8 point)
34{ 31{
35 return (point == ','); 32 return (point == ',');
36
37} 33}
38 34
39#include "b_tree.h" 35#include "b_tree_impl.h"
40#include "csv_reader.h" 36#include "csv_decoder.h"
41 37
42typedef struct query_token query_token; 38typedef struct query_token query_token;
43struct query_token 39struct query_token
@@ -80,7 +76,6 @@ is_nil_query_token_list(query_token *token)
80 return (token == &nil_query_token) || (token == NULL); 76 return (token == &nil_query_token) || (token == NULL);
81} 77}
82 78
83
84// takes on line of the repl input 79// takes on line of the repl input
85// return a reference to the passed list 80// return a reference to the passed list
86internal query_token_list * 81internal query_token_list *
@@ -90,7 +85,7 @@ query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list)
90 unused(initialized); 85 unused(initialized);
91 86
92 for (u64 index = 0; index < buffer->size; ++index) 87 for (u64 index = 0; index < buffer->size; ++index)
93 { 88 {
94 u8 codepoint = buffer->data[index]; 89 u8 codepoint = buffer->data[index];
95 90
96 if(codepoint == '\n' || codepoint == '\r') break; 91 if(codepoint == '\n' || codepoint == '\r') break;
@@ -98,10 +93,7 @@ query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list)
98 s32 start = 0; 93 s32 start = 0;
99 s32 end = 0; 94 s32 end = 0;
100 95
101 if(is_whitespace(codepoint)) 96 if(is_whitespace(codepoint)) end = index;
102 {
103 end = index;
104 }
105 97
106 // save the token 98 // save the token
107 // TODO(nasr): work on the string macros cuz no work 99 // TODO(nasr): work on the string macros cuz no work
@@ -185,19 +177,23 @@ int main(int count, char **value)
185 } 177 }
186 178
187 { 179 {
188 read_csv(buffer);
189 180
190 csv_token *tokens = tokenize_csv(buffer, global_arena); 181 // NOTE(nasr): the use of tables is required for tracking headers etc.
182 // i think we can optimize this away in the future but for now its fine
183 csv_table *table = PushStruct(global_arena, csv_table);
184
185 csv_token_list *token_list = PushStruct(global_arena, csv_token_list);
186
187 csv_token *tokens = tokenize_csv(buffer, global_arena, table, token_list);
191 188
192 assert_msg(tokens != NULL, "Tokens are NULL."); 189 assert_msg(tokens != NULL, "Tokens are NULL.");
193 190
194 csv_token_list *ctl = PushStruct(global_arena, csv_token_list); 191 csv_token_list *ctl = PushStruct(global_arena, csv_token_list);
195 b_tree *bt = parse_csv(global_arena, ctl); 192 b_tree *bt = parse_csv(global_arena, ctl, table);
196 193
197 b_tree_write(bt); 194 b_tree_write(bt);
198 } 195 }
199 196
200
201 // NOTE(nasr): not sure on how to approach the b-tree and the table format thing 197 // NOTE(nasr): not sure on how to approach the b-tree and the table format thing
202 // we kind of want our table format i think? but i wouldnt be sure about the use case 198 // we kind of want our table format i think? but i wouldnt be sure about the use case
203 // so we stick to the regular b_tree for now. commenting out the tables. 199 // so we stick to the regular b_tree for now. commenting out the tables.