summaryrefslogtreecommitdiff
path: root/source/tb_db
diff options
context:
space:
mode:
Diffstat (limited to 'source/tb_db')
-rw-r--r--source/tb_db/btree_impl.h209
-rw-r--r--source/tb_db/csv_decoder.h294
-rw-r--r--source/tb_db/tb_db.c198
3 files changed, 701 insertions, 0 deletions
diff --git a/source/tb_db/btree_impl.h b/source/tb_db/btree_impl.h
new file mode 100644
index 0000000..b7015bd
--- /dev/null
+++ b/source/tb_db/btree_impl.h
@@ -0,0 +1,209 @@
1/**
2 * >> api notes
3 * >> by Abdellah El Morabit
4 *
5 * - creation of a btree happens when doing an insertion on an
6 * empty btree where the root is null or nil
7 *
8 **/
9
10/* single header b-tree implementation by Abdellah El Morabit */
11#ifndef BTREE_H
12#define BTREE_H
13
// b-tree branching factor: a higher order means a shallower tree, so
// fewer disk reads per lookup — which should translate into faster queries
16#if 0
17global_variable read_only s16 btree_ORDER = 4;
18#endif
19#define BTREE_ORDER 4
20
//- NOTE(nasr): composite key used to order cells inside the b-tree.
// A key identifies a single CSV cell by its column (header) and row
// position — the pair, not their sum, is what makes the key unique.
typedef struct key key;
struct key
{
    s32 header_index; //- column number (index into the header row)
    s32 row_index;    //- zero-based row the cell belongs to
};
29
// One node of the b-tree: up to BTREE_ORDER-1 sorted keys, one payload
// pointer per key, and BTREE_ORDER child pointers (leaf nodes ignore them).
typedef struct btree_node btree_node;
struct btree_node
{
    // keys stored in this node, kept sorted; capacity BTREE_ORDER - 1
    key keys[BTREE_ORDER - 1];
    // TODO(nasr): replace with something more generic?
    // NOTE(nasr): cons of void * -> no type safety
    // is there a way to still have some sort of that? size not variable
    void *payload_per_key[BTREE_ORDER - 1];
    btree_node *parent;
    // children stored as a flat array rather than a linked list so
    // scans iterate contiguously instead of chasing pointers
    // (fewer cache misses when walking the node's children)
    btree_node *children[BTREE_ORDER];
    // this shouldn't change things but could be a performance improvement on a bigger scale??
    s32 *max_children;
    // NOTE(nasr): reference count ::: check how many leaves are using this node
    // also not needed for now because we don't free individual node because of arena allocator
    // s32 *refc;
    s32 key_count; // number of slots of `keys` currently in use
    b32 leaf;      // nonzero when this node has no children


    // NOTE(nasr): do we hold the reference to the arena? or do we pass is it as a reference?
    // this could solve memory location issues?
};
56
57
// Tree handle; the root is created lazily by the first btree_insert.
typedef struct btree btree;
struct btree
{
    // NOTE(nasr): make sure this always stays in memory
    // so that initial fetch never requires a disk read
    btree_node *root;
};
65
66
67
68#ifdef BTREE_IMPLEMENTATION
69///////////////////////////////////////////////////////////////////////////////
70//- btree.c
71
72
73internal b32
74is_nil_btree_node(btree_node *node)
75{
76 // NOTE(nasr): looks like a useless function but could be usefull when we implement a nil_btree_node
77 return (node == NULL);
78}
79
80// TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees?
81
82internal b8
83key_compare(key destination_key, key source_key)
84{
85 s32 source_index = source_key.header_index + source_key.row_index;
86 s32 destination_index = destination_key.header_index + destination_key.row_index;
87
88 if(destination_index > source_index) return 1;
89 else if(destination_index < source_index) return -1;
90 else return 0;
91}
92
93// NOTE(nasr): @return the index of the found element
94//- @param: start node could be the root node for all we care. but this enables optimizations in the future if we need them
95internal s32
96btree_find_ipos(key k, btree_node *start_node)
97{
98 s32 index = 0;
99 // scan right while the node's key at [index] is strictly less than k
100 while (index < start_node->key_count && key_compare(start_node->keys[index], k) < 0)
101 {
102 ++index;
103 }
104 return index;
105}
106
107// NOTE(nasr): nodes that get passed as parameters should've already been loaded into memory
108internal void *
109btree_search(btree_node *node, key key)
110{
111 s32 index = btree_find_ipos(key, node);
112
113 if (index < node->key_count && key_compare(node->keys[index], key) == 0)
114 {
115 return node->payload_per_key[index];
116 }
117 if (node->leaf)
118 {
119 return NULL;
120 }
121 return btree_search(node->children[index], key);
122}
123
124// TODO(nasr): split node when key_count == btree_ORDER - 1 (node is full)
125// -- each node has max btree_ORDER - 1 keys
126//
127internal void
128btree_insert(mem_arena *arena, btree *tree, key key, void *payload)
129{
130 if(is_nil_btree_node(tree->root))
131 {
132 tree->root = PushStructZero(arena, btree_node);
133 tree->root->leaf = 1;
134 tree->root->key_count = 0;
135 }
136
137 btree_node *current_node = tree->root;
138
139 if (current_node->leaf)
140 {
141 // find the insertion position and shift keys right to make room
142 //- after: but what is this. we are expecting a new key.but we are looking for
143 //- its position in the tree is useless because it's non existent
144
145 s32 i = btree_find_ipos(key, current_node);
146
147 for (s32 j = current_node->key_count; j > i; --j)
148 {
149 current_node->keys[j] = current_node->keys[j - 1];
150 current_node->payload_per_key[j] = current_node->payload_per_key[j - 1];
151 }
152
153 current_node->keys[i] = key;
154 current_node->payload_per_key[i] = payload;
155
156 if(current_node->key_count < (BTREE_ORDER - 1))
157 {
158 current_node->key_count += 1;
159 }
160 else {
161 // TODO(nasr): creating a new branch / tree?
162 // make a seperate function for this
163 // NOTE(nasr): only split through the parent when one actually exists;
164 // root splits need to be handled separately (promote mid key, create two children)
165 if(current_node->parent != NULL)
166 {
167 s32 split_pos = current_node->parent->key_count / 2 - 1;
168 s32 updated_keycount_p = 0;
169 for(s32 index = 0; index <= split_pos; ++index)
170 {
171 ++updated_keycount_p;
172 }
173 current_node->parent->key_count = updated_keycount_p;
174 }
175 }
176 }
177 // TODO(nasr): internal node case walk down then split on the way back up
178}
179
180
// Persist the tree to disk — not implemented yet; currently a no-op
// placeholder so callers can already be wired up against the interface.
internal void
btree_write(btree *bt)
{
    unused(bt);
#if 0
    // sketch of a future serialization walk, kept for reference
    temp_arena *temp_arena = temp_arena_begin(arena);
    btree_node *root = bt->root;

    for(btree_node *bt = PushStruct(arena); root->next; next = root;)
    {

    }

    temp_arena_end(temp);
#endif
}
197
198// - load the entire db into memory.
199#if 0
200internal void
201btree_load(mem_arena *arena)
202{
203
204
205}
206#endif
207
208#endif /* BTREE_IMPLEMENTATION */
209#endif /* BTREE_H */
diff --git a/source/tb_db/csv_decoder.h b/source/tb_db/csv_decoder.h
new file mode 100644
index 0000000..3d09dc6
--- /dev/null
+++ b/source/tb_db/csv_decoder.h
@@ -0,0 +1,294 @@
1#ifndef ENGINE_LEXER_H
2#define ENGINE_LEXER_H
3
enum csv_token_flags
{
    // set on the token that closes a row (the field terminated by '\n');
    // parse_csv uses it to advance row_index and reset the column
    FL = 1 << 2,
};

enum csv_token_type
{
    // first 255 tokens for ascii characters
    TOKEN_UNDEFINED = 255,
    TOKEN_IDENTIFIER,
    TOKEN_VALUE,
};

// One lexed CSV cell. `lexeme` aliases the caller's file buffer (no
// copy); tokens form a singly linked list terminated by nil_csv_token.
typedef struct csv_token csv_token;
struct csv_token
{
    string8 lexeme;
    csv_token *next_token;
    enum csv_token_type type;
    enum csv_token_flags flags;
};
25
// NOTE(nasr): i dont think im going to use this.
typedef struct csv_row csv_row;
struct csv_row
{
    // array of `count` fields; each string8 points into the mmap buffer
    string8 *fields;
    s32 count;
};

#if 0
typedef struct csv_lntity csv_entity;
struct csv_entity
{
    //- not needed because we use key header mapping i think
};
#endif

// One column name from the first CSV row; headers form a singly linked
// list terminated by the nil_csv_header sentinel.
typedef struct csv_header csv_header;
struct csv_header
{
    string8 payload;
    csv_header *next_header;
};

// Bookkeeping shared between tokenizer and parser: the header list, row
// and column counts, and a flag telling the tokenizer it is still
// scanning the first (header) row.
typedef struct csv_table csv_table;
struct csv_table
{
    // first row, col names
    // all data rows
    csv_header *header;
    s32 row_count;
    s32 header_count;
    b32 finding_headers; // NOTE(review): callers must set this to TRUE themselves — PushStructZero leaves it false, so header counting never runs; confirm intended
};


// Token stream head plus tail; the tail makes appends O(1).
typedef struct csv_token_list csv_token_list;
struct csv_token_list
{
    csv_token *start_token;
    csv_token *end_token;
};
68
//- nil sentinels: read-only terminator objects so list walks can stop on
//- identity instead of chasing NULL. The next pointers self-reference so
//- an accidental walk past the end stays parked on the sentinel.
//- NOTE(review): these live in read_only storage — writing through a
//- sentinel (e.g. appending to an uninitialized list) will fault.
read_only global_variable
csv_token nil_csv_token=
{
    .lexeme = {.data = NULL, .size = 0},
    .type = 0,
    .flags = 0,
    .next_token = &nil_csv_token,
};

read_only global_variable
csv_header nil_csv_header =
{
    .payload = {.data = NULL, .size = 0},
    .next_header = &nil_csv_header,
};

read_only global_variable
csv_token_list nil_csv_token_list =
{
    .start_token = &nil_csv_token,
    .end_token = &nil_csv_token,
};

read_only global_variable
csv_row nil_csv_row =
{
    .fields = &nil_string,
    .count = 0,
};

read_only global_variable
csv_table nil_csv_table =
{
    .header = &nil_csv_header,
    .row_count = 0,
};
106#endif /* ENGINE_LEXER_H */
107
108internal b32
109is_nil_csv_token(csv_token *token)
110{
111 return ((token == NULL) || (token == &nil_csv_token));
112}
113
114// TODO(nasr): segfaulting because end_token not allocated
115internal void
116csv_token_list_append_token(csv_token_list *source_token_list, csv_token *source_token)
117{
118 source_token_list->end_token->next_token = source_token;
119 source_token_list->end_token = source_token;
120}
121
122//- concatenate 2 token lists so we can handle parsing individual rows and concatenating them to eachother
123internal void
124csv_token_list_concat_list(csv_token_list *destination, csv_token_list *source)
125{
126 if(is_nil_csv_token(source->start_token)) return;
127
128 csv_token *source_ct = source->start_token;
129 csv_token *destination_et = destination->end_token;
130
131 // walk source and stitch each node onto destination's tail
132 for(; !is_nil_csv_token(source_ct); source_ct = source_ct->next_token)
133 {
134 destination_et->next_token = source_ct;
135 destination_et = source_ct;
136 }
137
138 // destination_et now points at the last real source node (not the nil sentinel)
139 destination->end_token = destination_et;
140}
141
142#if 0
143internal csv_token_list *
144parse_csv_row(string8 row_buffer)
145{
146 // csv_token_list *
147
148}
149#endif
150
151
152// the lexer acts as a table builder from a csv file
153// and parsing indivudal rows and columns
154// the next step would be building a the b-tree
155internal csv_token *
156tokenize_csv(string8 buffer, mem_arena *arena, csv_table *table, csv_token_list *token_list)
157{
158 unused(token_list);
159
160 if(buffer.size == 0) return NULL;
161
162 // URGENT(nasr): segfaulting because memcpy of strring value doesnt work dammit
163 // NOPE ITS BEECAUSE WEE DONT LOAD CSV OR SOMTHING???
164 // forgot what the solution was
165 // TODO(nasr): check what the problem here was
166
167 // string size tracking across the loop not inside it
168 s32 start = 0;
169
170 for(s32 index = 0; buffer.data[index] != '\0'; ++index)
171 {
172 u8 point = buffer.data[index];
173
174#if 0
175 if(is_whitespace(point))
176 {
177 warn("csv file is invalid, detected whitespace");
178 return NULL;
179 }
180#endif
181
182 if(point == ',')
183 {
184 // emit a token for the field that ended before this comma
185 csv_token *token = PushStructZero(arena, csv_token);
186
187 assert_msg(token != NULL, "did the push struct fail??");
188 assert_msg(arena->current_position < arena->capacity, "no more arena size");
189
190 token->lexeme = StringCast(&buffer.data[start], index - start);
191 token->type = TOKEN_VALUE;
192 token->next_token = &nil_csv_token;
193 csv_token_list_append_token(token_list, token);
194
195 start = index + 1;
196
197 if(table->finding_headers)
198 {
199 table->header_count++;
200 }
201 }
202 else if(point == '\n')
203 {
204 // emit a token for the field that ended at this newline
205 csv_token *token = PushStructZero(arena, csv_token);
206 token->lexeme = StringCast(&buffer.data[start], index - start);
207 token->type = TOKEN_VALUE;
208 token->flags |= FL;
209 token->next_token = &nil_csv_token;
210
211 assert_msg(token_list, "token list invalid");
212 assert_msg(token, "you're tring to append an invalid token");
213
214 csv_token_list_append_token(token_list, token);
215
216 start = index + 1;
217
218 if(table->finding_headers)
219 {
220 {
221 //- map new header token list to table headers
222 }
223 table->finding_headers = FALSE;
224 }
225
226 table->row_count++;
227 }
228 }
229
230 // NOTE(nasr): return the first token the caller can walk the list from token_list
231 return token_list->start_token;
232}
233
234//- NOTE(nasr): I don't know why we are still using that dumb table but we'll remove it in the future
235internal btree *
236parse_csv(mem_arena *arena, csv_token_list *ctl, csv_table *table)
237{
238 btree *tree = PushStructZero(arena, btree);
239
240 s32 col_index = 0;
241 s32 row_index = 0;
242
243 // iterate over the token list while the token is not nil
244 for (csv_token *ct = ctl->start_token; !is_nil_csv_token(ct); ct = ct->next_token)
245 {
246 {
247 //- are we parsing the first line tokens?
248 //- if so, do something :))
249 if(ct->flags & FL)
250 {
251 // NOTE(nasr): FL marks end-of-line; advance row, reset col
252 row_index++;
253 col_index = 0;
254
255 // TODO(nasr): replace with nil header check function
256 // NOTE(nasr): == nil means header hasn't been set yet
257 if(table->header == &nil_csv_header || table->header == NULL)
258 {
259#if 0
260 // - no this should happen in the tokenization
261 table->headers->next =
262#endif
263 }
264 else
265 {
266
267 }
268
269 // FL tokens are structural, no value to index
270 continue;
271 }
272 }
273
274 // skip non-value tokens, only index actual cell values
275 if (ct->type != TOKEN_VALUE)
276 {
277 col_index++;
278 continue;
279 }
280
281 // NOTE(nasr): payload is the cten itself so the caller can reach
282 // row/col metadata without us having to copy it
283 key k = {
284 .header_index = col_index,
285 .row_index = row_index,
286 };
287
288 btree_insert(arena, tree, k, (void *)ct);
289
290 col_index++;
291 }
292
293 return tree;
294}
diff --git a/source/tb_db/tb_db.c b/source/tb_db/tb_db.c
new file mode 100644
index 0000000..4ae247d
--- /dev/null
+++ b/source/tb_db/tb_db.c
@@ -0,0 +1,198 @@
1#define BTREE_IMPLEMENTATION
2#define BASE_UNITY
3#include "../base/base_include.h"
4
5internal b32
6is_alpha(u8 point)
7{
8 return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_'));
9}
10
11internal b32
12is_digit(u8 point)
13{
14 return (point >= '0' && point <= '9');
15}
16
17internal b32
18is_alpha_num(u8 point)
19{
20 return (is_alpha(point) || is_digit(point));
21}
22
23internal b32
24is_whitespace(u8 point)
25{
26 return (point == '\n' || point == '\r' || point == ' ' || point == '\t');
27}
28
29internal b32
30is_delimiter(u8 point)
31{
32 return (point == ',');
33}
34
35#include "btree_impl.h"
36#include "csv_decoder.h"
37
// One whitespace-delimited word of a repl query line; tokens form a
// singly linked list terminated by the nil_query_token sentinel.
typedef struct query_token query_token;
struct query_token
{
    string8 lexeme;
    query_token *next;
};

// Token stream head, plus the tail (`current_token`) used for appends.
typedef struct query_token_list query_token_list;
struct query_token_list
{
    query_token *start_token;
    query_token *current_token;
};

// read-only terminator: next self-references so a walk past the end
// stays parked on the sentinel instead of dereferencing NULL
read_only global_variable
query_token nil_query_token =
{
    .lexeme = {.data = NULL, .size = 0},
    .next = &nil_query_token
};


read_only global_variable
query_token_list nil_query_token_list =
{
    .start_token = &nil_query_token,
    .current_token = &nil_query_token,
};
66
67internal b32
68is_nil_query_token(query_token *token)
69{
70 return (token == &nil_query_token) || (token == NULL);
71}
72
73internal b32
74is_nil_query_token_list(query_token *token)
75{
76 return (token == &nil_query_token) || (token == NULL);
77}
78
79// takes on line of the repl input
80// return a reference to the passed list
81internal query_token_list *
82query_tokenizer(mem_arena *arena, string8 *buffer, query_token_list *list)
83{
84 b32 initialized = 0;
85 unused(initialized);
86
87 for (u64 index = 0; index < buffer->size; ++index)
88 {
89 u8 codepoint = buffer->data[index];
90
91 if(codepoint == '\n' || codepoint == '\r') break;
92
93 s32 start = 0;
94 s32 end = 0;
95
96 if(is_whitespace(codepoint)) end = index;
97
98 // save the token
99 // TODO(nasr): work on the string macros cuz no work
100 {
101 query_token *new_token = PushStruct(arena, query_token);
102
103 //- initialize list
104 {
105 if(is_nil_query_token(list->start_token))
106 {
107 list->start_token = new_token;
108 list->current_token = new_token;
109 }
110 else
111 {
112 //- all we need to do - we dont track parents or what ever. this is a token stream not a tree
113 list->current_token->next = new_token;
114 }
115 }
116
117 s32 new_token_size = end - start;
118
119 new_token->lexeme = PushString(arena, new_token_size);
120 new_token->lexeme.data = &buffer->data[index];
121 new_token->lexeme.size = new_token_size;
122
123 list->current_token->next = new_token;
124
125 start = index + 1;
126 }
127 }
128
129 return list;
130}
131
132int main(int count, char **value)
133{
134#if 1
135 unused(nil_query_token_list);
136#endif
137
138 if(count < 2) value[1] = "./test/data.csv";
139
140 local_persist b32 running = 1;
141
142 mem_arena *global_arena = arena_create(GiB(1));
143
144 string8 buffer = load_file(global_arena, value[1]);
145
146 // NOTE(nasr): the use of tables is required for tracking headers etc.
147 // i think we can optimize this away in the future but for now its fine
148 csv_table *table = PushStructZero(global_arena, csv_table);
149 csv_token_list *token_list = PushStructZero(global_arena, csv_token_list);
150
151 token_list->start_token = &nil_csv_token;
152 token_list->end_token = &nil_csv_token;
153
154 csv_token *tokens = tokenize_csv(buffer, global_arena, table, token_list);
155 assert_msg(tokens != NULL, "tokens are null");
156
157 // NOTE(nasr): token_list is now populated — pass it directly, not a fresh empty list
158 btree *bt = parse_csv(global_arena, token_list, table);
159
160 print("\nDatabase Engine\n");
161
162 for(;;)
163 {
164 if(running)
165 {
166 u8 *lbuf = PushArray(global_arena, u8, 256);
167 s32 err = os_read(STDIN_FD, lbuf, 256);
168
169 if(err < 0)
170 {
171 print("error reading from stdin");
172 }
173
174 // TODO(nasr): extract this later in the future and make a string copy function/macro
175 // @params (s32 lbuf_size, string8 lbuf_stringified)
176 // NOTE(nasr): use err (bytes read) not sizeof(lbuf*) — sizeof a pointer is always 8
177 s32 lbuf_size = err;
178 string8 lbuf_stringified = PushString(global_arena, lbuf_size);
179 {
180 memcpy(lbuf_stringified.data, lbuf, lbuf_size);
181 lbuf_stringified.size = lbuf_size;
182 }
183
184 query_token_list *qtl = PushStructZero(global_arena, query_token_list);
185
186
187 query_tokenizer(global_arena, &lbuf_stringified, qtl);
188
189 // TODO(nasr): dispatch qtl against bt here
190
191 unused(bt);
192
193 sleep(1);
194 }
195 }
196
197 return 0;
198}