diff options
| -rw-r--r-- | source/b_tree.h | 44 | ||||
| -rw-r--r-- | source/base/base_string.h | 3 | ||||
| -rw-r--r-- | source/btree.h (renamed from source/b_tree.c) | 44 | ||||
| -rw-r--r-- | source/csv_parser.c | 26 | ||||
| -rw-r--r-- | source/csv_parser.h | 40 | ||||
| -rw-r--r-- | source/csv_reader.h | 145 | ||||
| -rw-r--r-- | source/engine.c | 101 | ||||
| -rw-r--r-- | source/lexer.c | 80 | ||||
| -rw-r--r-- | source/lexer.h | 30 | ||||
| -rw-r--r-- | source/query.c | 0 | ||||
| -rw-r--r-- | source/query.h | 0 |
11 files changed, 283 insertions, 230 deletions
diff --git a/source/b_tree.h b/source/b_tree.h deleted file mode 100644 index 0fc4d3c..0000000 --- a/source/b_tree.h +++ /dev/null | |||
| @@ -1,44 +0,0 @@ | |||
| 1 | #ifndef BTREE_H | ||
| 2 | #define BTREE_H | ||
| 3 | |||
| 4 | // maximum height of the tree the lower the lower the lower amount | ||
| 5 | // of disk reads which translates into faster? | ||
| 6 | #define B_TREE_ORDER 4 | ||
| 7 | |||
| 8 | typedef struct b_tree_node b_tree_node; | ||
| 9 | struct b_tree_node | ||
| 10 | { | ||
| 11 | |||
| 12 | // store the values | ||
| 13 | string8 keys[B_TREE_ORDER - 1]; | ||
| 14 | csv_row *rows[B_TREE_ORDER - 1]; | ||
| 15 | |||
| 16 | b_tree_node *parent; | ||
| 17 | // handle to store children faster than linked list | ||
| 18 | // because now we can iteratate over the nodes instead of having cache misses | ||
| 19 | // when jumping through linked nodes | ||
| 20 | b_tree_node *children[B_TREE_ORDER]; | ||
| 21 | |||
| 22 | // NOTE(nasr): reference count ::: check how many leaves are using this node | ||
| 23 | // also not needed for now because we don't free individual node because of arena allocator | ||
| 24 | // s32 *refc; | ||
| 25 | |||
| 26 | s32 key_count; | ||
| 27 | |||
| 28 | b32 leaf; | ||
| 29 | |||
| 30 | }; | ||
| 31 | |||
| 32 | typedef struct b_tree b_tree; | ||
| 33 | struct b_tree | ||
| 34 | { | ||
| 35 | // NOTE(nasr): make sure this always stays in memory | ||
| 36 | // so that initial fetch never requires a disk read | ||
| 37 | b_tree_node *root; | ||
| 38 | |||
| 39 | }; | ||
| 40 | |||
| 41 | |||
| 42 | |||
| 43 | |||
| 44 | #endif /* BTREE_H */ | ||
diff --git a/source/base/base_string.h b/source/base/base_string.h index 4e0d923..ef2cf16 100644 --- a/source/base/base_string.h +++ b/source/base/base_string.h | |||
| @@ -12,7 +12,8 @@ | |||
| 12 | result; \ | 12 | result; \ |
| 13 | }) | 13 | }) |
| 14 | 14 | ||
| 15 | #define String8Cast(literal, literal_count) ( string8 ){( u8 * )( literal ),( u64 )( literal_count ) } | 15 | #define StringCast(data, size) (string8){(u8 *)(data), (u64)(size) } |
| 16 | #define StringPCast(data, size) (string8 *){(u8 *)(data), (u64)(size) } | ||
| 16 | 17 | ||
| 17 | #define StringFmt "%.*s" | 18 | #define StringFmt "%.*s" |
| 18 | #define ULongFmt "%lu" | 19 | #define ULongFmt "%lu" |
diff --git a/source/b_tree.c b/source/btree.h index e42f709..42c8e21 100644 --- a/source/b_tree.c +++ b/source/btree.h | |||
| @@ -1,3 +1,45 @@ | |||
| 1 | #ifndef BTREE_H | ||
| 2 | #define BTREE_H | ||
| 3 | |||
| 4 | // maximum height of the tree the lower the lower the lower amount | ||
| 5 | // of disk reads which translates into faster? | ||
| 6 | #define B_TREE_ORDER 4 | ||
| 7 | |||
| 8 | typedef struct b_tree_node b_tree_node; | ||
| 9 | struct b_tree_node | ||
| 10 | { | ||
| 11 | |||
| 12 | // store the values | ||
| 13 | string8 keys[B_TREE_ORDER - 1]; | ||
| 14 | csv_row *rows[B_TREE_ORDER - 1]; | ||
| 15 | |||
| 16 | b_tree_node *parent; | ||
| 17 | // handle to store children faster than linked list | ||
| 18 | // because now we can iteratate over the nodes instead of having cache misses | ||
| 19 | // when jumping through linked nodes | ||
| 20 | b_tree_node *children[B_TREE_ORDER]; | ||
| 21 | |||
| 22 | // NOTE(nasr): reference count ::: check how many leaves are using this node | ||
| 23 | // also not needed for now because we don't free individual node because of arena allocator | ||
| 24 | // s32 *refc; | ||
| 25 | |||
| 26 | s32 key_count; | ||
| 27 | |||
| 28 | b32 leaf; | ||
| 29 | |||
| 30 | }; | ||
| 31 | |||
| 32 | typedef struct b_tree b_tree; | ||
| 33 | struct b_tree | ||
| 34 | { | ||
| 35 | // NOTE(nasr): make sure this always stays in memory | ||
| 36 | // so that initial fetch never requires a disk read | ||
| 37 | b_tree_node *root; | ||
| 38 | |||
| 39 | }; | ||
| 40 | |||
| 41 | |||
| 42 | |||
| 1 | 43 | ||
| 2 | // TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? | 44 | // TODO(nasr): 1. splitting the tree when getting too big? (horizontally) 2. joining trees? |
| 3 | 45 | ||
| @@ -63,4 +105,4 @@ b_tree_write() | |||
| 63 | // TODO(nasr): write the b_tree to disk | 105 | // TODO(nasr): write the b_tree to disk |
| 64 | } | 106 | } |
| 65 | 107 | ||
| 66 | 108 | #endif /* BTREE_H */ | |
diff --git a/source/csv_parser.c b/source/csv_parser.c deleted file mode 100644 index 6679d2f..0000000 --- a/source/csv_parser.c +++ /dev/null | |||
| @@ -1,26 +0,0 @@ | |||
| 1 | internal void | ||
| 2 | strip_new_line(string8 buffer) | ||
| 3 | { | ||
| 4 | |||
| 5 | for (u64 index = 0; index < buffer.size; index++) | ||
| 6 | { | ||
| 7 | |||
| 8 | } | ||
| 9 | |||
| 10 | return; | ||
| 11 | |||
| 12 | } | ||
| 13 | |||
| 14 | internal void | ||
| 15 | read_csv(string8 buffer) | ||
| 16 | { | ||
| 17 | printf("\nsize:%lu\ndata %s\n", buffer.size, buffer.data); | ||
| 18 | |||
| 19 | } | ||
| 20 | |||
| 21 | internal csv_table * | ||
| 22 | parse_csv(token *tokens, csv_table *table) | ||
| 23 | { | ||
| 24 | |||
| 25 | return NULL; | ||
| 26 | } | ||
diff --git a/source/csv_parser.h b/source/csv_parser.h deleted file mode 100644 index 1aa96b9..0000000 --- a/source/csv_parser.h +++ /dev/null | |||
| @@ -1,40 +0,0 @@ | |||
| 1 | #ifndef CSV_PARSER_H | ||
| 2 | #define CSV_PARSER_H | ||
| 3 | |||
| 4 | typedef struct csv_row csv_row; | ||
| 5 | struct csv_row | ||
| 6 | { | ||
| 7 | // array of size col_count, points into mmap buffer | ||
| 8 | string8 *fields; | ||
| 9 | s32 count; | ||
| 10 | }; | ||
| 11 | |||
| 12 | typedef struct csv_table csv_table; | ||
| 13 | struct csv_table | ||
| 14 | { | ||
| 15 | // first row, col names | ||
| 16 | // all data rows | ||
| 17 | string8 *headers; | ||
| 18 | csv_row *rows; | ||
| 19 | s32 col_count; | ||
| 20 | s32 row_count; | ||
| 21 | }; | ||
| 22 | |||
| 23 | read_only global_variable | ||
| 24 | csv_row nil_csv_row = | ||
| 25 | { | ||
| 26 | .fields = &nil_string, | ||
| 27 | .count = 0, | ||
| 28 | }; | ||
| 29 | |||
| 30 | read_only global_variable | ||
| 31 | csv_table nil_csv_table = | ||
| 32 | { | ||
| 33 | .headers = &nil_string, | ||
| 34 | .rows = &nil_csv_row, | ||
| 35 | .col_count = 0, | ||
| 36 | .row_count = 0, | ||
| 37 | }; | ||
| 38 | |||
| 39 | |||
| 40 | #endif /* CSV_PARSER_H */ | ||
diff --git a/source/csv_reader.h b/source/csv_reader.h new file mode 100644 index 0000000..6c8e625 --- /dev/null +++ b/source/csv_reader.h | |||
| @@ -0,0 +1,145 @@ | |||
| 1 | #ifndef ENGINE_LEXER_H | ||
| 2 | #define ENGINE_LEXER_H | ||
| 3 | |||
| 4 | typedef enum token_flags token_flags; | ||
| 5 | enum token_flags | ||
| 6 | { | ||
| 7 | START_FL = 1 << 1, | ||
| 8 | END_FL = 1 << 2, | ||
| 9 | }; | ||
| 10 | |||
| 11 | |||
| 12 | typedef enum token_type token_type; | ||
| 13 | enum token_type | ||
| 14 | { | ||
| 15 | // first 255 tokens for ascii characters | ||
| 16 | TOKEN_UNDEFINED = 255, | ||
| 17 | TOKEN_IDENTIFIER, | ||
| 18 | TOKEN_VALUE, | ||
| 19 | }; | ||
| 20 | |||
| 21 | typedef struct token token; | ||
| 22 | struct token | ||
| 23 | { | ||
| 24 | string8 lexeme; | ||
| 25 | token_type type; | ||
| 26 | token_flags flags; | ||
| 27 | token *next; | ||
| 28 | }; | ||
| 29 | |||
| 30 | typedef struct csv_row csv_row; | ||
| 31 | struct csv_row | ||
| 32 | { | ||
| 33 | // array of size col_count, points into mmap buffer | ||
| 34 | string8 *fields; | ||
| 35 | s32 count; | ||
| 36 | }; | ||
| 37 | |||
| 38 | typedef struct csv_table csv_table; | ||
| 39 | struct csv_table | ||
| 40 | { | ||
| 41 | // first row, col names | ||
| 42 | // all data rows | ||
| 43 | string8 *headers; | ||
| 44 | csv_row *rows; | ||
| 45 | s32 col_count; | ||
| 46 | s32 row_count; | ||
| 47 | }; | ||
| 48 | |||
| 49 | read_only global_variable | ||
| 50 | csv_row nil_csv_row = | ||
| 51 | { | ||
| 52 | .fields = &nil_string, | ||
| 53 | .count = 0, | ||
| 54 | }; | ||
| 55 | |||
| 56 | read_only global_variable | ||
| 57 | csv_table nil_csv_table = | ||
| 58 | { | ||
| 59 | .headers = &nil_string, | ||
| 60 | .rows = &nil_csv_row, | ||
| 61 | .col_count = 0, | ||
| 62 | .row_count = 0, | ||
| 63 | }; | ||
| 64 | |||
| 65 | |||
| 66 | // the lexer acts as a table builder from a csv file | ||
| 67 | // and parsing indivudal rows and columns | ||
| 68 | // the next step would be building a the b-tree | ||
| 69 | internal token * | ||
| 70 | tokenize_csv(string8 buffer, mem_arena *arena) | ||
| 71 | { | ||
| 72 | |||
| 73 | b32 FL = TRUE; | ||
| 74 | |||
| 75 | if(buffer.size < 0) return NULL; | ||
| 76 | for(s32 index = 0; buffer.data[index] != '\0'; ++index) | ||
| 77 | { | ||
| 78 | token *tok = PushStruct(arena, token); | ||
| 79 | u8 point = buffer.data[index]; | ||
| 80 | |||
| 81 | s32 start = 0; | ||
| 82 | s32 end = 0; | ||
| 83 | |||
| 84 | if(is_whitespace(point)) | ||
| 85 | { | ||
| 86 | print("csv file is invalid"); | ||
| 87 | return NULL; | ||
| 88 | } | ||
| 89 | |||
| 90 | switch(point) | ||
| 91 | { | ||
| 92 | case('\n'): | ||
| 93 | { | ||
| 94 | if(FL) tok->flags |= END_FL; | ||
| 95 | break; | ||
| 96 | } | ||
| 97 | |||
| 98 | case(','): | ||
| 99 | { | ||
| 100 | end = index - 1; | ||
| 101 | start = index + 1; | ||
| 102 | break; | ||
| 103 | } | ||
| 104 | default: | ||
| 105 | { | ||
| 106 | break; | ||
| 107 | } | ||
| 108 | } | ||
| 109 | |||
| 110 | tok->lexeme = StringCast(&buffer.data[start], end - start); | ||
| 111 | tok->next = tok; | ||
| 112 | } | ||
| 113 | |||
| 114 | return NULL; | ||
| 115 | } | ||
| 116 | |||
| 117 | internal void | ||
| 118 | strip_new_line(string8 buffer) | ||
| 119 | { | ||
| 120 | |||
| 121 | for (u64 index = 0; index < buffer.size; index++) | ||
| 122 | { | ||
| 123 | |||
| 124 | } | ||
| 125 | |||
| 126 | return; | ||
| 127 | |||
| 128 | } | ||
| 129 | |||
| 130 | internal void | ||
| 131 | read_csv(string8 buffer) | ||
| 132 | { | ||
| 133 | // printf("\nsize:%lu\ndata %s\n", buffer.size, buffer.data); | ||
| 134 | |||
| 135 | } | ||
| 136 | |||
| 137 | internal csv_table * | ||
| 138 | parse_csv(token *tokens, csv_table *table) | ||
| 139 | { | ||
| 140 | |||
| 141 | return NULL; | ||
| 142 | } | ||
| 143 | |||
| 144 | |||
| 145 | #endif /* ENGINE_LEXER_H */ | ||
diff --git a/source/engine.c b/source/engine.c index fe609e9..3527e27 100644 --- a/source/engine.c +++ b/source/engine.c | |||
| @@ -1,17 +1,88 @@ | |||
| 1 | #define BASE_UNITY | 1 | #define BASE_UNITY |
| 2 | #include "base/base_include.h" | 2 | #include "base/base_include.h" |
| 3 | 3 | ||
| 4 | #include <stdio.h> | 4 | internal b32 |
| 5 | is_alpha(u8 point) | ||
| 6 | { | ||
| 7 | return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); | ||
| 8 | } | ||
| 9 | |||
| 10 | internal b32 | ||
| 11 | is_digit(u8 point) | ||
| 12 | { | ||
| 13 | return (point >= '0' && point <= '9'); | ||
| 14 | } | ||
| 5 | 15 | ||
| 6 | #include "csv_parser.h" | 16 | internal b32 |
| 7 | #include "lexer.h" | 17 | is_alpha_num(u8 point) |
| 18 | { | ||
| 19 | return (is_alpha(point) || is_digit(point)); | ||
| 20 | } | ||
| 21 | |||
| 22 | internal b32 | ||
| 23 | is_whitespace(u8 point) | ||
| 24 | { | ||
| 25 | return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); | ||
| 26 | } | ||
| 27 | |||
| 28 | internal b32 | ||
| 29 | is_delimiter(u8 point) | ||
| 30 | { | ||
| 31 | return (point == ','); | ||
| 32 | } | ||
| 33 | |||
| 34 | #include "csv_reader.h" | ||
| 35 | // #include "btree.h" | ||
| 36 | |||
| 37 | typedef struct query_token query_token; | ||
| 38 | struct query_token | ||
| 39 | { | ||
| 40 | string8 *lexeme; | ||
| 41 | query_token *next; | ||
| 42 | }; | ||
| 43 | |||
| 44 | |||
| 45 | // takes on line of the repl input | ||
| 46 | internal query_token * | ||
| 47 | query_tokenizer(mem_arena *arena, string8 *buffer) | ||
| 48 | { | ||
| 49 | query_token *tok = PushStruct(arena, query_token); | ||
| 50 | unused(tok); | ||
| 8 | 51 | ||
| 9 | #include "lexer.c" | 52 | for (u64 index = 0; index < buffer->size; ++index) |
| 10 | #include "csv_parser.c" | 53 | { |
| 54 | u8 codepoint = buffer->data[index]; | ||
| 55 | |||
| 56 | if(codepoint == '\n' || codepoint == '\r') break; | ||
| 57 | |||
| 58 | s32 start = 0; | ||
| 59 | s32 end = 0; | ||
| 60 | |||
| 61 | if(is_whitespace(codepoint)) | ||
| 62 | { | ||
| 63 | end = index; | ||
| 64 | } | ||
| 65 | |||
| 66 | s32 new_token_size = end - start; | ||
| 67 | |||
| 68 | tok->lexeme->data = &buffer->data[index]; | ||
| 69 | tok->lexeme->size = new_token_size; | ||
| 70 | |||
| 71 | tok->next = tok; | ||
| 72 | start = index + 1; | ||
| 73 | |||
| 74 | } | ||
| 75 | |||
| 76 | return tok; | ||
| 77 | } | ||
| 11 | 78 | ||
| 12 | int main(int c, char **v) | 79 | int main(int c, char **v) |
| 13 | { | 80 | { |
| 14 | if(c < 2) return -999; | 81 | if(c < 2) |
| 82 | { | ||
| 83 | print("bad file, setting default file\n"); | ||
| 84 | } | ||
| 85 | else v[1] = "./test/customers-10000.csv"; | ||
| 15 | 86 | ||
| 16 | local_persist b32 running = 1; | 87 | local_persist b32 running = 1; |
| 17 | 88 | ||
| @@ -26,13 +97,27 @@ int main(int c, char **v) | |||
| 26 | { | 97 | { |
| 27 | if (running) | 98 | if (running) |
| 28 | { | 99 | { |
| 29 | // TODO(nasr): check for return key | ||
| 30 | { | 100 | { |
| 31 | u8 line_buffer[256] = {}; | 101 | u8 line_buffer[256] = {}; |
| 32 | |||
| 33 | s64 codepoint = os_read(STDIN_FD, line_buffer, 256); | 102 | s64 codepoint = os_read(STDIN_FD, line_buffer, 256); |
| 34 | unused(codepoint); | 103 | unused(codepoint); |
| 104 | for(s32 index = 0; index < 256; ++index) | ||
| 105 | { | ||
| 106 | if(line_buffer[index] == '\n') | ||
| 107 | { | ||
| 108 | print("exiting"); | ||
| 109 | return 0; | ||
| 110 | } | ||
| 111 | else if(line_buffer[index] == ' ') | ||
| 112 | { | ||
| 35 | 113 | ||
| 114 | print("TODO(nasr): "); | ||
| 115 | } | ||
| 116 | |||
| 117 | } | ||
| 118 | } | ||
| 119 | |||
| 120 | { | ||
| 36 | read_csv(buffer); | 121 | read_csv(buffer); |
| 37 | token *tokens = tokenize_csv(buffer, global_arena); | 122 | token *tokens = tokenize_csv(buffer, global_arena); |
| 38 | global_table = parse_csv(tokens, global_table); | 123 | global_table = parse_csv(tokens, global_table); |
diff --git a/source/lexer.c b/source/lexer.c deleted file mode 100644 index decfb7a..0000000 --- a/source/lexer.c +++ /dev/null | |||
| @@ -1,80 +0,0 @@ | |||
| 1 | // the lexer acts as a table builder from a csv file | ||
| 2 | // and parsing indivudal rows and columns | ||
| 3 | // the next step would be building a the b-tree | ||
| 4 | internal b32 | ||
| 5 | is_alpha(u8 point) | ||
| 6 | { | ||
| 7 | return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_')); | ||
| 8 | } | ||
| 9 | |||
| 10 | internal b32 | ||
| 11 | is_digit(u8 point) | ||
| 12 | { | ||
| 13 | return (point >= '0' && point <= '9'); | ||
| 14 | } | ||
| 15 | |||
| 16 | internal b32 | ||
| 17 | is_alpha_num(u8 point) | ||
| 18 | { | ||
| 19 | return (is_alpha(point) || is_digit(point)); | ||
| 20 | } | ||
| 21 | |||
| 22 | internal b32 | ||
| 23 | is_whitespace(u8 point) | ||
| 24 | { | ||
| 25 | return (point == '\n' || point == '\r' || point == ' ' || point == '\t'); | ||
| 26 | } | ||
| 27 | |||
| 28 | internal b32 | ||
| 29 | is_delimiter(u8 point) | ||
| 30 | { | ||
| 31 | return (point == ','); | ||
| 32 | } | ||
| 33 | |||
| 34 | internal token * | ||
| 35 | tokenize_csv(string8 buffer, mem_arena *arena) | ||
| 36 | { | ||
| 37 | |||
| 38 | b32 FL = TRUE; | ||
| 39 | |||
| 40 | if(buffer.size < 0) return NULL; | ||
| 41 | for(s32 index = 0; buffer.data[index] != '\0'; ++index) | ||
| 42 | { | ||
| 43 | token *tok = PushStruct(arena, token); | ||
| 44 | u8 point = buffer.data[index]; | ||
| 45 | |||
| 46 | s32 start = 0; | ||
| 47 | s32 end = 0; | ||
| 48 | |||
| 49 | if(is_whitespace(point)) | ||
| 50 | { | ||
| 51 | print("csv file is invalid"); | ||
| 52 | return NULL; | ||
| 53 | } | ||
| 54 | |||
| 55 | switch(point) | ||
| 56 | { | ||
| 57 | case('\n'): | ||
| 58 | { | ||
| 59 | if(FL) tok->flags |= END_FL; | ||
| 60 | break; | ||
| 61 | } | ||
| 62 | |||
| 63 | case(','): | ||
| 64 | { | ||
| 65 | end = index - 1; | ||
| 66 | start = index + 1; | ||
| 67 | break; | ||
| 68 | } | ||
| 69 | default: | ||
| 70 | { | ||
| 71 | break; | ||
| 72 | } | ||
| 73 | } | ||
| 74 | |||
| 75 | tok->lexeme = String8Cast(&buffer.data[start], end - start); | ||
| 76 | tok->next = tok; | ||
| 77 | } | ||
| 78 | |||
| 79 | return NULL; | ||
| 80 | } | ||
diff --git a/source/lexer.h b/source/lexer.h deleted file mode 100644 index c950a58..0000000 --- a/source/lexer.h +++ /dev/null | |||
| @@ -1,30 +0,0 @@ | |||
| 1 | #ifndef ENGINE_LEXER_H | ||
| 2 | #define ENGINE_LEXER_H | ||
| 3 | |||
| 4 | typedef enum token_flags token_flags; | ||
| 5 | enum token_flags | ||
| 6 | { | ||
| 7 | START_FL = 1 << 1, | ||
| 8 | END_FL = 1 << 2, | ||
| 9 | }; | ||
| 10 | |||
| 11 | |||
| 12 | typedef enum token_type token_type; | ||
| 13 | enum token_type | ||
| 14 | { | ||
| 15 | // first 255 tokens for ascii characters | ||
| 16 | TOKEN_UNDEFINED = 255, | ||
| 17 | TOKEN_IDENTIFIER, | ||
| 18 | TOKEN_VALUE, | ||
| 19 | }; | ||
| 20 | |||
| 21 | typedef struct token token; | ||
| 22 | struct token | ||
| 23 | { | ||
| 24 | string8 lexeme; | ||
| 25 | token_type type; | ||
| 26 | token_flags flags; | ||
| 27 | token *next; | ||
| 28 | }; | ||
| 29 | |||
| 30 | #endif /* ENGINE_LEXER_H */ | ||
diff --git a/source/query.c b/source/query.c deleted file mode 100644 index e69de29..0000000 --- a/source/query.c +++ /dev/null | |||
diff --git a/source/query.h b/source/query.h deleted file mode 100644 index e69de29..0000000 --- a/source/query.h +++ /dev/null | |||
