improvement(main): worked on the lexer, close to finishing the tokenization

CSVs are simple
author: nasr <nsrddyn@gmail.com> 2026-03-06 18:53:23 +0000
committer: nasr <nsrddyn@gmail.com> 2026-03-06 18:53:23 +0000
commit: 2c9057b8f009bd39d97a2d30cf71135cb07c5e4b (patch)
tree: 64712280738eba2ce174aef50a380c9a3d862d35
parent: d8c52d6c408a172f1210c77df3e3a9629ea68dc6 (diff)
3 files changed, 64 insertions, 51 deletions
diff --git a/source/engine/engine.c b/source/engine/engine.c
index 05c143c..64b15bf 100644
--- a/source/engine/engine.c
+++ b/source/engine/engine.c
@@ -3,8 +3,7 @@
 #include <stdio.h>
-#include "../lexer/lexer.h"
-#include "../lexer/lexer.c"
 #include "../parser/parser.h"
 #include "../parser/parser.c"
@@ -15,24 +14,22 @@
 #include "../storage/csv_reader.h"
 #include "../storage/csv_reader.c"
+#include "../lexer/lexer.h"
+#include "../lexer/lexer.c"
 int main(int c, char **v)
 {
    if(c < 2) return -999;
-    string8 buffer = load_file(v[1]);
+    mem_arena *global_arena = arena_create(MiB(20));
-    // read_csv(buffer);
+    csv_table *global_table = PushStruct(global_arena, csv_table);
-    tokenize_csv(buffer);
+    string8 buffer = load_file(v[1]);
+    read_csv(buffer);
+    tokenize_csv(buffer, global_table, global_arena);
-    // for(;;)
-    // {
-    //     print("reading user input...");
-    //     // TODO(nasr): design a repl system
-    //
-    //     sleep(1);
-    // }
-    //
    return 0;
 }
diff --git a/source/lexer/lexer.c b/source/lexer/lexer.c
index 1c7ab38..948afd0 100644
--- a/source/lexer/lexer.c
+++ b/source/lexer/lexer.c
@@ -1,77 +1,97 @@
+// the lexer acts as a table builder from a csv file
+// and parsing individual rows and columns
+// the next step would be building the b-tree
 internal b32
 is_alpha(u8 point)
 {
-  return ((point >= 'a' && point <= 'z') ||
+    return ((point >= 'a' && point <= 'z') || (point >= 'A' && point <= 'Z') || (point == '_'));
-          (point >= 'A' && point <= 'Z') ||
-          (point == '_'));
 }
 internal b32
 is_digit(u8 point)
 {
-  return (point >= '0' && point <= '9');
+    return (point >= '0' && point <= '9');
 }
 internal b32
 is_alpha_num(u8 point)
 {
-  return (is_alpha(point) || is_digit(point));
+    return (is_alpha(point) || is_digit(point));
 }
 internal b32
 is_whitespace(u8 point)
 {
-  return (point == '\n' || point == '\r' ||
+    return (point == '\n' || point == '\r' || point == ' ' || point == '\t');
-          point == ' ' || point == '\t');
 }
 internal b32
 is_delimiter(u8 point)
 {
    return (point == ',');
 }
 internal token *
-tokenize_csv(string8 buffer)
+tokenize_csv(string8 buffer, csv_table *global_table, mem_arena *arena)
 {
    i32 count = 0;
    string8 **tokens = PushArray(arena, string8 *, buffer.size / 10);
+    b32 first_line = 1;
    if(buffer.size < 0) return NULL;
    for(i32 index = 0;
-         buffer.data[index] != '\0';
+        buffer.data[index] != '\0';
-         ++index)
+        ++index)
    {
-        string8 tokens = {0};
+        csv_row *row = PushStruct(arena, csv_row);
+        string8 token = {0};
        u8 point = buffer.data[index];
-        if(is_whitespace(point)) continue;
-        u8 *start = &buffer.data;
-        if(is_delimiter(point))
-        {
-        }
-        u8 *end = start - 1;
+        u8 *start = buffer.data;
+        u8 *end = NULL;
-        unused(start);
+        unused(row);
-        unused(end);
        switch (point)
        {
+            case '\n':
+                {
+                    first_line = -1;
+                    break;
+                }
+            case ',':
+                {
+                    end = start - 1;
+                    if (first_line)
+                    {
+                        global_table->headers = &token;
+                        ++global_table->headers;
+                        break;
+                    }
+                    else
+                    {
+                        break;
+                    }
+                }
            default:
                {
                    printf("point: %c\n", point);
                    count++;
+                    break;
                }
        }
+        token = (string8){
+            .data = start,
+            .size = end - start,
+        };
+        **tokens = token;
+        ++*tokens;
    }
    printf("%d", count);
diff --git a/source/repl/repl.c b/source/repl/repl.c
index 4c57345..dd289d8 100644
--- a/source/repl/repl.c
+++ b/source/repl/repl.c
@@ -1,16 +1,12 @@
-#ifndef ENGINE_REPL_H
+internal void
-#define ENGINE_REPL_H
+init_repl()
-typedef struct node node;
-struct node
-{
-};
-typedef struct btree btree;
-struct btree
 {
+    for(;;)
+    {
+        print("reading user input...");
+        // TODO(nasr): design a repl system
-};
+        sleep(1);
+    }
-#endif /* ENGINE_H */
+}
author	nasr <nsrddyn@gmail.com>	2026-03-06 18:53:23 +0000
committer	nasr <nsrddyn@gmail.com>	2026-03-06 18:53:23 +0000
commit	2c9057b8f009bd39d97a2d30cf71135cb07c5e4b (patch)
tree	64712280738eba2ce174aef50a380c9a3d862d35
parent	d8c52d6c408a172f1210c77df3e3a9629ea68dc6 (diff)

diff --git a/source/engine/engine.c b/source/engine/engine.c index 05c143c..64b15bf 100644 --- a/source/engine/engine.c +++ b/source/engine/engine.c
@@ -3,8 +3,7 @@
3		3
4	#include <stdio.h>	4	#include <stdio.h>
5		5
6	#include "../lexer/lexer.h"	6
7	#include "../lexer/lexer.c"
8		7
9	#include "../parser/parser.h"	8	#include "../parser/parser.h"
10	#include "../parser/parser.c"	9	#include "../parser/parser.c"
@@ -15,24 +14,22 @@
15	#include "../storage/csv_reader.h"	14	#include "../storage/csv_reader.h"
16	#include "../storage/csv_reader.c"	15	#include "../storage/csv_reader.c"
17		16
		17	#include "../lexer/lexer.h"
		18	#include "../lexer/lexer.c"
		19
		20
18		21
19	int main(int c, char **v)	22	int main(int c, char **v)
20	{	23	{
21	if(c < 2) return -999;	24	if(c < 2) return -999;
22		25
23	string8 buffer = load_file(v[1]);	26	mem_arena *global_arena = arena_create(MiB(20));
24	// read_csv(buffer);	27	csv_table *global_table = PushStruct(global_arena, csv_table);
25	tokenize_csv(buffer);
26		28
		29	string8 buffer = load_file(v[1]);
		30	read_csv(buffer);
		31	tokenize_csv(buffer, global_table, global_arena);
27		32
28	// for(;;)
29	// {
30	// print("reading user input...");
31	// // TODO(nasr): design a repl system
32	//
33	// sleep(1);
34	// }
35	//
36		33
37	return 0;	34	return 0;
38	}	35	}


diff --git a/source/lexer/lexer.c b/source/lexer/lexer.c index 1c7ab38..948afd0 100644 --- a/source/lexer/lexer.c +++ b/source/lexer/lexer.c
@@ -1,77 +1,97 @@
		1	// the lexer acts as a table builder from a csv file
		2	// and parsing individual rows and columns
		3	// the next step would be building the b-tree
1	internal b32	4	internal b32
2	is_alpha(u8 point)	5	is_alpha(u8 point)
3	{	6	{
4	return ((point >= 'a' && point <= 'z') \|\|	7	return ((point >= 'a' && point <= 'z') \|\| (point >= 'A' && point <= 'Z') \|\| (point == '_'));
5	(point >= 'A' && point <= 'Z') \|\|
6	(point == '_'));
7	}	8	}
8		9
9	internal b32	10	internal b32
10	is_digit(u8 point)	11	is_digit(u8 point)
11	{	12	{
12	return (point >= '0' && point <= '9');	13	return (point >= '0' && point <= '9');
13	}	14	}
14		15
15	internal b32	16	internal b32
16	is_alpha_num(u8 point)	17	is_alpha_num(u8 point)
17	{	18	{
18	return (is_alpha(point) \|\| is_digit(point));	19	return (is_alpha(point) \|\| is_digit(point));
19	}	20	}
20		21
21	internal b32	22	internal b32
22	is_whitespace(u8 point)	23	is_whitespace(u8 point)
23	{	24	{
24	return (point == '\n' \|\| point == '\r' \|\|	25	return (point == '\n' \|\| point == '\r' \|\| point == ' ' \|\| point == '\t');
25	point == ' ' \|\| point == '\t');
26	}	26	}
27		27
28	internal b32	28	internal b32
29	is_delimiter(u8 point)	29	is_delimiter(u8 point)
30	{	30	{
31
32	return (point == ',');	31	return (point == ',');
33
34	}	32	}
35		33
36	internal token *	34	internal token *
37	tokenize_csv(string8 buffer)	35	tokenize_csv(string8 buffer, csv_table global_table, mem_arena arena)
38	{	36	{
39	i32 count = 0;	37	i32 count = 0;
40	string8 *tokens = PushArray(arena, string8 , buffer.size / 10);	38	string8 *tokens = PushArray(arena, string8 , buffer.size / 10);
		39	b32 first_line = 1;
41		40
42	if(buffer.size < 0) return NULL;	41	if(buffer.size < 0) return NULL;
43	for(i32 index = 0;	42	for(i32 index = 0;
44	buffer.data[index] != '\0';	43	buffer.data[index] != '\0';
45	++index)	44	++index)
46	{	45	{
47	string8 tokens = {0};	46	csv_row *row = PushStruct(arena, csv_row);
		47	string8 token = {0};
48		48
49	u8 point = buffer.data[index];	49	u8 point = buffer.data[index];
50	if(is_whitespace(point)) continue;
51
52	u8 *start = &buffer.data;
53
54	if(is_delimiter(point))
55	{
56
57
58	}
59		50
60	u8 *end = start - 1;	51	u8 *start = buffer.data;
		52	u8 *end = NULL;
61		53
62	unused(start);	54	unused(row);
63	unused(end);
64		55
65	switch (point)	56	switch (point)
66	{	57	{
		58	case '\n':
		59	{
		60	first_line = -1;
		61	break;
		62	}
		63	case ',':
		64	{
		65	end = start - 1;
		66
		67	if (first_line)
		68	{
		69	global_table->headers = &token;
		70	++global_table->headers;
		71	break;
		72	}
		73	else
		74	{
		75
		76	break;
		77	}
		78	}
67		79
68	default:	80	default:
69	{	81	{
70	printf("point: %c\n", point);	82	printf("point: %c\n", point);
71	count++;	83	count++;
		84	break;
72	}	85	}
73	}	86	}
74		87
		88	token = (string8){
		89	.data = start,
		90	.size = end - start,
		91	};
		92
		93	**tokens = token;
		94	++*tokens;
75	}	95	}
76		96
77	printf("%d", count);	97	printf("%d", count);


diff --git a/source/repl/repl.c b/source/repl/repl.c index 4c57345..dd289d8 100644 --- a/source/repl/repl.c +++ b/source/repl/repl.c
@@ -1,16 +1,12 @@
1	#ifndef ENGINE_REPL_H	1	internal void
2	#define ENGINE_REPL_H	2	init_repl()
3
4	typedef struct node node;
5	struct node
6	{
7
8	};
9
10	typedef struct btree btree;
11	struct btree
12	{	3	{
		4	for(;;)
		5	{
		6	print("reading user input...");
		7	// TODO(nasr): design a repl system
13		8
14	};	9	sleep(1);
		10	}
15		11
16	#endif /* ENGINE_H */	12	}