diff options
Diffstat (limited to 'source/fajr_lexer')
| -rw-r--r-- | source/fajr_lexer/fajr_lexer.c | 274 | ||||
| -rw-r--r-- | source/fajr_lexer/fajr_lexer.h | 119 |
2 files changed, 393 insertions, 0 deletions
diff --git a/source/fajr_lexer/fajr_lexer.c b/source/fajr_lexer/fajr_lexer.c new file mode 100644 index 0000000..3a98ca9 --- /dev/null +++ b/source/fajr_lexer/fajr_lexer.c | |||
| @@ -0,0 +1,274 @@ | |||
| 1 | internal inline b32 | ||
| 2 | IsAlpha(u8 Character) | ||
| 3 | { | ||
| 4 | return ((Character >= 'a' && Character <= 'z') || | ||
| 5 | (Character >= 'A' && Character <= 'Z') || | ||
| 6 | (Character == '_')); | ||
| 7 | } | ||
| 8 | |||
| 9 | internal inline b32 | ||
| 10 | IsDigit(u8 Character) | ||
| 11 | { | ||
| 12 | return (Character >= '0' && Character <= '9'); | ||
| 13 | } | ||
| 14 | |||
| 15 | internal b32 | ||
| 16 | IsDelimiter(u8 Character) | ||
| 17 | { | ||
| 18 | for(i32 Index = 0; Index < (i32)sizeof(Delimiters); ++Index) | ||
| 19 | { | ||
| 20 | if(Delimiters[Index] == Character) | ||
| 21 | { | ||
| 22 | return 1; | ||
| 23 | } | ||
| 24 | } | ||
| 25 | return 0; | ||
| 26 | } | ||
| 27 | |||
| 28 | internal inline b32 | ||
| 29 | IsNilTokenNode(token_node *TokenNode) | ||
| 30 | { | ||
| 31 | return TokenNode == &nil_token_node || TokenNode == NULL; | ||
| 32 | } | ||
| 33 | |||
| 34 | internal inline b32 | ||
| 35 | IsNilToken(token *Token) | ||
| 36 | { | ||
| 37 | return Token == &nil_token || Token == NULL; | ||
| 38 | } | ||
| 39 | |||
| 40 | internal inline b32 | ||
| 41 | IsWhiteSpace(u8 Character) | ||
| 42 | { | ||
| 43 | return (Character == '\n' || Character == '\r' || | ||
| 44 | Character == ' ' || Character == '\t'); | ||
| 45 | } | ||
| 46 | |||
// Stub: skipping/lexing of C-style (/* ... */) block comments is not yet
// implemented.  Buffer is currently unused.
internal inline void
ParseCStyleComment(u8 Buffer[])
{
    // TODO(nasr): handle C-style comments.
    // Comment text could be useful for function-information visualization,
    // so think of a way to link comments to functions and variables —
    // some sort of per-token metadata?  Then a visualization could be shown
    // whenever the metadata string's count is greater than zero, and
    // nothing is shown otherwise.

    // TODO(nasr): while doing this, editor-specific annotations could also
    // be added.
}
| 60 | |||
// Stub: skipping/lexing of C++-style (// ...) line comments is not yet
// implemented.  Buffer is currently unused.
internal inline void
ParseCPPStyleComment(u8 Buffer[])
{
    // TODO(nasr): implement line-comment handling.
}
| 66 | |||
| 67 | internal inline b32 | ||
| 68 | Is_TokenBreak(u8 Character) | ||
| 69 | { | ||
| 70 | return (IsWhiteSpace(Character) || IsDelimiter(Character)); | ||
| 71 | } | ||
| 72 | |||
| 73 | internal token_list * | ||
| 74 | Lex(string8 *Buffer, mem_arena *Arena, token_list *List) | ||
| 75 | { | ||
| 76 | b32 Initialized = 0; | ||
| 77 | i32 Line = 1; | ||
| 78 | i32 Column = 1; | ||
| 79 | |||
| 80 | for(u64 TextIndex = 0; TextIndex < Buffer->size; TextIndex++) | ||
| 81 | { | ||
| 82 | u8 Character = Buffer->data[TextIndex]; | ||
| 83 | |||
| 84 | if(Character == '\r' || Character == '\n') | ||
| 85 | { | ||
| 86 | if(Character == '\r' && | ||
| 87 | (TextIndex + 1 < Buffer->size) && | ||
| 88 | Buffer->data[TextIndex + 1] == '\n') | ||
| 89 | { | ||
| 90 | TextIndex++; | ||
| 91 | } | ||
| 92 | |||
| 93 | ++TextIndex; | ||
| 94 | ++Line; | ||
| 95 | |||
| 96 | // NOTE(nasr): reset the column to the beginning of the line | ||
| 97 | Column = 1; | ||
| 98 | continue; | ||
| 99 | } | ||
| 100 | |||
| 101 | if(IsWhiteSpace(Character)) | ||
| 102 | { | ||
| 103 | ++Column; | ||
| 104 | continue; | ||
| 105 | } | ||
| 106 | |||
| 107 | token_node *TokenNode = PushStruct(Arena, token_node); | ||
| 108 | token *Token = PushStruct(Arena, token); | ||
| 109 | TokenNode->Next = &nil_token_node; | ||
| 110 | TokenNode->Previous = &nil_token_node; | ||
| 111 | TokenNode->Token = Token; | ||
| 112 | Token->Line = Line; | ||
| 113 | Token->Column = Column; | ||
| 114 | Token->ByteOffset = (u64)TextIndex; | ||
| 115 | Token->Flags = FlagNone; | ||
| 116 | |||
| 117 | u64 TokenStart = TextIndex; | ||
| 118 | u64 TokenEnd = TextIndex; | ||
| 119 | |||
| 120 | if(Character > 126) | ||
| 121 | { | ||
| 122 | Token->Type = TokenUnwantedChild; | ||
| 123 | TokenEnd = TextIndex + 1; | ||
| 124 | } | ||
| 125 | else if(IsAlpha(Character)) | ||
| 126 | { | ||
| 127 | while((TextIndex + 1 < Buffer->size) && | ||
| 128 | (IsAlpha(Buffer->data[TextIndex + 1]) || IsDigit(Buffer->data[TextIndex + 1]))) | ||
| 129 | { | ||
| 130 | ++TextIndex; | ||
| 131 | } | ||
| 132 | |||
| 133 | // TODO(nasr): build a lexeme | ||
| 134 | TokenEnd = TextIndex + 1; | ||
| 135 | string8 Lexeme = { | ||
| 136 | .data = (u8 *)Buffer->data, | ||
| 137 | .size = (u64)Buffer->data | ||
| 138 | } | ||
| 139 | ; | ||
| 140 | |||
| 141 | // TODO(nasr): handle functions | ||
| 142 | if(string8_cmp(Lexeme, StringLit("func"), 0)) | ||
| 143 | Token->Type = TokenIf; | ||
| 144 | else if(string8_cmp(Lexeme, StringLit("if"), 0)) | ||
| 145 | Token->Type = TokenElse; | ||
| 146 | else if(string8_cmp(Lexeme, StringLit("return"), 0)) | ||
| 147 | Token->Type = TokenReturn; | ||
| 148 | else if(string8_cmp(Lexeme, StringLit("while"), 0)) | ||
| 149 | Token->Type = TokenWhile; | ||
| 150 | else if(string8_cmp(Lexeme, StringLit("for"), 0)) | ||
| 151 | Token->Type = TokenFor; | ||
| 152 | else if(string8_cmp(Lexeme, StringLit("break"), 0)) | ||
| 153 | Token->Type = TokenBreak; | ||
| 154 | else if(string8_cmp(Lexeme, StringLit("continue"), 0)) | ||
| 155 | Token->Type = TokenContinue; | ||
| 156 | else | ||
| 157 | Token->Type = TokenIdentifier; | ||
| 158 | } | ||
| 159 | else if(IsDigit(Character)) | ||
| 160 | { | ||
| 161 | while((TextIndex + 1 < Buffer->size) && | ||
| 162 | IsDigit(Buffer->data[TextIndex + 1])) | ||
| 163 | { | ||
| 164 | ++TextIndex; | ||
| 165 | } | ||
| 166 | |||
| 167 | TokenEnd = TextIndex + 1; | ||
| 168 | Token->Type = TokenNumber; | ||
| 169 | } | ||
| 170 | |||
| 171 | else | ||
| 172 | { | ||
| 173 | u8 Next = (TextIndex + 1 < Buffer->size) ? Buffer->data[TextIndex + 1] : 0; | ||
| 174 | |||
| 175 | switch(Character) | ||
| 176 | { | ||
| 177 | case '=': | ||
| 178 | { | ||
| 179 | if(Next == '=') | ||
| 180 | { | ||
| 181 | Token->Type = TokenDoubleEqual; | ||
| 182 | TextIndex++; | ||
| 183 | } | ||
| 184 | else | ||
| 185 | { | ||
| 186 | Token->Type = (token_type)'='; | ||
| 187 | } | ||
| 188 | } | ||
| 189 | break; | ||
| 190 | |||
| 191 | case '>': | ||
| 192 | { | ||
| 193 | if(Next == '=') | ||
| 194 | { | ||
| 195 | Token->Type = TokenGreaterEqual; | ||
| 196 | TextIndex++; | ||
| 197 | } | ||
| 198 | else if(Next == '>') | ||
| 199 | { | ||
| 200 | Token->Type = TokenRightShift; | ||
| 201 | TextIndex++; | ||
| 202 | } | ||
| 203 | else | ||
| 204 | { | ||
| 205 | Token->Type = (token_type)'>'; | ||
| 206 | } | ||
| 207 | } | ||
| 208 | break; | ||
| 209 | |||
| 210 | case '<': | ||
| 211 | { | ||
| 212 | if(Next == '=') | ||
| 213 | { | ||
| 214 | Token->Type = TokenLesserEqual; | ||
| 215 | TextIndex++; | ||
| 216 | } | ||
| 217 | else if(Next == '<') | ||
| 218 | { | ||
| 219 | Token->Type = TokenLeftShift; | ||
| 220 | TextIndex++; | ||
| 221 | } | ||
| 222 | else | ||
| 223 | { | ||
| 224 | Token->Type = (token_type)'<'; | ||
| 225 | } | ||
| 226 | } | ||
| 227 | break; | ||
| 228 | |||
| 229 | case '"': | ||
| 230 | { | ||
| 231 | while(Buffer->data[TextIndex + 1] != '"' && Buffer->data[TextIndex + 1] != '\0') | ||
| 232 | { | ||
| 233 | ++TextIndex; | ||
| 234 | if(Buffer->data[TextIndex + 1] == '\\') | ||
| 235 | |||
| 236 | ++TextIndex; | ||
| 237 | } | ||
| 238 | |||
| 239 | TokenStart += 1; | ||
| 240 | Token->Type = TokenString; | ||
| 241 | } | ||
| 242 | break; | ||
| 243 | default: | ||
| 244 | { | ||
| 245 | Token->Type = (token_type)Character; | ||
| 246 | } | ||
| 247 | break; | ||
| 248 | } | ||
| 249 | } | ||
| 250 | |||
| 251 | TokenEnd = TextIndex + 1; | ||
| 252 | |||
| 253 | Token->Lexeme.data = (u8 *)&Buffer->data[TokenStart]; | ||
| 254 | Token->Lexeme.size = (u64)(TokenEnd - TokenStart); | ||
| 255 | Column += (i32)Token->Lexeme.size; | ||
| 256 | |||
| 257 | //Log("Token: \t%.lu*s\n", Token->Lexeme.Size, Token->Lexeme.Data); | ||
| 258 | |||
| 259 | if(!Initialized) | ||
| 260 | { | ||
| 261 | Initialized = 1; | ||
| 262 | List->Root = TokenNode; | ||
| 263 | List->Current = TokenNode; | ||
| 264 | } | ||
| 265 | else | ||
| 266 | { | ||
| 267 | TokenNode->Previous = List->Current; | ||
| 268 | List->Current->Next = TokenNode; | ||
| 269 | List->Current = TokenNode; | ||
| 270 | } | ||
| 271 | } | ||
| 272 | |||
| 273 | return List; | ||
| 274 | } | ||
diff --git a/source/fajr_lexer/fajr_lexer.h b/source/fajr_lexer/fajr_lexer.h new file mode 100644 index 0000000..754b89a --- /dev/null +++ b/source/fajr_lexer/fajr_lexer.h | |||
| @@ -0,0 +1,119 @@ | |||
| 1 | #ifndef FAJR_LEXER_H | ||
| 2 | #define FAJR_LEXER_H | ||
| 3 | |||
// Token classification produced by Lex().  Named values start at 256 so any
// single ASCII character can be stored directly as its own token_type via a
// cast (see the `(token_type)'='` fallbacks in fajr_lexer.c).
typedef enum token_type token_type;
enum token_type
{
    TokenUndefined = 256,           // no classification / nil token
    TokenIdentifier,                // [A-Za-z_][A-Za-z0-9_]* that is not a keyword
    TokenIdentifierAssignmentValue, // NOTE(review): unused in the visible lexer — verify intent
    TokenValue,
    TokenString,                    // quoted string literal
    TokenNumber,                    // run of decimal digits
    TokenDoubleEqual,               // ==
    TokenGreaterEqual,              // >=
    TokenLesserEqual,               // <=
    TokenParam,
    // Keyword token types (matched against lexemes in Lex()).
    TokenFunc,
    TokenReturn,
    TokenIf,
    TokenElse,
    TokenFor,
    TokenWhile,
    TokenBreak,
    TokenContinue,
    TokenExpression,                // presumably parser-level; not emitted by Lex()
    TokenFuncBody,                  // presumably parser-level; not emitted by Lex()
    TokenUnwantedChild,             // byte outside printable ASCII (> 126)
    TokenNewLine,
    TokenRightShift,                // >>
    TokenLeftShift,                 // <<
    TokenStar,
};
| 33 | |||
// Cursor position while tokenizing (1-based).
// NOTE(review): not referenced by the visible lexer code, which tracks
// Line/Column as locals in Lex() — confirm whether this is still needed.
typedef struct Tokenizer Tokenizer;
struct Tokenizer
{
    i32 Line;   // current line, starting at 1
    i32 Column; // current column, starting at 1
};
| 40 | |||
// Bit flags attached to a token; combine with bitwise OR.
// Lex() currently only ever assigns FlagNone.
typedef enum token_flags token_flags;
enum token_flags
{
    FlagNone       = (0),
    FlagConstant   = (1 << 0),
    FlagGlobal     = (1 << 1),
    FlagsValue     = (1 << 2), // NOTE(review): name breaks the Flag* convention — consider FlagValue
    FlagDefinition = (1 << 3),
    FlagComparison = (1 << 4),
    FlagDeprecated = (1 << 5),
    FlagDirty      = (1 << 6),
};
| 53 | |||
// One lexed token.  Lexeme points into the source buffer handed to Lex()
// (no copy is made), so the source buffer must outlive the token.
typedef struct token token;
struct token
{
    string8     Lexeme;     // slice of the source text for this token
    token_type  Type;       // classification (keyword, literal, operator, ...)
    token_flags Flags;      // bit flags; FlagNone by default
    u64         ByteOffset; // byte index of the token's first character in the source
    i32         Column;     // 1-based column of the token's first character
    i32         Line;       // 1-based line of the token's first character

    // Free-form annotation text (e.g. from nearby comments).
    // NOTE(review): not populated by the visible lexer code yet.
    string8     MetaData;
};
| 66 | |||
// Doubly-linked list node wrapping one token.  Unlinked ends point at the
// shared nil_token_node sentinel rather than NULL (see Lex()).
typedef struct token_node token_node;
struct token_node
{
    token_node *Next;     // next node, or &nil_token_node at the tail
    token_node *Previous; // previous node, or &nil_token_node at the head
    token      *Token;    // the token carried by this node
};
| 74 | |||
// Token stream produced by Lex(): Root is the first node appended,
// Current the most recently appended (the tail during lexing).
typedef struct token_list token_list;
struct token_list
{
    token_node *Root;    // first token node
    token_node *Current; // last token node appended so far
};
| 81 | |||
// Lexer state bundle.
// NOTE(review): not referenced by the visible lexer code, which takes a
// string8 buffer directly — confirm whether this struct is still planned.
typedef struct lexer lexer;
struct lexer
{
    u8  *Text;            // source text being lexed
    u64  TextCount;       // number of bytes in Text
    u8  *EndOfFile;       // presumably one past the last byte — verify against usage
    u8  *UndefinedTokens; // presumably a scratch list of unrecognized input — verify
};
| 90 | |||
// Single-byte delimiters that terminate a token (scanned by IsDelimiter()).
// sizeof(Delimiters) doubles as the element count since entries are u8.
global_variable const u8 Delimiters[] =
{
    '{',
    '}',
    '(',
    ')',
    '[',
    ']',
    ';',
};
| 101 | |||
// Shared read-only "no token" sentinel; compared against by IsNilToken().
read_only global_variable token nil_token =
{
    .Lexeme     = {NULL, 0},
    .Type       = TokenUndefined,
    .Flags      = FlagNone,
    .ByteOffset = 0,
    .Column     = 0,
    .Line       = 0,
};
| 111 | |||
// Shared read-only "no node" sentinel; its links point at itself so list
// walks can terminate on it safely.  Compared against by IsNilTokenNode().
read_only global_variable token_node nil_token_node =
{
    .Next     = &nil_token_node,
    .Previous = &nil_token_node,
    .Token    = NULL,
};
| 118 | |||
| 119 | #endif // FAJR_LEXER_H | ||
