Files
rifle/core/lexer.c
2026-02-24 22:32:54 -05:00

538 lines
10 KiB
C

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <unistd.h>
#include "rifle/lexer.h"
#include "rifle/trace.h"
/*
 * Test whether a character is whitespace that the lexer may skip.
 *
 * Tab, carriage return, form feed and space qualify. A newline does
 * not: it is reported as false here.
 *
 * @c: Character to test
 *
 * Returns true if @c is one of the whitespace characters above
 */
static inline bool
lexer_is_ws(char c)
{
    return c == ' ' || c == '\t' || c == '\r' || c == '\f';
}
/*
 * Test whether a character is the letter of a numbering system
 * prefix, i.e. the 'x' of '0x' or the 'o' of '0o'.
 *
 * Only the lowercase letters are recognized.
 *
 * @c: Character to test
 *
 * Returns true if @c is 'x' or 'o'
 */
static inline bool
lexer_is_num_prefix(char c)
{
    return c == 'x' || c == 'o';
}
/*
 * Store a character in the single-slot putback buffer so that it
 * can be handed out again later. Any character already stored is
 * overwritten.
 *
 * @state: Compiler state (ignored if NULL)
 * @c: Character to store
 */
static inline void
lexer_putback(struct rifle_state *state, char c)
{
    if (state != NULL) {
        state->putback = c;
    }
}
/*
 * Take the character held in the putback buffer and clear the
 * buffer.
 *
 * @state: Compiler state
 *
 * Returns the stored character. '\0' is returned when @state is
 * NULL, and is also what an empty buffer holds.
 */
static inline char
lexer_putback_pop(struct rifle_state *state)
{
    char held;

    if (state == NULL)
        return '\0';

    held = state->putback;
    state->putback = '\0';      /* mark the slot as empty */
    return held;
}
/*
 * Fetch the next byte of input, first from the putback buffer and
 * then from the input file descriptor.
 *
 * @state: Compiler state
 * @keep_ws: If true, whitespace characters are returned like any
 *           other byte; if false they are silently discarded
 *
 * Returns the byte on success. '\0' is returned when @state is NULL
 * or when read() yields no more data (end of input or error).
 */
static char
lexer_nom(struct rifle_state *state, bool keep_ws)
{
    char ch;

    if (state == NULL)
        return '\0';

    /*
     * A pending putback character takes priority over the file,
     * unless it is whitespace that the caller asked us to skip.
     */
    ch = lexer_putback_pop(state);
    if (ch != '\0' && (keep_ws || !lexer_is_ws(ch)))
        return ch;

    for (;;) {
        if (read(state->in_fd, &ch, 1) <= 0)
            return '\0';
        if (keep_ws || !lexer_is_ws(ch))
            return ch;
    }
}
/*
 * Scan an identifier from the input source file.
 *
 * An identifier is a run of alphanumeric characters, '.' and '_'.
 * The first character that does not belong to the identifier is
 * placed in the putback buffer.
 *
 * @state: Compiler state
 * @lc: Last character (first character of the identifier)
 * @tok: Token to fill in; on success its type is TT_IDENT and
 *       its string is the copy made by ptrbox_strdup()
 *
 * Returns zero on success. Returns -1 on bad arguments, if @lc
 * cannot start an identifier, on allocation failure, or if the
 * input ends before a terminating character is seen.
 */
static int
lexer_scan_ident(struct rifle_state *state, int lc, struct token *tok)
{
    char *buf, *grown, c;
    size_t bufcap, bufsz;

    if (state == NULL || tok == NULL) {
        return -1;
    }

    /*
     * Validate before allocating so that the rejection path has
     * nothing to release. The <ctype.h> functions require a value
     * representable as unsigned char, hence the casts.
     */
    if (!isalnum((unsigned char)lc) && lc != '.' && lc != '_') {
        return -1;
    }

    bufcap = 8;
    bufsz = 0;
    if ((buf = malloc(bufcap)) == NULL) {
        return -1;
    }

    buf[bufsz++] = (char)lc;
    for (;;) {
        c = lexer_nom(state, true);
        if (c == '\0') {
            free(buf);
            return -1;
        }

        if (!isalnum((unsigned char)c) && c != '.' && c != '_') {
            lexer_putback(state, c);
            break;
        }

        buf[bufsz++] = c;

        /* Always keep one spare byte for the terminator */
        if (bufsz >= bufcap - 1) {
            bufcap += 8;
            grown = realloc(buf, bufcap);
            if (grown == NULL) {
                /* realloc() left the old block allocated */
                free(buf);
                return -1;
            }
            buf = grown;
        }
    }

    buf[bufsz] = '\0';
    tok->s = ptrbox_strdup(&state->ptrbox, buf);
    free(buf);

    /* NOTE(review): assumes ptrbox_strdup() returns NULL on failure - confirm */
    if (tok->s == NULL) {
        return -1;
    }

    tok->type = TT_IDENT;
    return 0;
}
/*
 * Promote an identifier token to a directive token when its text
 * is one of the known directives: ".f", ".extern", ".struct" or
 * ".pub". Any other token is left untouched.
 *
 * @state: Compiler state
 * @tok: Token to inspect and possibly retype
 */
static void
lexer_check_direc(struct rifle_state *state, struct token *tok)
{
    const char *name;

    if (state == NULL || tok == NULL)
        return;
    if (tok->type != TT_IDENT)
        return;

    name = tok->s;
    if (strcmp(name, ".f") == 0)
        tok->type = TT_F;
    else if (strcmp(name, ".extern") == 0)
        tok->type = TT_EXTERN;
    else if (strcmp(name, ".struct") == 0)
        tok->type = TT_STRUCT;
    else if (strcmp(name, ".pub") == 0)
        tok->type = TT_PUB;
}
/*
 * Promote an identifier token to a keyword token when its text is
 * a reserved word. Identifiers beginning with '.' are handed to
 * lexer_check_direc() instead. Any other token is left untouched.
 *
 * @state: Compiler state
 * @tok: Token to inspect and possibly retype
 */
static void
lexer_check_kw(struct rifle_state *state, struct token *tok)
{
    /* Reserved words and the token type each one maps to */
    static const struct {
        const char *word;
        int type;
    } keywords[] = {
        { "u8",     TT_U8     },
        { "u16",    TT_U16    },
        { "u32",    TT_U32    },
        { "u64",    TT_U64    },
        { "void",   TT_VOID   },
        { "return", TT_RETURN },
        { "loop",   TT_LOOP   },
        { "break",  TT_BREAK  },
    };
    const size_t nkeywords = sizeof(keywords) / sizeof(keywords[0]);
    size_t i;

    if (state == NULL || tok == NULL)
        return;
    if (tok->type != TT_IDENT)
        return;

    if (tok->s[0] == '.') {
        lexer_check_direc(state, tok);
        return;
    }

    for (i = 0; i < nkeywords; ++i) {
        if (strcmp(tok->s, keywords[i].word) == 0) {
            tok->type = keywords[i].type;
            return;
        }
    }
}
/*
 * Require that an identifier token names a preprocessor directive
 * ("define", "ifdef", "ifndef" or "endif") and retype it to the
 * matching directive token. Used for the identifier that follows
 * a '#'.
 *
 * @state: Compiler state
 * @tok: Identifier token to check
 *
 * Returns zero on success. Returns -1 on bad arguments or when the
 * token is not an identifier; an identifier that names no known
 * directive additionally reports an error through trace_error().
 */
static int
lexer_assert_preproc(struct rifle_state *state, struct token *tok)
{
    /* Directive names and the token type each one maps to */
    static const struct {
        const char *name;
        int type;
    } directives[] = {
        { "define", TT_DEFINE },
        { "ifdef",  TT_IFDEF  },
        { "ifndef", TT_IFNDEF },
        { "endif",  TT_ENDIF  },
    };
    const size_t ndirectives = sizeof(directives) / sizeof(directives[0]);
    size_t i;

    if (state == NULL || tok == NULL)
        return -1;
    if (tok->type != TT_IDENT)
        return -1;

    for (i = 0; i < ndirectives; ++i) {
        if (strcmp(tok->s, directives[i].name) == 0) {
            tok->type = directives[i].type;
            return 0;
        }
    }

    trace_error(state, "bad preprocessor directive\n");
    return -1;
}
/*
 * Convert a digit character to its numeric value.
 *
 * @c: Character to convert
 *
 * Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F', otherwise -1.
 */
static int
lexer_digit_value(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return (c - 'a') + 10;
    if (c >= 'A' && c <= 'F')
        return (c - 'A') + 10;
    return -1;
}

/*
 * Scan a numeric literal.
 *
 * A literal is decimal unless it starts with '0x' (base 16) or
 * '0o' (base 8). Underscores between digits are ignored. The first
 * character that is not part of the literal is placed in the
 * putback buffer.
 *
 * @state: Compiler state
 * @lc: Last character (first digit of the literal)
 * @tok: Token result; on success its type is TT_NUMBER and its
 *       value is the parsed number
 *
 * Returns zero on success. Returns -1 on bad arguments, if @lc is
 * not a decimal digit, if a digit is not valid for the radix, if
 * the value does not fit in 64 bits, or if the input ends before a
 * terminating character is seen.
 */
static int
lexer_scan_num(struct rifle_state *state, int lc, struct token *tok)
{
    uint64_t value;
    unsigned int radix;
    int digit;
    char c;

    if (state == NULL || tok == NULL) {
        return -1;
    }

    if (lc < '0' || lc > '9') {
        return -1;
    }

    radix = 10;
    value = (uint64_t)(lc - '0');

    /*
     * Obtain the prefix if any. Whitespace is kept while reading so
     * that "0 x10" or "0 1" is not glued together into one literal.
     */
    if (lc == '0') {
        if ((c = lexer_nom(state, true)) == '\0')
            return -1;

        if (lexer_is_num_prefix(c)) {
            radix = (c == 'x') ? 16 : 8;

            /* At least one valid digit must follow the prefix */
            if ((c = lexer_nom(state, true)) == '\0')
                return -1;

            digit = lexer_digit_value(c);
            if (digit < 0 || (unsigned int)digit >= radix) {
                trace_error(state, "invalid digit '%c' in base-%u number\n",
                    c, radix);
                return -1;
            }
            value = (uint64_t)digit;
        } else {
            lexer_putback(state, c);
        }
    }

    for (;;) {
        c = lexer_nom(state, true);
        if (c == '\0') {
            return -1;
        }

        /* Digit separator */
        if (c == '_') {
            continue;
        }

        digit = lexer_digit_value(c);
        if (digit < 0) {
            /* Not part of the literal; leave it for the next token */
            lexer_putback(state, c);
            break;
        }

        /* Reject rather than silently drop e.g. "12ab" or "0o19" */
        if ((unsigned int)digit >= radix) {
            trace_error(state, "invalid digit '%c' in base-%u number\n",
                c, radix);
            return -1;
        }

        /* value * radix + digit must not wrap around */
        if (value > (UINT64_MAX - (uint64_t)digit) / radix) {
            trace_error(state, "value exceeds width of u64\n");
            return -1;
        }
        value = (value * radix) + (uint64_t)digit;
    }

    tok->type = TT_NUMBER;
    tok->v = value;
    return 0;
}
/*
 * Scan the next token from the input source file.
 *
 * Whitespace before the token is skipped. Single-character tokens
 * store the character in the result; a '#' must be followed by a
 * preprocessor directive name; digits start a number; anything else
 * is tried as an identifier, keyword or directive.
 *
 * @state: Compiler state
 * @res: Token result
 *
 * Returns zero on success and -1 on bad arguments, when no more
 * input is available, or when no valid token could be scanned.
 */
int
lexer_scan(struct rifle_state *state, struct token *res)
{
    char c;

    if (state == NULL || res == NULL) {
        return -1;
    }

    if ((c = lexer_nom(state, false)) == '\0') {
        return -1;
    }

    switch (c) {
    case '+':
        res->type = TT_PLUS;
        break;
    case '-':
        res->type = TT_MINUS;
        break;
    case '*':
        res->type = TT_STAR;
        break;
    case '/':
        res->type = TT_SLASH;
        break;
    case ':':
        res->type = TT_COLON;
        break;
    case '(':
        res->type = TT_LPAREN;
        break;
    case ')':
        res->type = TT_RPAREN;
        break;
    case '{':
        res->type = TT_LBRACE;
        break;
    case '}':
        res->type = TT_RBRACE;
        break;
    case ';':
        res->type = TT_SEMI;
        break;
    case ',':
        res->type = TT_COMMA;
        break;
    case '\n':
        res->type = TT_NEWLINE;
        break;
    case '#':
        if ((c = lexer_nom(state, false)) == '\0') {
            trace_error(state, "unexpected end of file\n");
            return -1;
        }

        /*
         * A '#' must be followed by a directive name. Failing to scan
         * one is an error; returning success here would hand the
         * caller a token that was never filled in.
         */
        if (lexer_scan_ident(state, c, res) != 0) {
            trace_error(state, "bad preprocessor directive\n");
            return -1;
        }
        return lexer_assert_preproc(state, res);
    default:
        /*
         * The numeric scanner consumes input even when it fails, so
         * its result is final: retrying the same first character as
         * an identifier would lex from the wrong position.
         */
        if (isdigit((unsigned char)c)) {
            return lexer_scan_num(state, c, res);
        }

        if (lexer_scan_ident(state, c, res) == 0) {
            lexer_check_kw(state, res);
            return 0;
        }

        trace_error(state, "unexpected token '%c'\n", c);
        return -1;
    }

    /* Only the single-character tokens reach this point */
    res->c = c;
    return 0;
}