/* rifle/core/lexer.c */

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include "rifle/lexer.h"
#include "rifle/trace.h"

/*
 * Tell whether a character is treated as skippable whitespace
 * by the lexer: tab, carriage return, form feed or space.
 *
 * A newline is deliberately not in this set; it is not matched
 * here and therefore never skipped as whitespace.
 *
 * @c: Character to classify
 *
 * Returns true if `c' is whitespace, otherwise false
 */
static inline bool
lexer_is_ws(char c)
{
    return c == '\t' || c == '\r' || c == '\f' || c == ' ';
}

/*
 * Tell whether a character is the letter of a numbering system
 * prefix, i.e. the 'x' in '0x' or the 'o' in '0o'.
 *
 * @c: Character to classify
 *
 * Returns true if `c' is a prefix letter, otherwise false
 */
static inline bool
lexer_is_num_prefix(char c)
{
    return c == 'o' || c == 'x';
}

/*
 * Store a character in the single-slot putback buffer so that
 * it can be handed out again later. Any character already held
 * in the slot is overwritten.
 *
 * @state: Compiler state (ignored if NULL)
 * @c:     Character to store
 */
static inline void
lexer_putback(struct rifle_state *state, char c)
{
    if (state != NULL) {
        state->putback = c;
    }
}

/*
 * Take the character out of the putback buffer, leaving the
 * buffer empty ('\0') afterwards.
 *
 * @state: Compiler state
 *
 * Returns the buffered character, or '\0' if `state' is NULL
 * or the buffer was empty.
 */
static inline char
lexer_putback_pop(struct rifle_state *state)
{
    char pending = '\0';

    if (state != NULL) {
        pending = state->putback;
        state->putback = '\0';
    }

    return pending;
}

/*
 * Fetch the next byte of input, optionally skipping over
 * whitespace characters.
 *
 * The putback buffer is always drained first. A buffered
 * character is returned unless it is whitespace that the
 * caller asked to skip, in which case it is discarded and
 * reading continues from the input file descriptor.
 *
 * @state:   Compiler state
 * @keep_ws: If true, whitespace is returned rather than skipped
 *
 * Returns the consumed character on success, otherwise '\0'
 * when `state' is NULL or no more bytes could be read.
 */
static char
lexer_nom(struct rifle_state *state, bool keep_ws)
{
    char pending, byte;

    if (state == NULL) {
        return '\0';
    }

    /* Anything sitting in the putback buffer takes priority */
    pending = lexer_putback_pop(state);
    if (pending != '\0' && (keep_ws || !lexer_is_ws(pending))) {
        return pending;
    }

    for (;;) {
        if (read(state->in_fd, &byte, 1) <= 0) {
            return '\0';
        }

        if (keep_ws || !lexer_is_ws(byte)) {
            return byte;
        }
    }
}

/*
 * Scan for an identifier in the input source file
 *
 * An identifier is a run of alphanumeric characters, '.' and '_'.
 * The first character that does not belong to the identifier is
 * placed into the putback buffer for the next read.
 *
 * @state: Compiler state
 * @lc:    Last character (first character of the identifier)
 * @tok:   Token result; on success its type is TT_IDENT and `s'
 *         holds a copy of the identifier made by ptrbox_strdup()
 *
 * Returns zero on success, otherwise -1 when an argument is NULL,
 * `lc' cannot start an identifier, the input ends before a
 * terminating character is seen, or memory cannot be obtained.
 */
static int
lexer_scan_ident(struct rifle_state *state, int lc, struct token *tok)
{
    char *buf, *grown, c;
    size_t bufcap, bufsz;

    if (state == NULL || tok == NULL) {
        return -1;
    }

    /*
     * Validate the leading character before allocating so the
     * rejection path has nothing to release. The cast keeps the
     * <ctype.h> argument within the range of unsigned char.
     */
    if (!isalnum((unsigned char)lc) && lc != '.' && lc != '_') {
        return -1;
    }

    bufcap = 8;
    bufsz = 0;
    if ((buf = malloc(bufcap)) == NULL) {
        return -1;
    }

    buf[bufsz++] = (char)lc;
    for (;;) {
        c = lexer_nom(state, true);
        if (c == '\0') {
            free(buf);
            return -1;
        }

        if (!isalnum((unsigned char)c) && c != '.' && c != '_') {
            lexer_putback(state, c);
            break;
        }

        buf[bufsz++] = c;

        /* Always keep room for one more character plus the NUL */
        if (bufsz >= bufcap - 1) {
            bufcap += 8;
            grown = realloc(buf, bufcap);
            if (grown == NULL) {
                /* realloc() leaves the old block intact on failure */
                free(buf);
                return -1;
            }

            buf = grown;
        }
    }

    buf[bufsz] = '\0';
    tok->s = ptrbox_strdup(&state->ptrbox, buf);
    free(buf);

    /*
     * NOTE(review): ptrbox_strdup() is presumed to return NULL on
     * failure; callers dereference tok->s, so refuse a NULL here.
     */
    if (tok->s == NULL) {
        return -1;
    }

    tok->type = TT_IDENT;
    return 0;
}

/*
 * Promote an identifier token to a directive token when its
 * text is one of the '.'-prefixed directives: ".f", ".extern",
 * ".struct" or ".pub". Any other identifier is left untouched.
 *
 * @state: Compiler state
 * @tok:   Token to inspect and possibly retype
 */
static void
lexer_check_direc(struct rifle_state *state, struct token *tok)
{
    const char *name;

    if (state == NULL || tok == NULL || tok->type != TT_IDENT) {
        return;
    }

    name = tok->s;
    if (strcmp(name, ".f") == 0) {
        tok->type = TT_F;
    } else if (strcmp(name, ".extern") == 0) {
        tok->type = TT_EXTERN;
    } else if (strcmp(name, ".struct") == 0) {
        tok->type = TT_STRUCT;
    } else if (strcmp(name, ".pub") == 0) {
        tok->type = TT_PUB;
    }
}

/*
 * Promote an identifier token to a keyword token when its text
 * matches a reserved word. Identifiers beginning with '.' are
 * handed to lexer_check_direc() to be checked as directives.
 * Any other identifier is left untouched.
 *
 * @state: Compiler state
 * @tok:   Token to inspect and possibly retype
 */
static void
lexer_check_kw(struct rifle_state *state, struct token *tok)
{
    /* Reserved words and the token type each one maps to */
    static const struct {
        const char *word;
        int type;
    } keywords[] = {
        { "u8",     TT_U8     },
        { "u16",    TT_U16    },
        { "u32",    TT_U32    },
        { "u64",    TT_U64    },
        { "void",   TT_VOID   },
        { "return", TT_RETURN },
        { "loop",   TT_LOOP   },
        { "break",  TT_BREAK  }
    };
    size_t i;

    if (state == NULL || tok == NULL) {
        return;
    }

    if (tok->type != TT_IDENT) {
        return;
    }

    for (i = 0; i < sizeof(keywords) / sizeof(keywords[0]); ++i) {
        if (strcmp(tok->s, keywords[i].word) == 0) {
            tok->type = keywords[i].type;
            return;
        }
    }

    /* No keyword starts with '.', so this cannot shadow one */
    if (*tok->s == '.') {
        lexer_check_direc(state, tok);
    }
}

/*
 * Require that an identifier token names a preprocessor directive
 * ("define", "ifdef", "ifndef" or "endif") and retype the token
 * accordingly. Used for the identifier that follows a '#'.
 *
 * @state: Compiler state
 * @tok:   Last token
 *
 * Returns zero on success. Returns -1 if an argument is NULL or
 * the token is not an identifier; returns -1 and reports an error
 * if the identifier is not a known directive.
 */
static int
lexer_assert_preproc(struct rifle_state *state, struct token *tok)
{
    const char *name;

    if (state == NULL || tok == NULL) {
        return -1;
    }

    if (tok->type != TT_IDENT) {
        return -1;
    }

    name = tok->s;
    if (strcmp(name, "define") == 0) {
        tok->type = TT_DEFINE;
        return 0;
    }

    if (strcmp(name, "ifdef") == 0) {
        tok->type = TT_IFDEF;
        return 0;
    }

    if (strcmp(name, "ifndef") == 0) {
        tok->type = TT_IFNDEF;
        return 0;
    }

    if (strcmp(name, "endif") == 0) {
        tok->type = TT_ENDIF;
        return 0;
    }

    trace_error(state, "bad preprocessor directive\n");
    return -1;
}

/*
 * Returns true if the given character is a valid digit
 * in the given radix (8, 16, anything else means 10)
 */
static inline bool
lexer_is_radix_digit(char c, int radix)
{
    switch (radix) {
    case 8:
        return c >= '0' && c <= '7';
    case 16:
        return isxdigit((unsigned char)c) != 0;
    default:
        return isdigit((unsigned char)c) != 0;
    }
}

/*
 * Scan for a series of numbers
 *
 * Accepts decimal numbers as well as '0x' (base-16) and '0o'
 * (base-8) prefixed numbers. A '_' between digits is skipped and
 * acts purely as a visual separator. The first character that is
 * not a digit of the active radix ends the number and is placed
 * into the putback buffer.
 *
 * Whitespace is never skipped while inside a number, so that
 * separate tokens such as "0 5" are not merged together.
 *
 * @state: Compiler state
 * @lc:    Last character (first digit of the number)
 * @tok:   Token result; on success its type is TT_NUMBER and
 *         `v' holds the parsed value
 *
 * Returns zero on success, otherwise -1 when an argument is NULL,
 * `lc' is not a digit, the input ends inside the number, a prefix
 * is not followed by a digit, or the value is too large.
 */
static int
lexer_scan_num(struct rifle_state *state, int lc, struct token *tok)
{
    char c, buf[22];
    char prefix = '\0';
    unsigned long long value;
    size_t buf_ind;
    int radix;

    if (state == NULL || tok == NULL) {
        return -1;
    }

    if (!isdigit((unsigned char)lc)) {
        return -1;
    }

    /* Obtain the prefix if any */
    if (lc == '0') {
        if ((prefix = lexer_nom(state, true)) == '\0')
            return -1;
        if (!lexer_is_num_prefix(prefix)) {
            lexer_putback(state, prefix);
            prefix = '\0';
        }
    }

    /*
     * Determine the radix based on the prefix
     *
     * 'x'): Base-16
     * 'o'): Base-8
     */
    switch (prefix) {
    case 'x':
        radix = 16;
        break;
    case 'o':
        radix = 8;
        break;
    default:
        radix = 10;
        break;
    }

    /*
     * A prefix replaces the leading '0', so the first real digit
     * must follow it immediately and be valid for the radix.
     */
    if (prefix != '\0') {
        if ((c = lexer_nom(state, true)) == '\0')
            return -1;

        if (!lexer_is_radix_digit(c, radix)) {
            lexer_putback(state, c);
            trace_error(state, "expected digit after numeric prefix\n");
            return -1;
        }

        lc = c;
    }

    buf_ind = 0;
    buf[buf_ind++] = (char)lc;

    for (;;) {
        c = lexer_nom(state, true);
        if (c == '\0') {
            return -1;
        }

        if (c == '_') {
            continue;
        }

        if (!lexer_is_radix_digit(c, radix)) {
            lexer_putback(state, c);
            break;
        }

        buf[buf_ind++] = c;
        if (buf_ind >= sizeof(buf) - 1) {
            trace_error(state, "value exceeds width of u64\n");
            return -1;
        }
    }

    buf[buf_ind] = '\0';

    /* Unsigned conversion so the full u64 range is representable */
    errno = 0;
    value = strtoull(buf, NULL, radix);
    if (errno == ERANGE) {
        trace_error(state, "value exceeds width of u64\n");
        return -1;
    }

    tok->type = TT_NUMBER;
    tok->v = value;
    return 0;
}

/*
 * Scan the next token from the input source file
 *
 * Leading whitespace is skipped. Single character tokens have
 * their type and `c' field set. A '#' must be followed by a
 * preprocessor directive name. Anything else is scanned as a
 * number when it starts with a digit, otherwise as an identifier
 * which is then checked against keywords and directives.
 *
 * @state: Compiler state
 * @res:   Token result, only valid when zero is returned
 *
 * Returns zero on success, otherwise -1 when an argument is NULL,
 * no more input is available or no valid token could be scanned.
 */
int
lexer_scan(struct rifle_state *state, struct token *res)
{
    char c;

    if (state == NULL || res == NULL) {
        return -1;
    }

    if ((c = lexer_nom(state, false)) == '\0') {
        return -1;
    }

    switch (c) {
    case '+':
        res->type = TT_PLUS;
        break;
    case '-':
        res->type = TT_MINUS;
        break;
    case '*':
        res->type = TT_STAR;
        break;
    case '/':
        res->type = TT_SLASH;
        break;
    case ':':
        res->type = TT_COLON;
        break;
    case '(':
        res->type = TT_LPAREN;
        break;
    case ')':
        res->type = TT_RPAREN;
        break;
    case '{':
        res->type = TT_LBRACE;
        break;
    case '}':
        res->type = TT_RBRACE;
        break;
    case ';':
        res->type = TT_SEMI;
        break;
    case ',':
        res->type = TT_COMMA;
        break;
    case '\n':
        res->type = TT_NEWLINE;
        break;
    case '#':
        if ((c = lexer_nom(state, false)) == '\0') {
            trace_error(state, "unexpected end of file\n");
            return -1;
        }

        /*
         * A '#' that is not followed by an identifier must not be
         * reported as success: `res' would be left unfilled.
         */
        if (lexer_scan_ident(state, c, res) != 0) {
            trace_error(state, "bad preprocessor directive\n");
            return -1;
        }

        /* Assert that we have a preprocessor directive */
        if (lexer_assert_preproc(state, res) < 0) {
            return -1;
        }

        return 0;
    default:
        /*
         * Once a number has started, a scan failure is final;
         * rescanning the consumed digit as an identifier would
         * hide the error and yield a bogus token.
         */
        if (isdigit((unsigned char)c)) {
            return (lexer_scan_num(state, c, res) == 0) ? 0 : -1;
        }

        if (lexer_scan_ident(state, c, res) == 0) {
            lexer_check_kw(state, res);
            return 0;
        }

        trace_error(state, "unexpected token '%c'\n", c);
        return -1;
    }

    /* Single character token: record the character itself */
    res->c = c;
    return 0;
}