538 lines
10 KiB
C
538 lines
10 KiB
C
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include "rifle/lexer.h"
#include "rifle/trace.h"
|
|
|
|
/*
 * Tell whether a character is whitespace as far as the lexer
 * is concerned.
 *
 * @c: Character to classify
 *
 * Returns true for a tab, carriage return, form feed or space.
 * Note that a newline ('\n') is NOT classified as whitespace here.
 */
static inline bool
lexer_is_ws(char c)
{
    return c == '\t' || c == '\r' || c == '\f' || c == ' ';
}
|
|
|
|
/*
 * Tell whether a character is the letter of a numbering system
 * prefix, i.e. the 'x' of '0x' or the 'o' of '0o'.
 *
 * @c: Character to classify
 *
 * Returns true only for 'x' and 'o' (lowercase).
 */
static inline bool
lexer_is_num_prefix(char c)
{
    return c == 'o' || c == 'x';
}
|
|
|
|
/*
|
|
* Place a given character into the putback buffer
|
|
*
|
|
* @state: Compiler state
|
|
* @c: Character to insert
|
|
*/
|
|
static inline void
|
|
lexer_putback(struct rifle_state *state, char c)
|
|
{
|
|
if (state == NULL) {
|
|
return;
|
|
}
|
|
|
|
state->putback = c;
|
|
}
|
|
|
|
/*
|
|
* Pop a character from the putback buffer
|
|
*
|
|
* @state: Compiler state
|
|
*
|
|
* Returns '\0' on failure
|
|
*/
|
|
static inline char
|
|
lexer_putback_pop(struct rifle_state *state)
|
|
{
|
|
char c;
|
|
|
|
if (state == NULL) {
|
|
return '\0';
|
|
}
|
|
|
|
c = state->putback;
|
|
state->putback = '\0';
|
|
return c;
|
|
}
|
|
|
|
/*
|
|
* Consume a single byte from the input source file
|
|
* while skipping any whitespace characters.
|
|
*
|
|
* @state: Compiler state
|
|
* @keep_ws: If true, preserve whitespace
|
|
*
|
|
* Returns the consumed character on success, otherwise
|
|
* '\0' upon failure.
|
|
*/
|
|
static char
|
|
lexer_nom(struct rifle_state *state, bool keep_ws)
|
|
{
|
|
char c;
|
|
|
|
if (state == NULL) {
|
|
return '\0';
|
|
}
|
|
|
|
/*
|
|
* If there is any data in the putback buffer then
|
|
* we shall take it.
|
|
*/
|
|
if ((c = lexer_putback_pop(state)) != '\0') {
|
|
if (keep_ws && lexer_is_ws(c))
|
|
return c;
|
|
if ((!lexer_is_ws(c)))
|
|
return c;
|
|
}
|
|
|
|
while (read(state->in_fd, &c, 1) > 0) {
|
|
if (lexer_is_ws(c) && !keep_ws) {
|
|
continue;
|
|
}
|
|
|
|
return c;
|
|
}
|
|
|
|
return '\0';
|
|
}
|
|
|
|
/*
|
|
* Scan for an identifier in the input source file
|
|
*
|
|
* @state: Compiler state
|
|
* @lc: Last character
|
|
* @tok: Last token
|
|
*
|
|
* Returns zero on success
|
|
*/
|
|
static int
|
|
lexer_scan_ident(struct rifle_state *state, int lc, struct token *tok)
|
|
{
|
|
char *buf, c;
|
|
size_t bufcap, bufsz;
|
|
|
|
if (state == NULL || tok == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
bufcap = 8;
|
|
bufsz = 0;
|
|
if ((buf = malloc(bufcap)) == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
if (!isalnum(lc) && lc != '.' && lc != '_') {
|
|
return -1;
|
|
}
|
|
|
|
buf[bufsz++] = lc;
|
|
for (;;) {
|
|
c = lexer_nom(state, true);
|
|
if (c == '\0') {
|
|
return -1;
|
|
}
|
|
|
|
if (!isalnum(c) && c != '.' && c != '_') {
|
|
lexer_putback(state, c);
|
|
buf[bufsz] = '\0';
|
|
break;
|
|
}
|
|
|
|
buf[bufsz++] = c;
|
|
if (bufsz >= bufcap - 1) {
|
|
bufcap += 8;
|
|
buf = realloc(buf, bufcap);
|
|
}
|
|
|
|
if (buf == NULL) {
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
tok->type = TT_IDENT;
|
|
tok->s = ptrbox_strdup(&state->ptrbox, buf);
|
|
free(buf);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Check a identifier and potentially override it if it
|
|
* counts as a directive.
|
|
*
|
|
* @state: Compiler state
|
|
* @tok: Token
|
|
*/
|
|
static void
|
|
lexer_check_direc(struct rifle_state *state, struct token *tok)
|
|
{
|
|
if (state == NULL || tok == NULL) {
|
|
return;
|
|
}
|
|
|
|
if (tok->type != TT_IDENT) {
|
|
return;
|
|
}
|
|
|
|
/* Check the character after the '.' prefix */
|
|
switch (tok->s[1]) {
|
|
case 'f':
|
|
if (strcmp(tok->s, ".f") == 0) {
|
|
tok->type = TT_F;
|
|
return;
|
|
}
|
|
|
|
break;
|
|
case 'e':
|
|
if (strcmp(tok->s, ".extern") == 0) {
|
|
tok->type = TT_EXTERN;
|
|
return;
|
|
}
|
|
|
|
break;
|
|
case 's':
|
|
if (strcmp(tok->s, ".struct") == 0) {
|
|
tok->type = TT_STRUCT;
|
|
return;
|
|
}
|
|
|
|
break;
|
|
case 'p':
|
|
if (strcmp(tok->s, ".pub") == 0) {
|
|
tok->type = TT_PUB;
|
|
return;
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Check a identifier and potentially override it if it
|
|
* counts as a reserved keyword.
|
|
*
|
|
* @state: Compiler state
|
|
* @tok: Token
|
|
*/
|
|
static void
|
|
lexer_check_kw(struct rifle_state *state, struct token *tok)
|
|
{
|
|
if (state == NULL || tok == NULL) {
|
|
return;
|
|
}
|
|
|
|
if (tok->type != TT_IDENT) {
|
|
return;
|
|
}
|
|
|
|
switch (*tok->s) {
|
|
case 'u':
|
|
if (strcmp(tok->s, "u8") == 0) {
|
|
tok->type = TT_U8;
|
|
return;
|
|
}
|
|
|
|
if (strcmp(tok->s, "u16") == 0) {
|
|
tok->type = TT_U16;
|
|
return;
|
|
}
|
|
|
|
if (strcmp(tok->s, "u32") == 0) {
|
|
tok->type = TT_U32;
|
|
return;
|
|
}
|
|
|
|
if (strcmp(tok->s, "u64") == 0) {
|
|
tok->type = TT_U64;
|
|
return;
|
|
}
|
|
|
|
break;
|
|
case 'v':
|
|
if (strcmp(tok->s, "void") == 0) {
|
|
tok->type = TT_VOID;
|
|
return;
|
|
}
|
|
|
|
break;
|
|
case 'r':
|
|
if (strcmp(tok->s, "return") == 0) {
|
|
tok->type = TT_RETURN;
|
|
return;
|
|
}
|
|
|
|
break;
|
|
case 'l':
|
|
if (strcmp(tok->s, "loop") == 0) {
|
|
tok->type = TT_LOOP;
|
|
return;
|
|
}
|
|
|
|
break;
|
|
case 'b':
|
|
if (strcmp(tok->s, "break") == 0) {
|
|
tok->type = TT_BREAK;
|
|
return;
|
|
}
|
|
|
|
break;
|
|
case '.':
|
|
lexer_check_direc(state, tok);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Assert that the given identifier token is actually a preprocessor directive.
|
|
* This is only used when a '#' has been encountered before the identifier.
|
|
*
|
|
* @state: Compiler state
|
|
* @tok: Last token
|
|
*
|
|
* Returns zero on success
|
|
*/
|
|
static int
|
|
lexer_assert_preproc(struct rifle_state *state, struct token *tok)
|
|
{
|
|
if (state == NULL || tok == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
if (tok->type != TT_IDENT) {
|
|
return -1;
|
|
}
|
|
|
|
switch (*tok->s) {
|
|
case 'd':
|
|
if (strcmp(tok->s, "define") == 0) {
|
|
tok->type = TT_DEFINE;
|
|
return 0;
|
|
}
|
|
|
|
break;
|
|
case 'i':
|
|
if (strcmp(tok->s, "ifdef") == 0) {
|
|
tok->type = TT_IFDEF;
|
|
return 0;
|
|
}
|
|
|
|
if (strcmp(tok->s, "ifndef") == 0) {
|
|
tok->type = TT_IFNDEF;
|
|
return 0;
|
|
}
|
|
|
|
break;
|
|
case 'e':
|
|
if (strcmp(tok->s, "endif") == 0) {
|
|
tok->type = TT_ENDIF;
|
|
return 0;
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
trace_error(state, "bad preprocessor directive\n");
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* Scan for a series of numbers
|
|
*
|
|
* @state: Compiler state
|
|
* @lc: Last character
|
|
* @tok: Token result
|
|
*/
|
|
static int
|
|
lexer_scan_num(struct rifle_state *state, int lc, struct token *tok)
|
|
{
|
|
char c, buf[22];
|
|
char prefix = '\0';
|
|
size_t buf_ind;
|
|
uint8_t radix;
|
|
|
|
if (state == NULL || tok == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
if (!isdigit(lc)) {
|
|
return -1;
|
|
}
|
|
|
|
/* Obtain the prefix if any */
|
|
if (lc == '0') {
|
|
if ((prefix = lexer_nom(state, false)) == '\0')
|
|
return -1;
|
|
if (!lexer_is_num_prefix(prefix))
|
|
lexer_putback(state, prefix);
|
|
}
|
|
|
|
/*
|
|
* Determine the radix based on the prefix
|
|
*
|
|
* 'x'): Base-16
|
|
* 'o'): Base-8
|
|
*/
|
|
switch (prefix) {
|
|
case 'x':
|
|
if ((lc = lexer_nom(state, false)) == '\0')
|
|
return -1;
|
|
|
|
radix = 16;
|
|
break;
|
|
case 'o':
|
|
if ((lc = lexer_nom(state, false)) == '\0')
|
|
return -1;
|
|
|
|
radix = 8;
|
|
break;
|
|
default:
|
|
radix = 10;
|
|
}
|
|
|
|
buf_ind = 0;
|
|
buf[buf_ind++] = lc;
|
|
|
|
for (;;) {
|
|
c = lexer_nom(state, true);
|
|
if (c == '\0') {
|
|
return -1;
|
|
}
|
|
|
|
if (c == '_') {
|
|
continue;
|
|
}
|
|
|
|
if (!isxdigit(c)) {
|
|
lexer_putback(state, c);
|
|
buf[buf_ind] = '\0';
|
|
break;
|
|
}
|
|
|
|
buf[buf_ind++] = c;
|
|
if (buf_ind >= sizeof(buf) - 1) {
|
|
trace_error(state, "value exceeds width of u64\n");
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
tok->type = TT_NUMBER;
|
|
tok->v = strtoll(buf, NULL, radix);
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
lexer_scan(struct rifle_state *state, struct token *res)
|
|
{
|
|
char c;
|
|
|
|
if (state == NULL || res == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
if ((c = lexer_nom(state, false)) == '\0') {
|
|
return -1;
|
|
}
|
|
|
|
switch (c) {
|
|
case '+':
|
|
res->type = TT_PLUS;
|
|
res->c = c;
|
|
return 0;
|
|
case '-':
|
|
res->type = TT_MINUS;
|
|
res->c = c;
|
|
return 0;
|
|
case '*':
|
|
res->type = TT_STAR;
|
|
res->c = c;
|
|
return 0;
|
|
case '/':
|
|
res->type = TT_SLASH;
|
|
res->c = c;
|
|
return 0;
|
|
case ':':
|
|
res->type = TT_COLON;
|
|
res->c = c;
|
|
return 0;
|
|
case '(':
|
|
res->type = TT_LPAREN;
|
|
res->c = c;
|
|
return 0;
|
|
case ')':
|
|
res->type = TT_RPAREN;
|
|
res->c = c;
|
|
return 0;
|
|
case '{':
|
|
res->type = TT_LBRACE;
|
|
res->c = c;
|
|
return 0;
|
|
case '}':
|
|
res->type = TT_RBRACE;
|
|
res->c = c;
|
|
return 0;
|
|
case ';':
|
|
res->type = TT_SEMI;
|
|
res->c = c;
|
|
return 0;
|
|
case ',':
|
|
res->type = TT_COMMA;
|
|
res->c = c;
|
|
return 0;
|
|
case '\n':
|
|
res->type = TT_NEWLINE;
|
|
res->c = c;
|
|
return 0;
|
|
case '#':
|
|
if ((c = lexer_nom(state, false)) == '\0') {
|
|
trace_error(state, "unexpected end of file\n");
|
|
return -1;
|
|
}
|
|
|
|
/* Assert that we have a preprocessor directive */
|
|
if (lexer_scan_ident(state, c, res) == 0) {
|
|
if (lexer_assert_preproc(state, res) < 0)
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
default:
|
|
if (isdigit(c)) {
|
|
if (lexer_scan_num(state, c, res) == 0)
|
|
return 0;
|
|
}
|
|
|
|
if (lexer_scan_ident(state, c, res) == 0) {
|
|
lexer_check_kw(state, res);
|
|
return 0;
|
|
}
|
|
|
|
trace_error(state, "unexpected token '%c'\n", c);
|
|
break;
|
|
}
|
|
|
|
return -1;
|
|
}
|