Files
quip/frontend/lexer.c
2026-03-25 19:51:07 -04:00

381 lines
7.2 KiB
C

/*
* Copyright (c) 2026, Mirocom Laboratories
* Provided under the BSD-3 clause
*
* Abstract:
* This file implements the lexer.
* Author:
* Ian M. Moffett <ian@mirocom.org>
*/
#include <sys/types.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdbool.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <ctype.h>
#include "frontend/token.h"
#include "frontend/lexer.h"
#include "common/ptrbox.h"
#include "common/trace.h"
#define SHELL_BLOCK_TERMINATE '~'
/*
 * Classify a character as blank space that the lexer may skip.
 *
 * @c: Character to classify
 *
 * Returns true for the space character, horizontal tab, form feed
 * and the alert (bell) character. Every other character, newline
 * included, yields false.
 */
static inline bool
lexer_is_ws(char c)
{
    return c == ' ' || c == '\t' || c == '\f' || c == '\a';
}
/*
 * Store a character in the single-slot lexer putback buffer.
 * Whatever the slot held before is overwritten.
 *
 * @state: Quip state (a NULL state is silently ignored)
 * @c: Character to store
 */
static inline void
lexer_putback(struct quip_state *state, char c)
{
    if (state != NULL) {
        state->lex_putback = c;
    }
}
/*
 * Take the character held in the lexer putback slot and clear
 * the slot.
 *
 * @state: Quip state
 *
 * Returns the stored character. The result is '\0' when the slot
 * holds '\0' or when @state is NULL.
 */
static inline char
lexer_putback_pop(struct quip_state *state)
{
    char pending = '\0';

    if (state != NULL) {
        pending = state->lex_putback;
        state->lex_putback = '\0';
    }

    return pending;
}
/*
 * Fetch the next character of the build source file, going
 * through the lexer-side buffer so that read() is only issued
 * when the buffer has been fully handed out.
 *
 * @state: Quip state machine
 *
 * Returns the fetched character on success. A value of '\0' is
 * returned when @state is NULL, when read() reports end of input,
 * or when read() fails with anything other than EINTR. A NUL byte
 * in the input is therefore indistinguishable from end of input.
 */
static char
lexer_buffer_consume(struct quip_state *state)
{
    ssize_t n;

    if (state == NULL) {
        return '\0';
    }

    for (;;) {
        /*
         * If there is nothing in the lexer-side buffer, fill it
         * with what we can.
         */
        if (state->lex_buf_cap == 0) {
            /*
             * A read interrupted by a signal has transferred
             * nothing and is not end of input; retry it rather
             * than silently truncating the source.
             */
            do {
                n = read(state->in_fd, state->lex_buf, LEX_FILEBUF_LEN);
            } while (n < 0 && errno == EINTR);

            if (n <= 0) {
                return '\0';
            }

            state->lex_buf_cap = n;
        }

        /* Hand out a single character if any are left */
        if (state->lex_buf_i < state->lex_buf_cap) {
            return state->lex_buf[state->lex_buf_i++];
        }

        /* Drained, reset everything and go around for a refill */
        state->lex_buf_cap = 0;
        state->lex_buf_i = 0;
    }
}
/*
 * Produce the next input character, giving priority to a pending
 * putback character over the buffered file contents.
 *
 * @state: Quip state machine
 * @skip_ws: If true, characters accepted by lexer_is_ws() are
 *           dropped (a pending putback character included)
 *
 * Returns the character, or '\0' once the buffered reader has
 * nothing more to give.
 */
static char
lexer_consume(struct quip_state *state, bool skip_ws)
{
    char ch;

    /* A pending putback character wins unless it is to be skipped */
    ch = lexer_putback_pop(state);
    if (ch != '\0' && !(skip_ws && lexer_is_ws(ch))) {
        return ch;
    }

    /* Pull from the file buffer until something is worth returning */
    do {
        ch = lexer_buffer_consume(state);
    } while (ch != '\0' && skip_ws && lexer_is_ws(ch));

    return ch;
}
/*
 * Scan a name and create a token if valid
 *
 * A name starts with a letter, '_' or '-' and continues with
 * letters, digits, '_' or '-'. The first character that does not
 * belong to the name is handed to lexer_putback().
 *
 * @state: Quip state
 * @lc: Last character (already consumed, first character of the name)
 * @tokres: Token result written here (type TT_NAME, text in 's')
 *
 * Returns zero on success, otherwise -1 on bad arguments, when
 * @lc cannot start a name, or when memory runs out.
 */
static int
lexer_scan_name(struct quip_state *state, int lc, struct token *tokres)
{
    char c, *buf, *tmp;
    size_t bufsz, bufcap;

    if (state == NULL || tokres == NULL) {
        return -1;
    }

    /*
     * The <ctype.h> classifiers require a value representable as
     * unsigned char; a plain char may be negative.
     */
    if (!isalpha((unsigned char)lc) && lc != '_' && lc != '-') {
        return -1;
    }

    bufsz = 0;
    bufcap = 8;
    if ((buf = malloc(bufcap)) == NULL) {
        return -1;
    }

    buf[bufsz++] = (char)lc;
    for (;;) {
        c = lexer_consume(state, false);
        if (!isalnum((unsigned char)c) && c != '_' && c != '-') {
            lexer_putback(state, c);
            break;
        }

        buf[bufsz++] = c;

        /* Always keep one spare byte for the terminator */
        if (bufsz >= bufcap - 1) {
            bufcap += 8;
            tmp = realloc(buf, bufcap);
            if (tmp == NULL) {
                /* realloc left the old block alive; release it */
                free(buf);
                return -1;
            }
            buf = tmp;
        }
    }

    buf[bufsz] = '\0';
    tokres->type = TT_NAME;
    tokres->s = ptrbox_strdup(&state->ptrbox, buf);
    free(buf);

    /*
     * NOTE(review): assumes ptrbox_strdup() signals failure by
     * returning NULL -- confirm against common/ptrbox.h.
     */
    return (tokres->s != NULL) ? 0 : -1;
}
/*
 * Scan for a directive token
 *
 * Reads the name that follows the directive marker (whitespace in
 * front of it is skipped) and maps "cc" to TT_CC and "ld" to TT_LD.
 * If the first non-blank character is not a letter it has already
 * been consumed and is not put back.
 *
 * @state: Quip state
 * @tokres: Token result is written here
 *
 * Returns zero on success, otherwise -1 on bad arguments, end of
 * input, a scan failure, or a name that is not a known directive
 * (in which case @tokres is left as filled in by lexer_scan_name()).
 */
static int
lexer_scan_directive(struct quip_state *state, struct token *tokres)
{
    char c;

    if (state == NULL || tokres == NULL) {
        return -1;
    }

    if ((c = lexer_consume(state, true)) == '\0') {
        return -1;
    }

    /* <ctype.h> needs an unsigned char value; plain char may be negative */
    if (!isalpha((unsigned char)c)) {
        return -1;
    }

    /* A missing string is treated like any other scan failure */
    if (lexer_scan_name(state, c, tokres) < 0 || tokres->s == NULL) {
        trace_error(state, "fatal: error scanning directive\n");
        return -1;
    }

    if (strcmp(tokres->s, "cc") == 0) {
        tokres->type = TT_CC;
        return 0;
    }

    if (strcmp(tokres->s, "ld") == 0) {
        tokres->type = TT_LD;
        return 0;
    }

    return -1;
}
/*
 * Scan a shell block
 *
 * Collects every character up to SHELL_BLOCK_TERMINATE into a
 * single string. Each newline is stored as one space, and
 * whitespace at the start of the block and at the start of every
 * following line is dropped.
 *
 * @state: Quip state
 * @tokres: Token result is written here (type TT_SHELLBLOCK,
 *          text in 's')
 *
 * Returns zero on success, otherwise -1 on bad arguments, when
 * input ends before the terminator, or when memory runs out.
 */
static int
lexer_scan_shellblock(struct quip_state *state, struct token *tokres)
{
    char c, *buf, *tmp;
    size_t bufsz, bufcap;
    bool is_leading = true;

    if (state == NULL || tokres == NULL) {
        return -1;
    }

    bufsz = 0;
    bufcap = 8;
    if ((buf = malloc(bufcap)) == NULL) {
        return -1;
    }

    for (;;) {
        if ((c = lexer_consume(state, false)) == '\0') {
            trace_error(state, "unexpected end of file within shellblock\n");
            free(buf);
            return -1;
        }

        /*
         * Grow before touching the buffer so that one more byte
         * (a character or the terminator) always fits.
         */
        if (bufsz >= bufcap) {
            bufcap += 8;
            tmp = realloc(buf, bufcap);
            if (tmp == NULL) {
                /* realloc left the old block alive; release it */
                free(buf);
                return -1;
            }
            buf = tmp;
        }

        /* A newline becomes a space and starts a fresh line */
        if (c == '\n') {
            buf[bufsz++] = ' ';
            is_leading = true;
            continue;
        }

        if (c == SHELL_BLOCK_TERMINATE) {
            buf[bufsz] = '\0';
            break;
        }

        /* Drop indentation in front of the first real character */
        if (is_leading && lexer_is_ws(c)) {
            continue;
        }

        is_leading = false;
        buf[bufsz++] = c;
    }

    tokres->type = TT_SHELLBLOCK;
    tokres->s = ptrbox_strdup(&state->ptrbox, buf);
    free(buf);

    /*
     * NOTE(review): assumes ptrbox_strdup() signals failure by
     * returning NULL -- confirm against common/ptrbox.h.
     */
    return (tokres->s != NULL) ? 0 : -1;
}
/*
 * Discard the remainder of a source code comment: everything up
 * to and including the next newline, or up to the point where no
 * more input is available.
 *
 * @state: Quip state
 */
static void
lexer_skip_comment(struct quip_state *state)
{
    char ch;

    do {
        ch = lexer_consume(state, false);
    } while (ch != '\n' && ch != '\0');
}
/*
 * Scan the next token from the input
 *
 * Leading whitespace is skipped, then the first character picks
 * the token kind:
 *
 *   '\n'   TT_NEWLINE
 *   '.'    directive, see lexer_scan_directive()
 *   '#'    TT_COMMENT, rest of the line is discarded
 *   ':'    TT_COLON, or a shell block when followed by another ':'
 *   other  a name, see lexer_scan_name()
 *
 * @state: Quip state
 * @tokres: Token result is written here
 *
 * Returns zero on success, otherwise -1 on bad arguments, when no
 * more input is available, or when no token could be formed.
 */
int
lexer_scan(struct quip_state *state, struct token *tokres)
{
    char first, next;

    if (state == NULL || tokres == NULL) {
        return -1;
    }

    first = lexer_consume(state, true);
    if (first == '\0') {
        return -1;
    }

    if (first == '\n') {
        tokres->type = TT_NEWLINE;
        tokres->c = first;
        return 0;
    }

    if (first == '.') {
        return (lexer_scan_directive(state, tokres) != 0) ? -1 : 0;
    }

    if (first == '#') {
        tokres->type = TT_COMMENT;
        tokres->c = first;
        lexer_skip_comment(state);
        return 0;
    }

    if (first == ':') {
        tokres->c = first;

        /* A second colon right behind the first opens a shell block */
        next = lexer_consume(state, false);
        if (next == ':') {
            return lexer_scan_shellblock(state, tokres);
        }

        lexer_putback(state, next);
        tokres->type = TT_COLON;
        return 0;
    }

    /* Anything else has to be a name */
    if (lexer_scan_name(state, first, tokres) == 0) {
        return 0;
    }

    trace_error(state, "unknown token %c\n", first);
    return -1;
}