381 lines
7.2 KiB
C
381 lines
7.2 KiB
C
/*
|
|
* Copyright (c) 2026, Mirocom Laboratories
|
|
* Provided under the BSD-3 clause
|
|
*
|
|
* Abstract:
|
|
* This file implements the lexer.
|
|
* Author:
|
|
* Ian M. Moffett <ian@mirocom.org>
|
|
*/
|
|
|
|
#include <sys/types.h>
|
|
#include <stddef.h>
|
|
#include <stdlib.h>
|
|
#include <stdbool.h>
|
|
#include <unistd.h>
|
|
#include <string.h>
|
|
#include <errno.h>
|
|
#include <ctype.h>
|
|
#include "frontend/token.h"
|
|
#include "frontend/lexer.h"
|
|
#include "common/ptrbox.h"
|
|
#include "common/trace.h"
|
|
|
|
#define SHELL_BLOCK_TERMINATE '~'
|
|
|
|
/*
 * Test if a given character counts as a whitespace character
 *
 * A newline is deliberately not whitespace here: lexer_scan()
 * reports it as a token of its own.
 *
 * @c: Character to test
 *
 * Returns true if `c' is whitespace, otherwise false.
 */
static inline bool
lexer_is_ws(char c)
{
    switch (c) {
    case ' ':
    case '\t':
    case '\v':
    case '\f':
    case '\r':  /* Lets sources with CRLF line endings lex cleanly */
    case '\a':  /* Historically accepted; kept for compatibility */
        return true;
    default:
        return false;
    }
}
|
|
|
|
/*
|
|
* Place a character in the lexer putback buffer
|
|
*
|
|
* @state: Quip state
|
|
* @c: Character to insert
|
|
*/
|
|
static inline void
|
|
lexer_putback(struct quip_state *state, char c)
|
|
{
|
|
if (state == NULL) {
|
|
return;
|
|
}
|
|
|
|
state->lex_putback = c;
|
|
}
|
|
|
|
/*
|
|
* Pop the last character from the lexer putback
|
|
* buffer
|
|
*/
|
|
static inline char
|
|
lexer_putback_pop(struct quip_state *state)
|
|
{
|
|
char c;
|
|
|
|
if (state == NULL) {
|
|
return '\0';
|
|
}
|
|
|
|
c = state->lex_putback;
|
|
state->lex_putback = '\0';
|
|
return c;
|
|
}
|
|
|
|
/*
|
|
* Consume a single character in a buffered manner from the
|
|
* build source file.
|
|
*
|
|
* @state: Quip state machine
|
|
*
|
|
* Returns the fetched character on success, otherwise a value
|
|
* of '\0' on failure.
|
|
*/
|
|
static char
|
|
lexer_buffer_consume(struct quip_state *state)
|
|
{
|
|
ssize_t n;
|
|
|
|
if (state == NULL) {
|
|
return '\0';
|
|
}
|
|
|
|
/*
|
|
* If there is nothing in the lexer-side buffer, fill it
|
|
* with what we can.
|
|
*/
|
|
if (state->lex_buf_cap == 0) {
|
|
n = read(state->in_fd, state->lex_buf, LEX_FILEBUF_LEN);
|
|
if (n <= 0)
|
|
return '\0';
|
|
|
|
state->lex_buf_cap = n;
|
|
}
|
|
|
|
/* Grab a single character if not empty */
|
|
if (state->lex_buf_i < state->lex_buf_cap) {
|
|
return state->lex_buf[state->lex_buf_i++];
|
|
}
|
|
|
|
/* Empty, reset everything and try again */
|
|
state->lex_buf_cap = 0;
|
|
state->lex_buf_i = 0;
|
|
return lexer_buffer_consume(state);
|
|
}
|
|
|
|
/*
 * Fetch the next character, optionally discarding whitespace
 *
 * The putback slot is drained first; if it is empty, or it held
 * whitespace that is being skipped, input continues from the
 * buffered source.
 *
 * @state: Quip state machine
 * @skip_ws: If true, skip whitespace
 *
 * Returns the character, or '\0' when no more input is available.
 */
static char
lexer_consume(struct quip_state *state, bool skip_ws)
{
    char ch;

    /* A pending putback character takes priority over fresh input */
    ch = lexer_putback_pop(state);
    if (ch != '\0' && !(skip_ws && lexer_is_ws(ch))) {
        return ch;
    }

    /* Keep pulling until the input ends or a character is acceptable */
    do {
        ch = lexer_buffer_consume(state);
    } while (ch != '\0' && skip_ws && lexer_is_ws(ch));

    return ch;
}
|
|
|
|
/*
|
|
* Scan a name and create a token if valid
|
|
*
|
|
* @state: Quip state
|
|
* @lc: Last character
|
|
* @tokres: Token result written here
|
|
*
|
|
* Returns zero on success
|
|
*/
|
|
static int
|
|
lexer_scan_name(struct quip_state *state, int lc, struct token *tokres)
|
|
{
|
|
char c, *buf;
|
|
size_t bufsz, bufcap;
|
|
|
|
if (state == NULL || tokres == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
if (!isalpha(lc) && lc != '_' && lc != '-') {
|
|
return -1;
|
|
}
|
|
|
|
bufsz = 0;
|
|
bufcap = 8;
|
|
if ((buf = malloc(bufcap)) == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
buf[bufsz++] = lc;
|
|
for (;;) {
|
|
c = lexer_consume(state, false);
|
|
if (!isalnum(c) && c != '_' && c != '-') {
|
|
buf[bufsz] = '\0';
|
|
lexer_putback(state, c);
|
|
break;
|
|
}
|
|
|
|
buf[bufsz++] = c;
|
|
if (bufsz >= bufcap - 1) {
|
|
bufcap += 8;
|
|
buf = realloc(buf, bufcap);
|
|
}
|
|
|
|
if (buf == NULL) {
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
tokres->type = TT_NAME;
|
|
tokres->s = ptrbox_strdup(&state->ptrbox, buf);
|
|
free(buf);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Scan for a directive token
|
|
*
|
|
* @state: Quip state
|
|
* @tokres: Token result is written here
|
|
*
|
|
* Returns zero on success
|
|
*/
|
|
static int
|
|
lexer_scan_directive(struct quip_state *state, struct token *tokres)
|
|
{
|
|
char c;
|
|
|
|
if ((c = lexer_consume(state, true)) == '\0') {
|
|
return -1;
|
|
}
|
|
|
|
if (!isalpha(c)) {
|
|
return -1;
|
|
}
|
|
|
|
if (lexer_scan_name(state, c, tokres) < 0) {
|
|
trace_error(state, "fatal: error scanning directive\n");
|
|
return -1;
|
|
}
|
|
|
|
switch (*tokres->s) {
|
|
case 'c':
|
|
if (strcmp(tokres->s, "cc") == 0) {
|
|
tokres->type = TT_CC;
|
|
return 0;
|
|
}
|
|
|
|
break;
|
|
case 'l':
|
|
if (strcmp(tokres->s, "ld") == 0) {
|
|
tokres->type = TT_LD;
|
|
return 0;
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* Scan a shell block
|
|
*
|
|
* @state: Quip state
|
|
* @tokres: Token result is written here
|
|
*
|
|
* Returns zero on success
|
|
*/
|
|
static int
|
|
lexer_scan_shellblock(struct quip_state *state, struct token *tokres)
|
|
{
|
|
char c, *buf;
|
|
size_t bufsz, bufcap;
|
|
bool is_leading = true;
|
|
|
|
bufsz = 0;
|
|
bufcap = 8;
|
|
if ((buf = malloc(bufcap)) == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
for (;;) {
|
|
if ((c = lexer_consume(state, false)) == '\0') {
|
|
trace_error(state, "unexpected end of file within shellblock\n");
|
|
free(buf);
|
|
return -1;
|
|
}
|
|
|
|
/* Handle overflow if needed */
|
|
if (bufsz > bufcap - 1) {
|
|
bufcap += 8;
|
|
buf = realloc(buf, bufcap);
|
|
}
|
|
|
|
/* Handle newlines */
|
|
if (c == '\n') {
|
|
buf[bufsz++] = ' ';
|
|
is_leading = true;
|
|
continue;
|
|
}
|
|
|
|
if (c == SHELL_BLOCK_TERMINATE) {
|
|
buf[bufsz] = '\0';
|
|
break;
|
|
}
|
|
|
|
if (is_leading && lexer_is_ws(c)) {
|
|
continue;
|
|
}
|
|
|
|
is_leading = false;
|
|
buf[bufsz++] = c;
|
|
if (buf == NULL) {
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
tokres->type = TT_SHELLBLOCK;
|
|
tokres->s = ptrbox_strdup(&state->ptrbox, buf);
|
|
free(buf);
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Discard the remainder of a source code comment
 *
 * Input is consumed up to and including the next newline, or
 * until no more input is available.
 *
 * @state: Quip state
 */
static void
lexer_skip_comment(struct quip_state *state)
{
    char ch;

    do {
        ch = lexer_consume(state, false);
    } while (ch != '\n' && ch != '\0');
}
|
|
|
|
int
|
|
lexer_scan(struct quip_state *state, struct token *tokres)
|
|
{
|
|
char c;
|
|
|
|
if (state == NULL || tokres == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
if ((c = lexer_consume(state, true)) == '\0') {
|
|
return -1;
|
|
}
|
|
|
|
switch (c) {
|
|
case '\n':
|
|
tokres->type = TT_NEWLINE;
|
|
tokres->c = c;
|
|
return 0;
|
|
case '.':
|
|
if (lexer_scan_directive(state, tokres) != 0) {
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
case '#':
|
|
tokres->type = TT_COMMENT;
|
|
tokres->c = c;
|
|
lexer_skip_comment(state);
|
|
return 0;
|
|
case ':':
|
|
tokres->c = c;
|
|
if ((c = lexer_consume(state, false)) == ':') {
|
|
return lexer_scan_shellblock(state, tokres);
|
|
}
|
|
|
|
lexer_putback(state, c);
|
|
tokres->type = TT_COLON;
|
|
return 0;
|
|
default:
|
|
if (lexer_scan_name(state, c, tokres) == 0) {
|
|
return 0;
|
|
}
|
|
|
|
trace_error(state, "unknown token %c\n", c);
|
|
break;
|
|
}
|
|
|
|
return -1;
|
|
}
|