Thun/implementations/uvm-ncc/parser.c

389 lines
8.6 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <uvm/syscalls.h>
#include <string.h>
//
// Simple cons list.
//
#define HEAP_SIZE 1024
u32 heads[HEAP_SIZE];
u32 tails[HEAP_SIZE];
u32 free = 1; // cell 0 is reserved so that 0 can be the empty list.
#define TYPE_OF(pointer) (pointer >> 30)
#define VALUE_OF(pointer) (pointer & 0x3fffffff)
#define JOY_VALUE(type, value) ((type & 3) << 30) | (value & 0x3fffffff)
u32 empty_list = 0;
u8 joyList = 0;
u8 joyInt = 1;
u8 joySymbol = 2;
u8 joyBool = 3;
u32
cons(u32 head, u32 tail)
{
if (free >= HEAP_SIZE)
return -1;
heads[free] = head;
tails[free] = tail;
u32 cell = JOY_VALUE(joyList, free);
++free;
return cell;
}
u32 head(u32 list) { return heads[VALUE_OF(list)]; }
u32 tail(u32 list) { return tails[VALUE_OF(list)]; }
/*u32 cons(u32 head, u32 tail) { return tail; }*/
// And now for a hash table.
// https://benhoyt.com/writings/hash-table-in-c/#hash-tables
// https://en.wikipedia.org/wiki/FowlerNollVo_hash_function
#define FNV_OFFSET 0xcbf29ce484222325
#define FNV_PRIME 0x100000001b3
u64
hash_key(char* key)
{
u64 hash = FNV_OFFSET;
for (char* p = key; *p; ++p) {
hash = hash ^ (u64)(unsigned char)(*p);
hash = hash * FNV_PRIME;
}
return hash;
}
// Capacity is a power of two (10 for now.)
#define EXPONENT 10
#define CAPACITY 1024
#define HASH_MASK 1023
char* hash_table[CAPACITY];
u32
ht_insert(char *symbol)
{
u64 hash = hash_key(symbol);
u32 index = hash % CAPACITY;
char *candidate = hash_table[index];
if (!candidate) {
//print_str("interning ");print_str(symbol);print_endl();
hash_table[index] = symbol;
return JOY_VALUE(joySymbol, VALUE_OF(hash));
}
// https://en.wikipedia.org/wiki/Double_hashing
// Rather than use another hash function I'm going to try
// using the extra bits of the same hash.
u32 increment = ((VALUE_OF(hash) >> EXPONENT) | 1) % CAPACITY;
// If I understand correctly, making the increment odd
// means it will traverse the whole (even-sized) table.
while (candidate) {
// Compare pointers then hashes (since we already have
// one hash I'm guessing that that's cheaper or at least
// no more expensive than string comparision.)
if (candidate == symbol || hash == hash_key(candidate))
break;
index = (index + increment) % CAPACITY;
candidate = hash_table[index];
}
if (!candidate) {
//print_str("interning ");print_str(symbol);print_endl();
hash_table[index] = symbol;
}
return JOY_VALUE(joySymbol, VALUE_OF(hash));
}
char*
ht_lookup(u32 hash)
{
// Note that hash will be truncated to N (N=30 as it happens) bits
// by VALUE_OF().
u32 index = hash % CAPACITY;
char *candidate = hash_table[index];
u32 increment = ((hash >> EXPONENT) | 1) % CAPACITY;
while (candidate) {
if (hash == VALUE_OF(hash_key(candidate)))
return candidate;
index = (index + increment) % CAPACITY;
candidate = hash_table[index];
}
/*error = UNKNOWN_WORD_ERROR;*/
return 0;
}
//
// Simple string storage heap.
//
#define STRING_HEAP_SIZE 100000
char string_heap[STRING_HEAP_SIZE];
u32 string_heap_top = 0;
char*
allocate_string(char *buffer, u32 offset, u32 length)
{
u64 end = string_heap_top + length + 1;
if (end >= STRING_HEAP_SIZE)
return 0;
memcpy(string_heap + string_heap_top, buffer + offset, length);
string_heap[end] = '\0';
u32 new_string = string_heap_top;
string_heap_top = (u32)end + 1;
//print_str("allocating ");print_str(string_heap + new_string);print_endl();
return string_heap + new_string;
}
u32
push_symbol(char *symbol, u32 stack)
{
return cons(JOY_VALUE(joySymbol, ht_insert(symbol)), stack);
}
char* LEFT_BRACKET_symbol = "[";
char* RIGHT_BRACKET_symbol = "]";
u32 LEFT_BRACKET;
u32 RIGHT_BRACKET;
bool
is_integer(char *str, u32 index, u32 length)
{
for (;length; --length) {
char ch = *(str + index + length - 1);
if (!(ch == '0'
|| ch == '1'
|| ch == '2'
|| ch == '3'
|| ch == '4'
|| ch == '5'
|| ch == '6'
|| ch == '7'
|| ch == '8'
|| ch == '9'))
{
return 0;
}
}
return 1;
}
u32
convert_integer(char *str, u32 index, u32 length)
{
u32 result = 0;
length = length + index;
for (; index < length; ++index) {
char ch = *(str + index);
u8 digit = (u8)ch - (u8)'0';
result = result * 10 + digit;
}
//print_str("converted integer ");print_i64(result);print_endl();
return JOY_VALUE(joyInt, result);
}
u32
tokenate(char *str, u32 index, u32 length)
{
if (4 == length
&& *(str + index) == 't'
&& *(str + index + 1) == 'r'
&& *(str + index + 2) == 'u'
&& *(str + index + 3) == 'e'
) {
//print_str("tokenate true");print_endl();
return JOY_VALUE(joyBool, 1);
}
if (5 == length
&& *(str + index) == 'f'
&& *(str + index + 1) == 'a'
&& *(str + index + 2) == 'l'
&& *(str + index + 3) == 's'
&& *(str + index + 4) == 'e'
) {
//print_str("tokenate false");print_endl();
return JOY_VALUE(joyBool, 0);
}
if (is_integer(str, index, length)) {
//print_str("tokenate integer");print_endl();
return convert_integer(str, index, length);
}
// TODO: Convert bools and ints here?
// Use ht_insert to avoid multiple allocations of the same string!
char *token = allocate_string(str, index, length);
if (!token)
return 0; // OOM
return JOY_VALUE(joySymbol, ht_insert(token));
}
u32
tokenize0(char *str, u32 str_length, u32 index, u32 acc)
{
if (index >= str_length) {
//print_i64(index);print_str(" : ");print_str("END tokenize");print_endl();
//print_i64(acc);print_str("<");print_endl();
return acc;
}
//print_i64(index);print_str(" : ");print_str(str + index);print_endl();
char ch = str[index];
if ('[' == ch) {
acc = cons(LEFT_BRACKET, tokenize0(str, str_length, index + 1, acc));
//print_i64(acc);print_str("<[");print_endl();
return acc;
}
if (']' == ch) {
acc = cons(RIGHT_BRACKET, tokenize0(str, str_length, index + 1, acc));
//print_i64(acc);print_str("<]");print_endl();
return acc;
}
if (' ' == ch) {
return tokenize0(str, str_length, index + 1, acc);
}
u32 i = index + 1;
for (; i < str_length; ++i) {
if (str[i] == '[' || str[i] == ']' || str[i] == ' ') {
break;
}
}
// i == str_length OR str[i] is a delimiter char.
return cons(tokenate(str, index, i - index), tokenize0(str, str_length, i, acc));
}
u32
tokenize(char *str)
{
return tokenize0(str, strlen(str), 0, empty_list);
}
u32
rev(u32 el, u32 end)
{
u32 t = tail(el);
tails[el] = end;
if (t) {
return rev(t, el);
}
return el;
}
u32
reverse_list_in_place(u32 el)
{
if (!el) { // Empty list.
return el;
}
u32 t = tail(el);
if (!t) { // Length-one list.
return el;
}
// Re-write tails[el] to point to null
tails[el] = empty_list;
// reverse the tail of this list and prepend it to [el].
return rev(t, el);
}
u32 stack[1000];
u32 stack_top = 0;
u32
text_to_expression(char *str)
{
u32 frame = empty_list;
u32 tokens = tokenize(str);
//print_str("tokens: "); print_joy_list(tokens); print_endl();
//return tokens;
while (tokens) {
u32 tok = head(tokens);
tokens = tail(tokens);
if (LEFT_BRACKET == tok) {
//print_str("left bracket");print_endl();
stack[stack_top] = frame;
++stack_top;
frame = empty_list;
continue;
}
if (RIGHT_BRACKET == tok) {
//print_str("right bracket");print_endl();
tok = reverse_list_in_place(frame);
//print_str("new list: "); print_joy_list(tok); print_endl();
--stack_top; // This first! THEN get the old frame! D'oh!
frame = stack[stack_top];
}
frame = cons(tok, frame);
//print_str("t2e frame: "); print_joy_list(frame); print_endl();
}
return reverse_list_in_place(frame);
}
void
print_joy_value(u32 jv)
{
u8 type = TYPE_OF(jv);
if (type == joyInt) {
print_i64(VALUE_OF(jv));
} else if (type == joyBool) {
print_str(VALUE_OF(jv) ? "true" : "false");
} else if (type == joyList) {
print_str("[");
print_joy_list(jv);
print_str("]");
} else if (type == joySymbol) {
char *str = ht_lookup(VALUE_OF(jv));
/*if (error != NO_ERROR)*/
/* return;*/
print_str(str);
}
}
void
print_joy_list(u32 list)
{
while (list) {
print_joy_value(head(list));
list = tail(list);
if (list) {
print_str(" ");
}
}
}
void
main()
{
memset(string_heap, 0, sizeof(string_heap));
LEFT_BRACKET = JOY_VALUE(joySymbol, ht_insert(LEFT_BRACKET_symbol));
RIGHT_BRACKET = JOY_VALUE(joySymbol, ht_insert(RIGHT_BRACKET_symbol));
//char *buffer = " 1[2[ 3 cats ]4] cats dogs bunnies";
char *buffer = " 1[2]3 bunnies";
/*print_str(allocate_string(buffer, 4, 4)); print_endl();*/
/*print_str(allocate_string(buffer, 2, 4)); print_endl();*/
/*print_str(allocate_string(buffer, 7, 5)); print_endl();*/
u32 jv = text_to_expression(" 1[2[true 3][[]]bob]false[]bob 3[4]5");
//jv = reverse_list_in_place(jv);
if (LEFT_BRACKET == jv) {
print_str("boo");
} else {
//jv = JOY_VALUE(joyList, jv);
print_joy_list(jv);
}
print_endl();
}