diff --git a/implementations/uvm-ncc/parser.c b/implementations/uvm-ncc/parser.c new file mode 100644 index 0000000..a496a29 --- /dev/null +++ b/implementations/uvm-ncc/parser.c @@ -0,0 +1,219 @@ +#include +#include + + + + + +// +// Simple cons list. +// + +#define HEAP_SIZE 1024 +u32 heads[HEAP_SIZE]; +u32 tails[HEAP_SIZE]; +u32 free = 0; +#define TYPE_OF(pointer) (pointer >> 30) +#define VALUE_OF(pointer) (pointer & 0x3fffffff) +#define JOY_VALUE(type, value) ((type & 3) << 30) | (value & 0x3fffffff) +u8 joyInt = 0; +u8 joyList = 1; +u8 joySymbol = 2; +u8 joyBool = 3; +u32 empty_list = 0; +u32 +cons(u32 head, u32 tail) +{ + if (free >= HEAP_SIZE) + return -1; + heads[free] = head; + tails[free] = tail; + u32 cell = JOY_VALUE(joyList, free); + ++free; + return cell; +} +u32 head(u32 list) { return heads[VALUE_OF(list)]; } +u32 tail(u32 list) { return tails[VALUE_OF(list)]; } + + +/*u32 cons(u32 head, u32 tail) { return tail; }*/ + + + + + + + +// And now for a hash table. +// https://benhoyt.com/writings/hash-table-in-c/#hash-tables +// https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function +#define FNV_OFFSET 0xcbf29ce484222325 +#define FNV_PRIME 0x100000001b3 +u64 +hash_key(char* key) +{ + u64 hash = FNV_OFFSET; + for (char* p = key; *p; ++p) { + hash = hash ^ (u64)(unsigned char)(*p); + hash = hash * FNV_PRIME; + } + return hash; +} +// Capacity is a power of two (10 for now.) +#define EXPONENT 10 +#define CAPACITY 1024 +#define HASH_MASK 1023 +char* hash_table[CAPACITY]; +u32 +ht_insert(char *symbol) +{ + u64 hash = hash_key(symbol); + u32 index = hash % CAPACITY; + + char *candidate = hash_table[index]; + if (!candidate) { + print_str("interning ");print_str(symbol);print_endl(); + hash_table[index] = symbol; + return JOY_VALUE(joySymbol, VALUE_OF(hash)); + } + + // https://en.wikipedia.org/wiki/Double_hashing + // Rather than use another hash function I'm going to try + // using the extra bits of the same hash. + u32 increment = ((VALUE_OF(hash) >> EXPONENT) | 1) % CAPACITY; + // If I understand correctly, making the increment odd + // means it will traverse the whole (even-sized) table. + while (candidate) { + // Compare pointers then hashes (since we already have + // one hash I'm guessing that that's cheaper or at least + // no more expensive than string comparision.) + if (candidate == symbol || hash == hash_key(candidate)) + break; + index = (index + increment) % CAPACITY; + candidate = hash_table[index]; + } + if (!candidate) { + hash_table[index] = symbol; + print_str("interning ");print_str(symbol);print_endl(); + } + return JOY_VALUE(joySymbol, VALUE_OF(hash)); +} +char* +ht_lookup(u32 hash) +{ + // Note that hash will be truncated to N (N=30 as it happens) bits + // by VALUE_OF(). + u32 index = hash % CAPACITY; + char *candidate = hash_table[index]; + u32 increment = ((hash >> EXPONENT) | 1) % CAPACITY; + while (candidate) { + if (hash == VALUE_OF(hash_key(candidate))) + return candidate; + index = (index + increment) % CAPACITY; + candidate = hash_table[index]; + } + /*error = UNKNOWN_WORD_ERROR;*/ + return 0; +} + + + + + + + +// +// Simple string storage heap. +// +#define STRING_HEAP_SIZE 100000 +char string_heap[STRING_HEAP_SIZE]; +u32 string_heap_top = 0; +char* +allocate_string(char *buffer, u32 offset, u32 length) +{ + u64 end = string_heap_top + length + 1; + if (end >= STRING_HEAP_SIZE) + return 0; + memcpy(string_heap + string_heap_top, buffer + offset, length); + string_heap[end] = '\0'; + u32 new_string = string_heap_top; + string_heap_top = (u32)end + 1; + return string_heap + new_string; + +} + + +u32 +push_symbol(char *symbol, u32 stack) +{ + return cons(JOY_VALUE(joySymbol, ht_insert(symbol)), stack); +} + + + +#define LEFT_BRACKET 0xffffffff +#define RIGHT_BRACKET 0xfffffffe + +u32 +tokenize0(char *str, u32 str_length, u32 index, u32 acc) +{ + if (index >= str_length) + return acc; + char ch = str[index]; + if ('[' == ch) { + return cons(LEFT_BRACKET, tokenize0(str, str_length, index + 1, acc)); + } + if (']' == ch) { + return cons(RIGHT_BRACKET, tokenize0(str, str_length, index + 1, acc)); + } + if (' ' == ch) { + return tokenize0(str, str_length, index + 1, acc); + } + u32 i = index + 1; + for (; i < str_length; ++i) { + if (str[i] == '[' || str[i] == ']' || str[i] == ' ') { + break; + } + } + // i == str_length OR str[i] is a delimiter char. + char *token = allocate_string(str, index, i - index); + if (!token) + return 0; // OOM + return push_symbol(token, tokenize0(str, str_length, i, acc)); + +} + + +u32 +tokenize(char *str) +{ + return tokenize0(str, strlen(str), 0, empty_list); +} + + +/*u32*/ +/*parse0(u32 tokens, u32 acc)*/ +/*{*/ +/* if (!tokens)*/ +/* return acc;*/ +/* u32 tok = head(tokens);*/ +/* return parse0(tokens, empty_list);*/ +/*}*/ + + +/*u32*/ +/*parse(u32 tokens)*/ +/*{*/ +/* return parse0(tokens, empty_list);*/ +/*}*/ + +void +main() +{ + memset(string_heap, 0, sizeof(string_heap)); + char *buffer = " 1[2[ 3 ]4] cats dogs bunnies"; + /*print_str(allocate_string(buffer, 4, 4)); print_endl();*/ + /*print_str(allocate_string(buffer, 2, 4)); print_endl();*/ + /*print_str(allocate_string(buffer, 7, 5)); print_endl();*/ + tokenize(buffer); +} \ No newline at end of file diff --git a/implementations/uvm-ncc/parser.py b/implementations/uvm-ncc/parser.py new file mode 100644 index 0000000..b43ae91 --- /dev/null +++ b/implementations/uvm-ncc/parser.py @@ -0,0 +1,81 @@ + +u32 parse_head; + +u32 +make_non_list_node(u32 rest) { + // A this point text[parse_head:rest] is a term: symbol or int. + return push_symbol("foo", empty_list); +} + + +// Extract terms from the text until a closing bracket is found. +u32 +parse_list(char *text) +{ + // trim blanks + while (text[parse_head] && text[parse_head] == ' ') ++parse_head; + + if (!text[parse_head]) { + print_str("Missing ']' bracket. A"); + print_endl(); + error = MISSING_CLOSING_BRACKET; + return 0; + }; + + // So now we want to collect all chars up until the + // next '[', ']', blank, or the end of the string. + u32 rest = parse_head; + while (text[rest]) { + if (text[rest] == '[' || text[rest] == ']' || text[rest] == ' ') + break; + //print_str(text + rest);print_endl(); + ++rest; + } + + if (!text[rest]) { + print_str("Missing ']' bracket. B"); + print_endl(); + error = MISSING_CLOSING_BRACKET; + return 0; + }; + + // A this point text[parse_head:rest] is a term: symbol or int. + // or it's empty + u32 diff = rest - parse_head; + u32 result = 0; + if (diff) { + result = make_non_list_node(rest); + parse_head = rest; + /*} else if ('[' == text[rest]) {*/ + /* parse_head = rest + 1;*/ + /* result = cons(parse_list(text), empty_list);*/ + } else if (']' == text[rest]) { + parse_head = rest + 1; + result = empty_list; + } + tails[VALUE_OF(result)] = parse_list(text); + return result; +} + +u32 +parse(char *text) +{ + parse_head = 0; + + // trim blanks + while (text[parse_head] && text[parse_head] == ' ') ++parse_head; + + if (!text[parse_head]) return empty_list; + + if ('[' == text[parse_head]) { + ++parse_head; + u32 list = parse_list(text); + if (error != NO_ERROR) + return 0; + return list; + /*foo = cons(list, foo);*/ + } + /*if (']')*/ + /*print_str(text + parse_head);*/ + /*print_str(text + parse_head);*/ +}