Joy parser in NCC.

2023-03-04 08:25:32 -08:00 · 2023-03-04 08:25:32 -08:00 · fc5992c23b
parent cf37e52550
commit fc5992c23b
2 changed files with 300 additions and 0 deletions
--- a/implementations/uvm-ncc/parser.c
+++ b/implementations/uvm-ncc/parser.c
@ -0,0 +1,219 @@
 #include <uvm/syscalls.h>
 #include <string.h>
 //
 // Simple cons list.
 //
 // Cells are stored in two parallel arrays: cell i keeps its head in
 // heads[i] and its tail in tails[i].
 #define HEAP_SIZE 1024
 u32 heads[HEAP_SIZE];
 u32 tails[HEAP_SIZE];
 // Index of the next unused cell; cons() increments it on every
 // successful allocation.
 u32 free = 0;
 // A Joy value packs a 2-bit type tag into bits 31..30 and a 30-bit
 // payload into bits 29..0.
 //
 // Every macro argument and every expansion is fully parenthesized so
 // the macros can be used inside larger expressions and be given
 // compound expressions as arguments without precedence surprises.
 //
 // TYPE_OF:   extract the type tag of a value.
 // VALUE_OF:  extract the 30-bit payload of a value.
 // JOY_VALUE: build a value from a type tag and a payload; bits of
 //            either argument that do not fit are discarded.
 #define TYPE_OF(pointer) ((pointer) >> 30)
 #define VALUE_OF(pointer) ((pointer) & 0x3fffffff)
 #define JOY_VALUE(type, value) ((((type) & 3) << 30) | ((value) & 0x3fffffff))
 // Type tags: the value placed in the top two bits by JOY_VALUE() and
 // read back by TYPE_OF().
 u8 joyInt = 0;
 u8 joyList = 1;
 u8 joySymbol = 2;
 u8 joyBool = 3;
 // The empty list is the all-zero value (which is also joyInt 0).
 u32 empty_list = 0;
 // Allocate a fresh cell holding (head, tail) and return it as a
 // joyList-tagged value.  Returns -1 (converted to u32) when all
 // HEAP_SIZE cells are already in use.
 u32
 cons(u32 head, u32 tail)
 {
 	u32 slot = free;
 	if (slot >= HEAP_SIZE)
 		return -1;
 	// Claim the slot, then fill it in.
 	free = slot + 1;
 	heads[slot] = head;
 	tails[slot] = tail;
 	return JOY_VALUE(joyList, slot);
 }
 // Return the head stored in the cell that LIST refers to.  The type
 // tag is ignored and the cell index is not range-checked.
 u32
 head(u32 list)
 {
 	u32 slot = VALUE_OF(list);
 	return heads[slot];
 }
 // Return the tail stored in the cell that LIST refers to.  The type
 // tag is ignored and the cell index is not range-checked.
 u32
 tail(u32 list)
 {
 	u32 slot = VALUE_OF(list);
 	return tails[slot];
 }
 /*u32 cons(u32 head, u32 tail) { return tail; }*/
 // And now for a hash table.
 // https://benhoyt.com/writings/hash-table-in-c/#hash-tables
 // https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function
 // Starting value and per-byte multiplier used by hash_key(): the
 // 64-bit FNV offset basis and the 64-bit FNV prime.
 #define FNV_OFFSET 0xcbf29ce484222325
 #define FNV_PRIME 0x100000001b3
 // Hash the NUL-terminated string KEY: starting from FNV_OFFSET, each
 // byte is XORed into the hash, which is then multiplied by FNV_PRIME.
 // An empty string hashes to FNV_OFFSET.
 u64
 hash_key(char* key)
 {
 	u64 h = FNV_OFFSET;
 	char *cursor = key;
 	while (*cursor) {
 		// Go through unsigned char so bytes >= 0x80 are not sign-extended.
 		u64 byte = (u64)(unsigned char)(*cursor);
 		h = (h ^ byte) * FNV_PRIME;
 		++cursor;
 	}
 	return h;
 }
 // Capacity is a power of two: 2**EXPONENT, with EXPONENT = 10 for now.
 #define EXPONENT 10
 #define CAPACITY 1024
 // CAPACITY - 1, i.e. a mask selecting the low EXPONENT bits.
 #define HASH_MASK 1023
 // Open-addressed table of interned symbol strings.  A null pointer
 // marks an unused slot.
 char* hash_table[CAPACITY];
 // Intern SYMBOL in hash_table and return its joySymbol value, whose
 // payload is the low 30 bits of the symbol's hash.
 //
 // The table stores the pointer itself (no copy is made), so SYMBOL
 // must stay valid for as long as the table is in use.  Two strings
 // are treated as the same symbol when their 64-bit hashes are equal;
 // the strings themselves are never compared.
 //
 // If every slot is already taken by other symbols, SYMBOL is not
 // stored: a message is printed and the value is still returned, but
 // a later ht_lookup() of it will find nothing.
 u32
 ht_insert(char *symbol)
 {
 	u64 hash = hash_key(symbol);
 	u32 value = JOY_VALUE(joySymbol, VALUE_OF(hash));
 	u32 index = hash % CAPACITY;
 	// https://en.wikipedia.org/wiki/Double_hashing
 	// Rather than use another hash function, the probe step comes
 	// from the extra bits of the same hash.  Forcing it odd makes it
 	// coprime with the power-of-two CAPACITY, so CAPACITY probes
 	// visit every slot exactly once.
 	u32 increment = ((VALUE_OF(hash) >> EXPONENT) | 1) % CAPACITY;
 	char *candidate = 0;
 	u32 probes = 0;
 	while (probes < CAPACITY) {
 		candidate = hash_table[index];
 		if (!candidate) {
 			// Free slot: the symbol is new.
 			hash_table[index] = symbol;
 			print_str("interning ");print_str(symbol);print_endl();
 			return value;
 		}
 		// Compare pointers, then hashes (we already have one
 		// hash, so this is presumably no more expensive than a
 		// string comparison).
 		if (candidate == symbol || hash == hash_key(candidate))
 			return value;
 		index = (index + increment) % CAPACITY;
 		++probes;
 	}
 	// Every slot was inspected and none was free or matching.
 	// Without the probe bound this loop would never terminate.
 	print_str("symbol table full, cannot intern ");print_str(symbol);print_endl();
 	return value;
 }
 // Find the interned string for a symbol value.
 //
 // HASH may be either the bare 30-bit payload or a complete tagged
 // symbol value: it is reduced with VALUE_OF() before use, so the type
 // bits never take part in probing or comparison.
 //
 // Returns the stored string, or 0 when no interned symbol has that
 // 30-bit hash.
 char*
 ht_lookup(u32 hash)
 {
 	u32 key = VALUE_OF(hash);
 	u32 index = key % CAPACITY;
 	// Same probe sequence as ht_insert(): an odd step taken from the
 	// bits above the index bits.
 	u32 increment = ((key >> EXPONENT) | 1) % CAPACITY;
 	char *candidate = 0;
 	u32 probes = 0;
 	// At most CAPACITY probes are needed to see every slot; the bound
 	// keeps the search finite even when the table has no empty slot.
 	while (probes < CAPACITY) {
 		candidate = hash_table[index];
 		if (!candidate)
 			break;
 		if (key == VALUE_OF(hash_key(candidate)))
 			return candidate;
 		index = (index + increment) % CAPACITY;
 		++probes;
 	}
 	/*error = UNKNOWN_WORD_ERROR;*/
 	return 0;
 }
 //
 // Simple string storage heap.
 //
 // Strings are appended one after another into a fixed buffer by
 // allocate_string().
 #define STRING_HEAP_SIZE 100000
 char string_heap[STRING_HEAP_SIZE];
 // Offset in string_heap where the next string will be written.
 u32 string_heap_top = 0;
 // Copy LENGTH bytes starting at BUFFER[OFFSET] into the string heap,
 // NUL-terminate the copy, and return a pointer to it.
 //
 // Returns 0 when the heap does not have LENGTH + 1 bytes left; in
 // that case nothing is written.
 char*
 allocate_string(char *buffer, u32 offset, u32 length)
 {
 	// Index of the terminating NUL.  Widen before adding so a huge
 	// LENGTH cannot wrap around and slip past the bounds check.
 	u64 terminator = (u64)string_heap_top + (u64)length;
 	if (terminator >= STRING_HEAP_SIZE)
 		return 0;
 	memcpy(string_heap + string_heap_top, buffer + offset, length);
 	// The terminator goes directly after the copied bytes, so the
 	// result is a valid string even if the heap was not zeroed first.
 	string_heap[terminator] = '\0';
 	u32 new_string = string_heap_top;
 	// The next string starts right after this one's terminator.
 	string_heap_top = (u32)terminator + 1;
 	return string_heap + new_string;
 }
 // Intern SYMBOL and return a new list whose head is the resulting
 // symbol value and whose tail is STACK.
 u32
 push_symbol(char *symbol, u32 stack)
 {
 	u32 interned = ht_insert(symbol);
 	u32 term = JOY_VALUE(joySymbol, interned);
 	return cons(term, stack);
 }
 // Marker values that tokenize0() puts in the token list for '[' and
 // ']'.
 // NOTE(review): 0xffffffff is also what cons() yields for its -1
 // out-of-memory return if u32 is 32 bits wide -- confirm the two can
 // never be confused.
 #define LEFT_BRACKET 0xffffffff
 #define RIGHT_BRACKET 0xfffffffe
 // Tokenize STR[INDEX .. STR_LENGTH) and return the tokens as a list in
 // source order, followed by ACC.
 //
 // '[' and ']' become LEFT_BRACKET / RIGHT_BRACKET markers, blanks are
 // skipped, and any other run of characters is copied into the string
 // heap and pushed as a symbol.  Returns 0 if the string heap cannot
 // hold a token.
 u32
 tokenize0(char *str, u32 str_length, u32 index, u32 acc)
 {
 	// Skip blanks.
 	while (index < str_length && ' ' == str[index])
 		++index;
 	if (index >= str_length)
 		return acc;
 	char ch = str[index];
 	if ('[' == ch || ']' == ch) {
 		// The remainder is tokenized first, then the marker is
 		// consed on the front.
 		u32 after_bracket = tokenize0(str, str_length, index + 1, acc);
 		u32 marker = RIGHT_BRACKET;
 		if ('[' == ch)
 			marker = LEFT_BRACKET;
 		return cons(marker, after_bracket);
 	}
 	// Find the end of the term: the next delimiter or the end of STR.
 	u32 stop = index + 1;
 	while (stop < str_length && str[stop] != '[' && str[stop] != ']' && str[stop] != ' ')
 		++stop;
 	char *token = allocate_string(str, index, stop - index);
 	if (!token)
 		return 0;  // OOM
 	u32 after_token = tokenize0(str, str_length, stop, acc);
 	return push_symbol(token, after_token);
 }
 // Tokenize the whole NUL-terminated string STR, starting from the
 // empty list.  See tokenize0() for the shape of the result.
 u32
 tokenize(char *str)
 {
 	u32 length = strlen(str);
 	u32 start = 0;
 	return tokenize0(str, length, start, empty_list);
 }
 /*u32*/
 /*parse0(u32 tokens, u32 acc)*/
 /*{*/
 /*	if (!tokens)*/
 /*		return acc;*/
 /*	u32 tok = head(tokens);*/
 /*	return parse0(tokens, empty_list);*/
 /*}*/
 /*u32*/
 /*parse(u32 tokens)*/
 /*{*/
 /*	return parse0(tokens, empty_list);*/
 /*}*/
 void
 main()
 {
 	memset(string_heap, 0, sizeof(string_heap));
 	char *buffer = " 1[2[ 3 ]4] cats dogs bunnies";
 	/*print_str(allocate_string(buffer, 4, 4)); print_endl();*/
 	/*print_str(allocate_string(buffer, 2, 4)); print_endl();*/
 	/*print_str(allocate_string(buffer, 7, 5)); print_endl();*/
 	tokenize(buffer);
 }
--- a/implementations/uvm-ncc/parser.py
+++ b/implementations/uvm-ncc/parser.py
@ -0,0 +1,81 @@
 // Index into the text being parsed of the next character to examine.
 // parse() resets it to 0; parse() and parse_list() advance it.
 u32 parse_head;
 // Build the list node for a non-list term.
 //
 // At this point text[parse_head:rest] is a term: symbol or int.
 // Placeholder for now: REST is ignored and every term becomes a
 // one-element list holding the symbol "foo".
 u32
 make_non_list_node(u32 rest)
 {
 	u32 node = push_symbol("foo", empty_list);
 	return node;
 }
 // Extract terms from the text until a closing bracket is found.
 //
 // Parsing starts at text[parse_head], just inside an opening '['.
 // Returns the list of terms up to the matching ']' and leaves
 // parse_head just past that bracket.  A nested '[' is parsed
 // recursively and becomes a single element of the list.
 //
 // If the text ends before the closing bracket, a message is printed,
 // error is set to MISSING_CLOSING_BRACKET and 0 is returned.
 u32
 parse_list(char *text)
 {
 	// trim blanks
 	while (' ' == text[parse_head]) ++parse_head;
 	if (!text[parse_head]) {
 		print_str("Missing ']' bracket. A");
 		print_endl();
 		error = MISSING_CLOSING_BRACKET;
 		return 0;
 	}
 	if (']' == text[parse_head]) {
 		// End of this list: consume the bracket and stop here.
 		// Whatever follows belongs to the enclosing list.
 		++parse_head;
 		return empty_list;
 	}
 	u32 result = 0;
 	if ('[' == text[parse_head]) {
 		// Nested list: step inside it, parse it, then wrap it in
 		// a cell of its own so it becomes one element of this list.
 		++parse_head;
 		u32 inner = parse_list(text);
 		if (error == MISSING_CLOSING_BRACKET)
 			return 0;
 		result = cons(inner, empty_list);
 	} else {
 		// Collect all chars up until the next '[', ']', blank,
 		// or the end of the string.
 		u32 rest = parse_head;
 		while (text[rest]) {
 			if (text[rest] == '[' || text[rest] == ']' || text[rest] == ' ')
 				break;
 			++rest;
 		}
 		if (!text[rest]) {
 			print_str("Missing ']' bracket. B");
 			print_endl();
 			error = MISSING_CLOSING_BRACKET;
 			return 0;
 		}
 		// At this point text[parse_head:rest] is a non-empty
 		// term: symbol or int.
 		result = make_non_list_node(rest);
 		parse_head = rest;
 	}
 	// Parse the remaining elements and link them after this one.
 	u32 remainder = parse_list(text);
 	if (error == MISSING_CLOSING_BRACKET)
 		return 0;
 	tails[VALUE_OF(result)] = remainder;
 	return result;
 }
 // Parse TEXT from the beginning.
 //
 // Returns empty_list when TEXT is empty or all blanks.  When the
 // first non-blank character is '[', returns the list parsed by
 // parse_list(), or 0 if that left an error set.
 //
 // TODO: text that starts with anything other than '[' is not parsed
 // yet; 0 is returned for it and no error is recorded.
 u32
 parse(char *text)
 {
 	parse_head = 0;
 	// trim blanks
 	while (' ' == text[parse_head]) ++parse_head;
 	if (!text[parse_head]) return empty_list;
 	if ('[' == text[parse_head]) {
 		++parse_head;
 		u32 list = parse_list(text);
 		if (error != NO_ERROR)
 			return 0;
 		return list;
 	}
 	// Always return a defined value instead of falling off the end
 	// of a non-void function.
 	return 0;
 }