Thun/implementations/uvm-ncc/joy.c

582 lines
19 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright © 2023 Simon Forman
//
// This file is part of Thun
//
// Thun is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Thun is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Thun. If not see <http://www.gnu.org/licenses/>.
//
#include <uvm/syscalls.h>
#include <string.h>
/*
███████╗██████╗ ██████╗ ██████╗ ██████╗
██╔════╝██╔══██╗██╔══██╗██╔═══██╗██╔══██╗
█████╗ ██████╔╝██████╔╝██║ ██║██████╔╝
██╔══╝ ██╔══██╗██╔══██╗██║ ██║██╔══██╗
███████╗██║ ██║██║ ██║╚██████╔╝██║ ██║
╚══════╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═╝
██╗ ██╗ █████╗ ███╗ ██╗██████╗ ██╗ ██╗███╗ ██╗ ██████╗
██║ ██║██╔══██╗████╗ ██║██╔══██╗██║ ██║████╗ ██║██╔════╝
███████║███████║██╔██╗ ██║██║ ██║██║ ██║██╔██╗ ██║██║ ███╗
██╔══██║██╔══██║██║╚██╗██║██║ ██║██║ ██║██║╚██╗██║██║ ██║
██║ ██║██║ ██║██║ ╚████║██████╔╝███████╗██║██║ ╚████║╚██████╔╝
╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═══╝╚═════╝ ╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
No setjmp/longjmp, so let's have a global error value and check it after ops.
*/
u64 error = 0;
#define NO_ERROR 0
#define UNKNOWN_WORD_ERROR 1
#define MISSING_CLOSING_BRACKET 2
#define EXTRA_CLOSING_BRACKET 3
#define CONS_HEAP_OOM 4
#define STRING_HEAP_OOM 5
/*
char *error_messages[3] = {
"",
"Unknown word",
"Missing closing bracket"
};
*/
/*
██████╗ ██████╗ ███╗ ██╗███████╗ ██╗ ██╗███████╗ █████╗ ██████╗
██╔════╝██╔═══██╗████╗ ██║██╔════╝ ██║ ██║██╔════╝██╔══██╗██╔══██╗
██║ ██║ ██║██╔██╗ ██║███████╗ ███████║█████╗ ███████║██████╔╝
██║ ██║ ██║██║╚██╗██║╚════██║ ██╔══██║██╔══╝ ██╔══██║██╔═══╝
╚██████╗╚██████╔╝██║ ╚████║███████║ ██║ ██║███████╗██║ ██║██║
╚═════╝ ╚═════╝ ╚═╝ ╚═══╝╚══════╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═╝
Cons Heap
We don't have Unions, Enums, or Typedefs. So how do we represent Joy types?
In SICP they use a pair of arrays of pointers, one for heads and one
for tails.
> A pointer to a pair is an index into the two vectors.
*/
#define HEAP_SIZE 1024
u32 heads[HEAP_SIZE];
u32 tails[HEAP_SIZE];
// cell 0 is reserved so that 0 can be the empty list.
u32 free = 1;
// > We also need a representation for objects other than pairs (such as
// > numbers and symbols) and a way to distinguish one kind of data from
// > another. There are many methods of accomplishing this, but they all
// > reduce to using typed pointers, that is, to extending the notion of
// > ``pointer'' to include information on data type.
// Let's use u32 with the two MSB's for the type tag.
#define TYPE_OF(pointer) (pointer >> 30)
#define VALUE_OF(pointer) (pointer & 0x3fffffff)
#define JOY_VALUE(type, value) ((type & 3) << 30) | (value & 0x3fffffff)
/*
This means that our ints are restricted to 30 bits for now, until
I implement bignums.
In the Thun dialect of Joy we have four types of values:
Integers, Booleans, Symbols, and Lists.
*/
u8 joyList = 0;
u8 joyInt = 1;
u8 joySymbol = 2;
u8 joyBool = 3;
// Because the type tag for lists is 0 the empty list is just 0;
u32 empty_list = 0;
u32
cons(u32 head, u32 tail)
{
if (free >= HEAP_SIZE) {
error = CONS_HEAP_OOM;
return -1;
}
heads[free] = head;
tails[free] = tail;
u32 cell = JOY_VALUE(joyList, free);
++free;
return cell;
}
u32 head(u32 list) { return heads[VALUE_OF(list)]; }
u32 tail(u32 list) { return tails[VALUE_OF(list)]; }
/*
███████╗████████╗██████╗ ██╗███╗ ██╗ ██████╗
██╔════╝╚══██╔══╝██╔══██╗██║████╗ ██║██╔════╝
███████╗ ██║ ██████╔╝██║██╔██╗ ██║██║ ███╗
╚════██║ ██║ ██╔══██╗██║██║╚██╗██║██║ ██║
███████║ ██║ ██║ ██║██║██║ ╚████║╚██████╔╝
╚══════╝ ╚═╝ ╚═╝ ╚═╝╚═╝╚═╝ ╚═══╝ ╚═════╝
██╗ ██╗███████╗ █████╗ ██████╗
██║ ██║██╔════╝██╔══██╗██╔══██╗
███████║█████╗ ███████║██████╔╝
██╔══██║██╔══╝ ██╔══██║██╔═══╝
██║ ██║███████╗██║ ██║██║
╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═╝
Simple string storage heap.
We need a place to keep symbol strings.
*/
#define STRING_HEAP_SIZE 100000
char string_heap[STRING_HEAP_SIZE];
u32 string_heap_top = 0;
char*
allocate_string(char *buffer, u32 offset, u32 length)
{
u64 end = string_heap_top + length + 1;
if (end >= STRING_HEAP_SIZE) {
error = STRING_HEAP_OOM;
return 0;
}
memcpy(string_heap + string_heap_top, buffer + offset, length);
string_heap[end] = '\0';
u32 new_string = string_heap_top;
string_heap_top = (u32)end + 1;
//print_str("allocating ");print_str(string_heap + new_string);print_endl();
return string_heap + new_string;
}
/*
██████╗ ██████╗ ██╗███╗ ██╗████████╗███████╗██████╗
██╔══██╗██╔══██╗██║████╗ ██║╚══██╔══╝██╔════╝██╔══██╗
██████╔╝██████╔╝██║██╔██╗ ██║ ██║ █████╗ ██████╔╝
██╔═══╝ ██╔══██╗██║██║╚██╗██║ ██║ ██╔══╝ ██╔══██╗
██║ ██║ ██║██║██║ ╚████║ ██║ ███████╗██║ ██║
╚═╝ ╚═╝ ╚═╝╚═╝╚═╝ ╚═══╝ ╚═╝ ╚══════╝╚═╝ ╚═╝
Printer
*/
void
print_joy_value(u32 jv)
{
u8 type = TYPE_OF(jv);
if (type == joyInt) {
print_i64(VALUE_OF(jv));
} else if (type == joyBool) {
print_str(VALUE_OF(jv) ? "true" : "false");
} else if (type == joyList) {
print_str("[");
print_joy_list(jv);
print_str("]");
} else if (type == joySymbol) {
char *str = ht_lookup(VALUE_OF(jv));
if (error != NO_ERROR)
return;
print_str(str);
}
}
void
print_joy_list(u32 list)
{
while (list) {
print_joy_value(head(list));
if (error != NO_ERROR)
return;
list = tail(list);
if (list) {
print_str(" ");
}
}
}
/*
██╗ ██╗ █████╗ ███████╗██╗ ██╗
██║ ██║██╔══██╗██╔════╝██║ ██║
███████║███████║███████╗███████║
██╔══██║██╔══██║╚════██║██╔══██║
██║ ██║██║ ██║███████║██║ ██║
╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝
████████╗ █████╗ ██████╗ ██╗ ███████╗
╚══██╔══╝██╔══██╗██╔══██╗██║ ██╔════╝
██║ ███████║██████╔╝██║ █████╗
██║ ██╔══██║██╔══██╗██║ ██╔══╝
██║ ██║ ██║██████╔╝███████╗███████╗
╚═╝ ╚═╝ ╚═╝╚═════╝ ╚══════╝╚══════╝
And now for a hash table.
This table maps between hashes of symbol strings which are used in the
tagged pointers in Joy values and strings which are stored in the string
heap.
TODO: bool ht_has(char *str, u32 index, u32 length) to see if a fragment
of a string buffer is a symbol in the hash table.
FNV hash function.
https://benhoyt.com/writings/hash-table-in-c/#hash-tables
https://en.wikipedia.org/wiki/FowlerNollVo_hash_function
*/
#define FNV_OFFSET 0xcbf29ce484222325
#define FNV_PRIME 0x100000001b3
u64
hash_key(char* key)
{
u64 hash = FNV_OFFSET;
for (char* p = key; *p; ++p) {
hash = hash ^ (u64)(unsigned char)(*p);
hash = hash * FNV_PRIME;
}
return hash;
}
u64
hash_fragment(char *str, u32 index, u32 length)
{
u64 hash = FNV_OFFSET;
for (char* p = (str + index); length; --length, ++p) {
hash = hash ^ (u64)(unsigned char)(*p);
hash = hash * FNV_PRIME;
}
return hash;
}
// Capacity is a power of two (10 for now.)
#define EXPONENT 10
#define CAPACITY 1024
#define HASH_MASK 1023
// Note that there's no checking for filling the table and expanding.
// For now, I'm just going to use a "large enough" table and hope
// for the best. (We have thirty bits to work with so the obvious
// thing to do is make the exponent fifteen, half for the hash key
// and half for the increment.)
char* hash_table[CAPACITY];
u32
ht_insert(char *symbol)
{
u64 hash = hash_key(symbol);
u32 index = hash % CAPACITY;
char *candidate = hash_table[index];
if (!candidate) {
hash_table[index] = symbol;
return JOY_VALUE(joySymbol, VALUE_OF(hash));
}
// https://en.wikipedia.org/wiki/Double_hashing
// Rather than use another hash function I'm going to try
// using the extra bits of the same hash.
u32 increment = ((VALUE_OF(hash) >> EXPONENT) | 1) % CAPACITY;
// If I understand correctly, making the increment odd
// means it will traverse the whole (even-sized) table.
while (candidate) {
// Compare pointers then hashes (since we already have
// one hash I'm guessing that that's cheaper or at least
// no more expensive than string comparision.)
if (candidate == symbol || hash == hash_key(candidate))
break;
index = (index + increment) % CAPACITY;
candidate = hash_table[index];
}
if (!candidate) {
hash_table[index] = symbol;
}
return JOY_VALUE(joySymbol, VALUE_OF(hash));
}
char*
ht_lookup(u32 hash)
{
// Note that hash must be truncated to N (N=30 as it happens) bits
// by VALUE_OF().
u32 index = hash % CAPACITY;
char *candidate = hash_table[index];
u32 increment = ((hash >> EXPONENT) | 1) % CAPACITY;
while (candidate) {
if (hash == VALUE_OF(hash_key(candidate)))
return candidate;
index = (index + increment) % CAPACITY;
candidate = hash_table[index];
}
error = UNKNOWN_WORD_ERROR;
return 0;
}
u32
ht_has(char *str, u32 index, u32 length)
{
u32 hash = VALUE_OF(hash_fragment(str, index, length));
ht_lookup(hash);
if (UNKNOWN_WORD_ERROR == error) {
error = NO_ERROR;
return 0;
}
return hash;
}
/******************************************************************************/
u32
push_symbol(char *symbol, u32 stack)
{
return cons(JOY_VALUE(joySymbol, ht_insert(symbol)), stack);
}
u32
push_int(u32 n, u32 stack)
{
return cons(JOY_VALUE(joyInt, n), stack);
}
/******************************************************************************/
bool
is_integer(char *str, u32 index, u32 length)
{
for (;length; --length) {
char ch = *(str + index + length - 1);
if (!(ch == '0'
|| ch == '1'
|| ch == '2'
|| ch == '3'
|| ch == '4'
|| ch == '5'
|| ch == '6'
|| ch == '7'
|| ch == '8'
|| ch == '9'))
{
return 0;
}
}
return 1;
}
u32
convert_integer(char *str, u32 index, u32 length)
{
u32 result = 0;
length = length + index;
for (; index < length; ++index) {
char ch = *(str + index);
u8 digit = (u8)ch - (u8)'0';
result = result * 10 + digit;
}
//print_str("converted integer ");print_i64(result);print_endl();
return JOY_VALUE(joyInt, result);
}
/******************************************************************************/
/*
██████╗ █████╗ ██████╗ ███████╗███████╗██████╗
██╔══██╗██╔══██╗██╔══██╗██╔════╝██╔════╝██╔══██╗
██████╔╝███████║██████╔╝███████╗█████╗ ██████╔╝
██╔═══╝ ██╔══██║██╔══██╗╚════██║██╔══╝ ██╔══██╗
██║ ██║ ██║██║ ██║███████║███████╗██║ ██║
╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚══════╝╚═╝ ╚═╝
Parser
*/
u32
intern(char *str, u32 index, u32 length)
{
u32 symbol_hash = ht_has(str, index, length);
if (!symbol_hash) {
char *token = allocate_string(str, index, length);
if (error != NO_ERROR) {
//print_str("a. Error code: ");print_i64(error);print_endl();
return 0;
}
symbol_hash = ht_insert(token);
}
return JOY_VALUE(joySymbol, symbol_hash);
}
u32
tokenate(char *str, u32 index, u32 length)
{
if (4 == length
&& *(str + index) == 't'
&& *(str + index + 1) == 'r'
&& *(str + index + 2) == 'u'
&& *(str + index + 3) == 'e'
) {
//print_str("tokenate true");print_endl();
return JOY_VALUE(joyBool, 1);
}
if (5 == length
&& *(str + index) == 'f'
&& *(str + index + 1) == 'a'
&& *(str + index + 2) == 'l'
&& *(str + index + 3) == 's'
&& *(str + index + 4) == 'e'
) {
//print_str("tokenate false");print_endl();
return JOY_VALUE(joyBool, 0);
}
if (is_integer(str, index, length)) {
//print_str("tokenate integer");print_endl();
return convert_integer(str, index, length);
}
return intern(str, index, length);
}
int
is_delimiter(char ch)
{
return ch == '[' || ch == ']' || ch == ' ';
}
// Store in-progress lists. Here as in the hash table I'm not checking
// for capacity overload or anything like that. If you think you're going
// to parse more than a five hundred '[' chars then increase the size of
// this array.
u32 t2e_stack[1000];
u32 t2e_stack_top = 0;
#define T2E_PUSH(thing) t2e_stack[t2e_stack_top] = (thing); ++t2e_stack_top; (thing) = empty_list;
#define T2E_POP(thing) if (!t2e_stack_top) { error = EXTRA_CLOSING_BRACKET; return 0; }; --t2e_stack_top; (thing) = t2e_stack[t2e_stack_top];
u32
text_to_expression(char *str)
{
u32 index = 0;
u32 end = empty_list;
u32 top = empty_list;
u32 tok = empty_list;
u64 str_length = strlen(str); // TODO: rewrite so we don't iterate through the string twice.
while (index < str_length) {
char ch = str[index];
if (' ' == ch) {
++index;
continue;
}
if ('[' == ch) { // start new list
++index;
T2E_PUSH(end)
T2E_PUSH(top)
continue;
}
if (']' == ch) { // finish last new list
++index;
tok = top;
T2E_POP(top)
T2E_POP(end)
} else {
u32 i = index + 1;
for (; i < str_length && !is_delimiter(str[i]); ++i) {}
// i == str_length OR str[i] is a delimiter char.
tok = tokenate(str, index, i - index);
index = i;
}
u32 cell = cons(tok, empty_list);
if (end) tails[end] = cell;
if (!top) top = cell;
end = cell;
}
if (t2e_stack_top) {
error = MISSING_CLOSING_BRACKET;
return empty_list;
}
return top;
}
u32
joy(u32 stack, u32 expression)
{
u32 term;
while (expression) {
term = head(expression);
expression = tail(expression);
//print_joy_value(term);print_endl();
//print_i64(expression);print_endl();
if (TYPE_OF(term) == joySymbol) {
char *str = ht_lookup(VALUE_OF(term));
if (error != NO_ERROR)
return 0;
print_str(str);print_endl();
} else { // type == joyInt || type == joyBool || type == joyList
stack = cons(term, stack);
}
}
return stack;
}
void
main()
{
memset(hash_table, 0, sizeof(hash_table));
memset(string_heap, 0, sizeof(string_heap));
memset(t2e_stack, 0, sizeof(t2e_stack));
error = NO_ERROR;
// TODO: these should be global.
u32 joy_true = JOY_VALUE(joyBool, 1);
u32 joy_false = JOY_VALUE(joyBool, 0);
/*
u32 stack = empty_list;
stack = push_int(23, stack);
stack = cons(joy_true, stack);
stack = push_int(42, stack);
stack = push_symbol("cats", stack);
u32 el = empty_list;
el = push_int(48, el);
el = cons(el, el);
stack = cons(el, stack);
stack = cons(joy_false, stack);
stack = push_int(273, stack);
print_joy_list(stack);
print_endl();
*/
u32 expression = text_to_expression(" 1[2[true 3][aa[aa bb] aa bb cc]bob]false[]bob 3[4] ga[]ry");
print_joy_list(expression);
print_endl();
u32 stack = joy(empty_list, expression);
print_joy_list(stack);
print_endl();
}