Did i not commit this?

This commit is contained in:
Simon Forman 2023-03-10 14:11:54 -08:00
parent 7bfcdd2758
commit c9e620c281
1 changed files with 581 additions and 0 deletions

View File

@ -0,0 +1,581 @@
// Copyright © 2023 Simon Forman
//
// This file is part of Thun
//
// Thun is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Thun is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Thun. If not see <http://www.gnu.org/licenses/>.
//
#include <uvm/syscalls.h>
#include <string.h>
/*
No setjmp/longjmp, so let's have a global error value and check it after ops.
*/
u64 error = 0;
#define NO_ERROR 0
#define UNKNOWN_WORD_ERROR 1
#define MISSING_CLOSING_BRACKET 2
#define EXTRA_CLOSING_BRACKET 3
#define CONS_HEAP_OOM 4
#define STRING_HEAP_OOM 5
/*
char *error_messages[3] = {
"",
"Unknown word",
"Missing closing bracket"
};
*/
/*
Cons Heap
We don't have Unions, Enums, or Typedefs. So how do we represent Joy types?
In SICP they use a pair of arrays of pointers, one for heads and one
for tails.
> A pointer to a pair is an index into the two vectors.
*/
#define HEAP_SIZE 1024
u32 heads[HEAP_SIZE];
u32 tails[HEAP_SIZE];
// cell 0 is reserved so that 0 can be the empty list.
u32 free = 1;
// > We also need a representation for objects other than pairs (such as
// > numbers and symbols) and a way to distinguish one kind of data from
// > another. There are many methods of accomplishing this, but they all
// > reduce to using typed pointers, that is, to extending the notion of
// > ``pointer'' to include information on data type.
// Let's use u32 with the two MSB's for the type tag.
#define TYPE_OF(pointer) (pointer >> 30)
#define VALUE_OF(pointer) (pointer & 0x3fffffff)
#define JOY_VALUE(type, value) ((type & 3) << 30) | (value & 0x3fffffff)
/*
This means that our ints are restricted to 30 bits for now, until
I implement bignums.
In the Thun dialect of Joy we have four types of values:
Integers, Booleans, Symbols, and Lists.
*/
u8 joyList = 0;
u8 joyInt = 1;
u8 joySymbol = 2;
u8 joyBool = 3;
// Because the type tag for lists is 0 the empty list is just 0;
u32 empty_list = 0;
u32
cons(u32 head, u32 tail)
{
if (free >= HEAP_SIZE) {
error = CONS_HEAP_OOM;
return -1;
}
heads[free] = head;
tails[free] = tail;
u32 cell = JOY_VALUE(joyList, free);
++free;
return cell;
}
u32 head(u32 list) { return heads[VALUE_OF(list)]; }
u32 tail(u32 list) { return tails[VALUE_OF(list)]; }
/*
Simple string storage heap.
We need a place to keep symbol strings.
*/
#define STRING_HEAP_SIZE 100000
char string_heap[STRING_HEAP_SIZE];
u32 string_heap_top = 0;
char*
allocate_string(char *buffer, u32 offset, u32 length)
{
u64 end = string_heap_top + length + 1;
if (end >= STRING_HEAP_SIZE) {
error = STRING_HEAP_OOM;
return 0;
}
memcpy(string_heap + string_heap_top, buffer + offset, length);
string_heap[end] = '\0';
u32 new_string = string_heap_top;
string_heap_top = (u32)end + 1;
//print_str("allocating ");print_str(string_heap + new_string);print_endl();
return string_heap + new_string;
}
/*
Printer
*/
void
print_joy_value(u32 jv)
{
u8 type = TYPE_OF(jv);
if (type == joyInt) {
print_i64(VALUE_OF(jv));
} else if (type == joyBool) {
print_str(VALUE_OF(jv) ? "true" : "false");
} else if (type == joyList) {
print_str("[");
print_joy_list(jv);
print_str("]");
} else if (type == joySymbol) {
char *str = ht_lookup(VALUE_OF(jv));
if (error != NO_ERROR)
return;
print_str(str);
}
}
void
print_joy_list(u32 list)
{
while (list) {
print_joy_value(head(list));
if (error != NO_ERROR)
return;
list = tail(list);
if (list) {
print_str(" ");
}
}
}
/*
And now for a hash table.
This table maps between hashes of symbol strings which are used in the
tagged pointers in Joy values and strings which are stored in the string
heap.
TODO: bool ht_has(char *str, u32 index, u32 length) to see if a fragment
of a string buffer is a symbol in the hash table.
FNV hash function.
https://benhoyt.com/writings/hash-table-in-c/#hash-tables
https://en.wikipedia.org/wiki/FowlerNollVo_hash_function
*/
#define FNV_OFFSET 0xcbf29ce484222325
#define FNV_PRIME 0x100000001b3
u64
hash_key(char* key)
{
u64 hash = FNV_OFFSET;
for (char* p = key; *p; ++p) {
hash = hash ^ (u64)(unsigned char)(*p);
hash = hash * FNV_PRIME;
}
return hash;
}
u64
hash_fragment(char *str, u32 index, u32 length)
{
u64 hash = FNV_OFFSET;
for (char* p = (str + index); length; --length, ++p) {
hash = hash ^ (u64)(unsigned char)(*p);
hash = hash * FNV_PRIME;
}
return hash;
}
// Capacity is a power of two (10 for now.)
#define EXPONENT 10
#define CAPACITY 1024
#define HASH_MASK 1023
// Note that there's no checking for filling the table and expanding.
// For now, I'm just going to use a "large enough" table and hope
// for the best. (We have thirty bits to work with so the obvious
// thing to do is make the exponent fifteen, half for the hash key
// and half for the increment.)
char* hash_table[CAPACITY];
u32
ht_insert(char *symbol)
{
u64 hash = hash_key(symbol);
u32 index = hash % CAPACITY;
char *candidate = hash_table[index];
if (!candidate) {
hash_table[index] = symbol;
return JOY_VALUE(joySymbol, VALUE_OF(hash));
}
// https://en.wikipedia.org/wiki/Double_hashing
// Rather than use another hash function I'm going to try
// using the extra bits of the same hash.
u32 increment = ((VALUE_OF(hash) >> EXPONENT) | 1) % CAPACITY;
// If I understand correctly, making the increment odd
// means it will traverse the whole (even-sized) table.
while (candidate) {
// Compare pointers then hashes (since we already have
// one hash I'm guessing that that's cheaper or at least
// no more expensive than string comparision.)
if (candidate == symbol || hash == hash_key(candidate))
break;
index = (index + increment) % CAPACITY;
candidate = hash_table[index];
}
if (!candidate) {
hash_table[index] = symbol;
}
return JOY_VALUE(joySymbol, VALUE_OF(hash));
}
char*
ht_lookup(u32 hash)
{
// Note that hash must be truncated to N (N=30 as it happens) bits
// by VALUE_OF().
u32 index = hash % CAPACITY;
char *candidate = hash_table[index];
u32 increment = ((hash >> EXPONENT) | 1) % CAPACITY;
while (candidate) {
if (hash == VALUE_OF(hash_key(candidate)))
return candidate;
index = (index + increment) % CAPACITY;
candidate = hash_table[index];
}
error = UNKNOWN_WORD_ERROR;
return 0;
}
u32
ht_has(char *str, u32 index, u32 length)
{
u32 hash = VALUE_OF(hash_fragment(str, index, length));
ht_lookup(hash);
if (UNKNOWN_WORD_ERROR == error) {
error = NO_ERROR;
return 0;
}
return hash;
}
/******************************************************************************/
u32
push_symbol(char *symbol, u32 stack)
{
return cons(JOY_VALUE(joySymbol, ht_insert(symbol)), stack);
}
u32
push_int(u32 n, u32 stack)
{
return cons(JOY_VALUE(joyInt, n), stack);
}
/******************************************************************************/
bool
is_integer(char *str, u32 index, u32 length)
{
for (;length; --length) {
char ch = *(str + index + length - 1);
if (!(ch == '0'
|| ch == '1'
|| ch == '2'
|| ch == '3'
|| ch == '4'
|| ch == '5'
|| ch == '6'
|| ch == '7'
|| ch == '8'
|| ch == '9'))
{
return 0;
}
}
return 1;
}
u32
convert_integer(char *str, u32 index, u32 length)
{
u32 result = 0;
length = length + index;
for (; index < length; ++index) {
char ch = *(str + index);
u8 digit = (u8)ch - (u8)'0';
result = result * 10 + digit;
}
//print_str("converted integer ");print_i64(result);print_endl();
return JOY_VALUE(joyInt, result);
}
/******************************************************************************/
/*
Parser
*/
u32
intern(char *str, u32 index, u32 length)
{
u32 symbol_hash = ht_has(str, index, length);
if (!symbol_hash) {
char *token = allocate_string(str, index, length);
if (error != NO_ERROR) {
//print_str("a. Error code: ");print_i64(error);print_endl();
return 0;
}
symbol_hash = ht_insert(token);
}
return JOY_VALUE(joySymbol, symbol_hash);
}
u32
tokenate(char *str, u32 index, u32 length)
{
if (4 == length
&& *(str + index) == 't'
&& *(str + index + 1) == 'r'
&& *(str + index + 2) == 'u'
&& *(str + index + 3) == 'e'
) {
//print_str("tokenate true");print_endl();
return JOY_VALUE(joyBool, 1);
}
if (5 == length
&& *(str + index) == 'f'
&& *(str + index + 1) == 'a'
&& *(str + index + 2) == 'l'
&& *(str + index + 3) == 's'
&& *(str + index + 4) == 'e'
) {
//print_str("tokenate false");print_endl();
return JOY_VALUE(joyBool, 0);
}
if (is_integer(str, index, length)) {
//print_str("tokenate integer");print_endl();
return convert_integer(str, index, length);
}
return intern(str, index, length);
}
int
is_delimiter(char ch)
{
return ch == '[' || ch == ']' || ch == ' ';
}
// Store in-progress lists. Here as in the hash table I'm not checking
// for capacity overload or anything like that. If you think you're going
// to parse more than a five hundred '[' chars then increase the size of
// this array.
u32 t2e_stack[1000];
u32 t2e_stack_top = 0;
#define T2E_PUSH(thing) t2e_stack[t2e_stack_top] = (thing); ++t2e_stack_top; (thing) = empty_list;
#define T2E_POP(thing) if (!t2e_stack_top) { error = EXTRA_CLOSING_BRACKET; return 0; }; --t2e_stack_top; (thing) = t2e_stack[t2e_stack_top];
u32
text_to_expression(char *str)
{
u32 index = 0;
u32 end = empty_list;
u32 top = empty_list;
u32 tok = empty_list;
u64 str_length = strlen(str); // TODO: rewrite so we don't iterate through the string twice.
while (index < str_length) {
char ch = str[index];
if (' ' == ch) {
++index;
continue;
}
if ('[' == ch) { // start new list
++index;
T2E_PUSH(end)
T2E_PUSH(top)
continue;
}
if (']' == ch) { // finish last new list
++index;
tok = top;
T2E_POP(top)
T2E_POP(end)
} else {
u32 i = index + 1;
for (; i < str_length && !is_delimiter(str[i]); ++i) {}
// i == str_length OR str[i] is a delimiter char.
tok = tokenate(str, index, i - index);
index = i;
}
u32 cell = cons(tok, empty_list);
if (end) tails[end] = cell;
if (!top) top = cell;
end = cell;
}
if (t2e_stack_top) {
error = MISSING_CLOSING_BRACKET;
return empty_list;
}
return top;
}
u32
joy(u32 stack, u32 expression)
{
u32 term;
while (expression) {
term = head(expression);
expression = tail(expression);
//print_joy_value(term);print_endl();
//print_i64(expression);print_endl();
if (TYPE_OF(term) == joySymbol) {
char *str = ht_lookup(VALUE_OF(term));
if (error != NO_ERROR)
return 0;
print_str(str);print_endl();
} else { // type == joyInt || type == joyBool || type == joyList
stack = cons(term, stack);
}
}
return stack;
}
void
main()
{
memset(hash_table, 0, sizeof(hash_table));
memset(string_heap, 0, sizeof(string_heap));
memset(t2e_stack, 0, sizeof(t2e_stack));
error = NO_ERROR;
// TODO: these should be global.
u32 joy_true = JOY_VALUE(joyBool, 1);
u32 joy_false = JOY_VALUE(joyBool, 0);
/*
u32 stack = empty_list;
stack = push_int(23, stack);
stack = cons(joy_true, stack);
stack = push_int(42, stack);
stack = push_symbol("cats", stack);
u32 el = empty_list;
el = push_int(48, el);
el = cons(el, el);
stack = cons(el, stack);
stack = cons(joy_false, stack);
stack = push_int(273, stack);
print_joy_list(stack);
print_endl();
*/
u32 expression = text_to_expression(" 1[2[true 3][aa[aa bb] aa bb cc]bob]false[]bob 3[4] ga[]ry");
print_joy_list(expression);
print_endl();
u32 stack = joy(empty_list, expression);
print_joy_list(stack);
print_endl();
}