Fold parser code into joy_types.c
Add some docs, minor cleanup.
This commit is contained in:
parent
87aef6f06d
commit
0a9cdba456
|
|
@ -1,37 +1,76 @@
|
|||
// Copyright © 2023 Simon Forman
|
||||
//
|
||||
// This file is part of Thun
|
||||
//
|
||||
// Thun is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// Thun is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Thun. If not see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
#include <uvm/syscalls.h>
|
||||
// In the Thun dialect of Joy we have four types of values:
|
||||
// Integers, Booleans, Symbols, and Lists.
|
||||
// We don't have Unions, Enums, or Typedefs.
|
||||
//
|
||||
// So how do we represent Joy types?
|
||||
//
|
||||
// In SICP they use a pair of arrays of pointers, one for heads and one
|
||||
// for tails.
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
██████╗ ██████╗ ███╗ ██╗███████╗ ██╗ ██╗███████╗ █████╗ ██████╗
|
||||
██╔════╝██╔═══██╗████╗ ██║██╔════╝ ██║ ██║██╔════╝██╔══██╗██╔══██╗
|
||||
██║ ██║ ██║██╔██╗ ██║███████╗ ███████║█████╗ ███████║██████╔╝
|
||||
██║ ██║ ██║██║╚██╗██║╚════██║ ██╔══██║██╔══╝ ██╔══██║██╔═══╝
|
||||
╚██████╗╚██████╔╝██║ ╚████║███████║ ██║ ██║███████╗██║ ██║██║
|
||||
╚═════╝ ╚═════╝ ╚═╝ ╚═══╝╚══════╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═╝
|
||||
Cons Heap
|
||||
|
||||
We don't have Unions, Enums, or Typedefs. So how do we represent Joy types?
|
||||
In SICP they use a pair of arrays of pointers, one for heads and one
|
||||
for tails.
|
||||
|
||||
> A pointer to a pair is an index into the two vectors.
|
||||
|
||||
*/
|
||||
|
||||
#define HEAP_SIZE 1024
|
||||
|
||||
u32 heads[HEAP_SIZE];
|
||||
u32 tails[HEAP_SIZE];
|
||||
|
||||
// > A pointer to a pair is an index into the two vectors.
|
||||
|
||||
u32 free = 0;
|
||||
// cell 0 is reserved so that 0 can be the empty list.
|
||||
u32 free = 1;
|
||||
|
||||
// > We also need a representation for objects other than pairs (such as
|
||||
// numbers and symbols) and a way to distinguish one kind of data from
|
||||
// another. There are many methods of accomplishing this, but they all
|
||||
// reduce to using typed pointers, that is, to extending the notion of
|
||||
// ``pointer'' to include information on data type.
|
||||
// > numbers and symbols) and a way to distinguish one kind of data from
|
||||
// > another. There are many methods of accomplishing this, but they all
|
||||
// > reduce to using typed pointers, that is, to extending the notion of
|
||||
// > ``pointer'' to include information on data type.
|
||||
|
||||
// Let's use a u32 with the two MSBs as the type tag.
|
||||
|
||||
// Tagged-pointer helpers: the two most significant bits of a 32-bit word
// hold the type tag and the low 30 bits hold the payload.
//
// Every argument and every expansion is parenthesized so the macros can be
// used inside larger expressions (e.g. `x == JOY_VALUE(t, v)`) without
// precedence surprises.  The unsigned constants keep the tag arithmetic out
// of signed int: tags 2 and 3 shifted left by 30 do not fit in a signed int.
#define TYPE_OF(pointer) ((pointer) >> 30)
#define VALUE_OF(pointer) ((pointer) & 0x3fffffffu)
#define JOY_VALUE(type, value) ((((type) & 3u) << 30) | ((value) & 0x3fffffffu))
|
||||
|
||||
u8 joyInt = 0;
|
||||
u8 joyList = 1;
|
||||
/*
|
||||
This means that our ints are restricted to 30 bits for now, until
|
||||
I implement bignums.
|
||||
|
||||
|
||||
In the Thun dialect of Joy we have four types of values:
|
||||
|
||||
Integers, Booleans, Symbols, and Lists.
|
||||
*/
|
||||
u8 joyList = 0;
|
||||
u8 joyInt = 1;
|
||||
u8 joySymbol = 2;
|
||||
u8 joyBool = 3;
|
||||
|
||||
// Because the type tag for lists is 0 the empty list is just 0;
|
||||
u32 empty_list = 0;
|
||||
|
||||
u32
|
||||
|
|
@ -51,6 +90,49 @@ u32 head(u32 list) { return heads[VALUE_OF(list)]; }
|
|||
u32 tail(u32 list) { return tails[VALUE_OF(list)]; }
|
||||
|
||||
|
||||
|
||||
/*
|
||||
███████╗████████╗██████╗ ██╗███╗ ██╗ ██████╗
|
||||
██╔════╝╚══██╔══╝██╔══██╗██║████╗ ██║██╔════╝
|
||||
███████╗ ██║ ██████╔╝██║██╔██╗ ██║██║ ███╗
|
||||
╚════██║ ██║ ██╔══██╗██║██║╚██╗██║██║ ██║
|
||||
███████║ ██║ ██║ ██║██║██║ ╚████║╚██████╔╝
|
||||
╚══════╝ ╚═╝ ╚═╝ ╚═╝╚═╝╚═╝ ╚═══╝ ╚═════╝
|
||||
|
||||
██╗ ██╗███████╗ █████╗ ██████╗
|
||||
██║ ██║██╔════╝██╔══██╗██╔══██╗
|
||||
███████║█████╗ ███████║██████╔╝
|
||||
██╔══██║██╔══╝ ██╔══██║██╔═══╝
|
||||
██║ ██║███████╗██║ ██║██║
|
||||
╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═╝
|
||||
Simple string storage heap.
|
||||
|
||||
We need a place to keep symbol strings.
|
||||
|
||||
*/
|
||||
|
||||
#define STRING_HEAP_SIZE 100000
|
||||
|
||||
char string_heap[STRING_HEAP_SIZE];
|
||||
u32 string_heap_top = 0;
|
||||
|
||||
char*
|
||||
allocate_string(char *buffer, u32 offset, u32 length)
|
||||
{
|
||||
u64 end = string_heap_top + length + 1;
|
||||
if (end >= STRING_HEAP_SIZE)
|
||||
return 0;
|
||||
memcpy(string_heap + string_heap_top, buffer + offset, length);
|
||||
string_heap[end] = '\0';
|
||||
u32 new_string = string_heap_top;
|
||||
string_heap_top = (u32)end + 1;
|
||||
//print_str("allocating ");print_str(string_heap + new_string);print_endl();
|
||||
return string_heap + new_string;
|
||||
}
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
// No setjmp/longjmp, so let's have a global error value and check it after ops.
|
||||
u64 error;
|
||||
|
||||
|
|
@ -58,6 +140,19 @@ u64 error;
|
|||
#define UNKNOWN_WORD_ERROR 1
|
||||
#define MISSING_CLOSING_BRACKET 2
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
/*
|
||||
██████╗ ██████╗ ██╗███╗ ██╗████████╗███████╗██████╗
|
||||
██╔══██╗██╔══██╗██║████╗ ██║╚══██╔══╝██╔════╝██╔══██╗
|
||||
██████╔╝██████╔╝██║██╔██╗ ██║ ██║ █████╗ ██████╔╝
|
||||
██╔═══╝ ██╔══██╗██║██║╚██╗██║ ██║ ██╔══╝ ██╔══██╗
|
||||
██║ ██║ ██║██║██║ ╚████║ ██║ ███████╗██║ ██║
|
||||
╚═╝ ╚═╝ ╚═╝╚═╝╚═╝ ╚═══╝ ╚═╝ ╚══════╝╚═╝ ╚═╝
|
||||
Printer
|
||||
*/
|
||||
|
||||
void
|
||||
print_joy_value(u32 jv)
|
||||
{
|
||||
|
|
@ -93,9 +188,32 @@ print_joy_list(u32 list)
|
|||
}
|
||||
|
||||
|
||||
// And now for a hash table.
|
||||
// https://benhoyt.com/writings/hash-table-in-c/#hash-tables
|
||||
// https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function
|
||||
/*
|
||||
██╗ ██╗ █████╗ ███████╗██╗ ██╗
|
||||
██║ ██║██╔══██╗██╔════╝██║ ██║
|
||||
███████║███████║███████╗███████║
|
||||
██╔══██║██╔══██║╚════██║██╔══██║
|
||||
██║ ██║██║ ██║███████║██║ ██║
|
||||
╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝
|
||||
|
||||
████████╗ █████╗ ██████╗ ██╗ ███████╗
|
||||
╚══██╔══╝██╔══██╗██╔══██╗██║ ██╔════╝
|
||||
██║ ███████║██████╔╝██║ █████╗
|
||||
██║ ██╔══██║██╔══██╗██║ ██╔══╝
|
||||
██║ ██║ ██║██████╔╝███████╗███████╗
|
||||
╚═╝ ╚═╝ ╚═╝╚═════╝ ╚══════╝╚══════╝
|
||||
And now for a hash table.
|
||||
|
||||
This table maps the hashes of symbol strings (the values stored in the tagged pointers of Joy symbols)
|
||||
to the strings themselves, which are stored in the string heap.
|
||||
|
||||
|
||||
FNV hash function.
|
||||
|
||||
https://benhoyt.com/writings/hash-table-in-c/#hash-tables
|
||||
https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function
|
||||
|
||||
*/
|
||||
|
||||
#define FNV_OFFSET 0xcbf29ce484222325
|
||||
#define FNV_PRIME 0x100000001b3
|
||||
|
|
@ -154,10 +272,10 @@ ht_insert(char *symbol)
|
|||
char*
|
||||
ht_lookup(u32 hash)
|
||||
{
|
||||
// Note that hash will be truncated to N (N=30 as it happens) bits
|
||||
// by VALUE_OF().
|
||||
u32 index = hash % CAPACITY;
|
||||
char *candidate = hash_table[index];
|
||||
// Note that hash will be truncated to N (N=30 as it happens) bits
|
||||
// by VALUE_OF().
|
||||
u32 increment = ((hash >> EXPONENT) | 1) % CAPACITY;
|
||||
while (candidate) {
|
||||
if (hash == VALUE_OF(hash_key(candidate)))
|
||||
|
|
@ -169,42 +287,245 @@ ht_lookup(u32 hash)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
u32
|
||||
push_symbol(char *symbol, u32 stack)
|
||||
{
|
||||
return cons(JOY_VALUE(joySymbol, ht_insert(symbol)), stack);
|
||||
}
|
||||
u32
|
||||
push_int(u32 n, u32 stack)
|
||||
{
|
||||
return cons(JOY_VALUE(joyInt, n), stack);
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
bool
|
||||
is_integer(char *str, u32 index, u32 length)
|
||||
{
|
||||
for (;length; --length) {
|
||||
char ch = *(str + index + length - 1);
|
||||
if (!(ch == '0'
|
||||
|| ch == '1'
|
||||
|| ch == '2'
|
||||
|| ch == '3'
|
||||
|| ch == '4'
|
||||
|| ch == '5'
|
||||
|| ch == '6'
|
||||
|| ch == '7'
|
||||
|| ch == '8'
|
||||
|| ch == '9'))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
u32
|
||||
convert_integer(char *str, u32 index, u32 length)
|
||||
{
|
||||
u32 result = 0;
|
||||
length = length + index;
|
||||
for (; index < length; ++index) {
|
||||
char ch = *(str + index);
|
||||
u8 digit = (u8)ch - (u8)'0';
|
||||
result = result * 10 + digit;
|
||||
}
|
||||
//print_str("converted integer ");print_i64(result);print_endl();
|
||||
return JOY_VALUE(joyInt, result);
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
|
||||
████████╗ ██████╗ ██╗ ██╗███████╗███╗ ██╗██╗███████╗███████╗██████╗
|
||||
╚══██╔══╝██╔═══██╗██║ ██╔╝██╔════╝████╗ ██║██║╚══███╔╝██╔════╝██╔══██╗
|
||||
██║ ██║ ██║█████╔╝ █████╗ ██╔██╗ ██║██║ ███╔╝ █████╗ ██████╔╝
|
||||
██║ ██║ ██║██╔═██╗ ██╔══╝ ██║╚██╗██║██║ ███╔╝ ██╔══╝ ██╔══██╗
|
||||
██║ ╚██████╔╝██║ ██╗███████╗██║ ╚████║██║███████╗███████╗██║ ██║
|
||||
╚═╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝╚═╝╚══════╝╚══════╝╚═╝ ╚═╝
|
||||
Tokenizer
|
||||
|
||||
*/
|
||||
|
||||
char* LEFT_BRACKET_symbol = "[";
|
||||
char* RIGHT_BRACKET_symbol = "]";
|
||||
// Filled in in main().
|
||||
u32 LEFT_BRACKET;
|
||||
u32 RIGHT_BRACKET;
|
||||
|
||||
|
||||
u32
|
||||
tokenate(char *str, u32 index, u32 length)
|
||||
{
|
||||
if (4 == length
|
||||
&& *(str + index) == 't'
|
||||
&& *(str + index + 1) == 'r'
|
||||
&& *(str + index + 2) == 'u'
|
||||
&& *(str + index + 3) == 'e'
|
||||
) {
|
||||
//print_str("tokenate true");print_endl();
|
||||
return JOY_VALUE(joyBool, 1);
|
||||
}
|
||||
if (5 == length
|
||||
&& *(str + index) == 'f'
|
||||
&& *(str + index + 1) == 'a'
|
||||
&& *(str + index + 2) == 'l'
|
||||
&& *(str + index + 3) == 's'
|
||||
&& *(str + index + 4) == 'e'
|
||||
) {
|
||||
//print_str("tokenate false");print_endl();
|
||||
return JOY_VALUE(joyBool, 0);
|
||||
}
|
||||
if (is_integer(str, index, length)) {
|
||||
//print_str("tokenate integer");print_endl();
|
||||
return convert_integer(str, index, length);
|
||||
}
|
||||
// TODO: Use ht_insert to avoid multiple allocations of the same string!
|
||||
char *token = allocate_string(str, index, length);
|
||||
if (!token)
|
||||
return 0; // OOM
|
||||
return JOY_VALUE(joySymbol, ht_insert(token));
|
||||
}
|
||||
|
||||
|
||||
u32
|
||||
tokenize0(char *str, u32 str_length, u32 index, u32 acc)
|
||||
{
|
||||
if (index >= str_length) {
|
||||
//print_i64(index);print_str(" : ");print_str("END tokenize");print_endl();
|
||||
//print_i64(acc);print_str("<");print_endl();
|
||||
return acc;
|
||||
}
|
||||
//print_i64(index);print_str(" : ");print_str(str + index);print_endl();
|
||||
char ch = str[index];
|
||||
if ('[' == ch) {
|
||||
acc = cons(LEFT_BRACKET, tokenize0(str, str_length, index + 1, acc));
|
||||
//print_i64(acc);print_str("<[");print_endl();
|
||||
return acc;
|
||||
}
|
||||
if (']' == ch) {
|
||||
acc = cons(RIGHT_BRACKET, tokenize0(str, str_length, index + 1, acc));
|
||||
//print_i64(acc);print_str("<]");print_endl();
|
||||
return acc;
|
||||
}
|
||||
if (' ' == ch) {
|
||||
return tokenize0(str, str_length, index + 1, acc);
|
||||
}
|
||||
u32 i = index + 1;
|
||||
for (; i < str_length; ++i) {
|
||||
if (str[i] == '[' || str[i] == ']' || str[i] == ' ') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// i == str_length OR str[i] is a delimiter char.
|
||||
return cons(tokenate(str, index, i - index), tokenize0(str, str_length, i, acc));
|
||||
|
||||
}
|
||||
|
||||
|
||||
u32
|
||||
tokenize(char *str)
|
||||
{
|
||||
return tokenize0(str, strlen(str), 0, empty_list);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
██████╗ █████╗ ██████╗ ███████╗███████╗██████╗
|
||||
██╔══██╗██╔══██╗██╔══██╗██╔════╝██╔════╝██╔══██╗
|
||||
██████╔╝███████║██████╔╝███████╗█████╗ ██████╔╝
|
||||
██╔═══╝ ██╔══██║██╔══██╗╚════██║██╔══╝ ██╔══██╗
|
||||
██║ ██║ ██║██║ ██║███████║███████╗██║ ██║
|
||||
╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚══════╝╚═╝ ╚═╝
|
||||
Parser
|
||||
|
||||
*/
|
||||
|
||||
u32
|
||||
_reverse_list_in_place(u32 el, u32 end)
|
||||
{
|
||||
u32 t = tail(el);
|
||||
tails[el] = end;
|
||||
return t ? _reverse_list_in_place(t, el) : el;
|
||||
}
|
||||
|
||||
u32
|
||||
reverse_list_in_place(u32 el)
|
||||
{
|
||||
return el ? _reverse_list_in_place(el, empty_list) : el;
|
||||
}
|
||||
|
||||
u32 t2e_stack[1000];
|
||||
u32 t2e_stack_top = 0;
|
||||
|
||||
u32
|
||||
text_to_expression(char *str)
|
||||
{
|
||||
u32 frame = empty_list;
|
||||
u32 tokens = tokenize(str);
|
||||
//print_str("tokens: "); print_joy_list(tokens); print_endl();
|
||||
//return tokens;
|
||||
while (tokens) {
|
||||
u32 tok = head(tokens);
|
||||
tokens = tail(tokens);
|
||||
if (LEFT_BRACKET == tok) {
|
||||
//print_str("left bracket");print_endl();
|
||||
t2e_stack[t2e_stack_top] = frame;
|
||||
++t2e_stack_top;
|
||||
frame = empty_list;
|
||||
continue;
|
||||
}
|
||||
if (RIGHT_BRACKET == tok) {
|
||||
//print_str("right bracket");print_endl();
|
||||
tok = reverse_list_in_place(frame);
|
||||
//print_str("new list: "); print_joy_list(tok); print_endl();
|
||||
--t2e_stack_top;
|
||||
frame = t2e_stack[t2e_stack_top];
|
||||
}
|
||||
frame = cons(tok, frame);
|
||||
//print_str("t2e frame: "); print_joy_list(frame); print_endl();
|
||||
}
|
||||
return reverse_list_in_place(frame);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
main()
|
||||
{
|
||||
LEFT_BRACKET = JOY_VALUE(joySymbol, ht_insert(LEFT_BRACKET_symbol));
|
||||
RIGHT_BRACKET = JOY_VALUE(joySymbol, ht_insert(RIGHT_BRACKET_symbol));
|
||||
// TODO: these should be global.
|
||||
u32 joy_true = JOY_VALUE(joyBool, 1);
|
||||
u32 joy_false = JOY_VALUE(joyBool, 0);
|
||||
|
||||
memset(hash_table, 0, sizeof(hash_table));
|
||||
memset(string_heap, 0, sizeof(string_heap));
|
||||
memset(t2e_stack, 0, sizeof(t2e_stack));
|
||||
error = NO_ERROR;
|
||||
|
||||
|
||||
/*
|
||||
u32 stack = empty_list;
|
||||
|
||||
stack = cons(23, stack);
|
||||
stack = push_int(23, stack);
|
||||
stack = cons(joy_true, stack);
|
||||
stack = cons(42, stack);
|
||||
|
||||
stack = push_int(42, stack);
|
||||
stack = push_symbol("cats", stack);
|
||||
|
||||
u32 el = empty_list;
|
||||
|
||||
el = cons(48, el);
|
||||
el = push_int(48, el);
|
||||
el = cons(el, el);
|
||||
stack = cons(el, stack);
|
||||
|
||||
stack = cons(joy_false, stack);
|
||||
stack = cons(273, stack);
|
||||
|
||||
stack = push_int(273, stack);
|
||||
print_joy_list(stack);
|
||||
print_endl();
|
||||
*/
|
||||
|
||||
print_joy_list(text_to_expression(" 1[2[true 3][[]]bob]false[]bob 3[4]5"));
|
||||
print_endl();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue