From c8f67f02f95f6beff589636f393ef3a7a9bffaa4 Mon Sep 17 00:00:00 2001
From: Simon Forman
Date: Tue, 20 Sep 2022 16:29:35 -0700
Subject: [PATCH] Parser in GNU Prolog. Wonky handling of UTF_8 blankspace.

---
Reviewer notes (free text between the three-dash line and the diffstat is
not part of the commit message):

* This patch had lost its line breaks, so every "#" and "%" line comment ran
  on over the code that followed it.  Line boundaries are restored from the
  "+" markers and agree with the hunk headers (33 and 68 added lines).
* Runs of spaces inside lines were not recoverable; indentation and alignment
  below are conventional and may differ from the commit.  The blob ids on the
  "index" lines are copied unchanged and have not been re-verified.
* blanks.py prints one "blank --> [...]." clause per listed character, using
  that character's UTF-8 byte values; the blank//0 clauses in parser.prolog
  have the same form.
* parser.prolog: joy_lex//1 yields tok(Atom), lbracket and rbracket items and
  skips anything matched by blank//0.  The joy_parse//1 stage is present only
  as commented-out code.

 implementations/GNUProlog/blanks.py     | 33 ++++++++++++
 implementations/GNUProlog/parser.prolog | 68 +++++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 implementations/GNUProlog/blanks.py
 create mode 100644 implementations/GNUProlog/parser.prolog

diff --git a/implementations/GNUProlog/blanks.py b/implementations/GNUProlog/blanks.py
new file mode 100644
index 0000000..5c8de76
--- /dev/null
+++ b/implementations/GNUProlog/blanks.py
@@ -0,0 +1,33 @@
+
+
+# https://www.lesinskis.com/python-unicode-whitespace.html
+UNICODE_WHITESPACE_CHARACTERS = [
+    "\u0009",  # character tabulation
+    "\u000a",  # line feed
+    "\u000b",  # line tabulation
+    "\u000c",  # form feed
+    "\u000d",  # carriage return
+    "\u0020",  # space
+    "\u0085",  # next line
+    "\u00a0",  # no-break space
+    "\u1680",  # ogham space mark
+    "\u2000",  # en quad
+    "\u2001",  # em quad
+    "\u2002",  # en space
+    "\u2003",  # em space
+    "\u2004",  # three-per-em space
+    "\u2005",  # four-per-em space
+    "\u2006",  # six-per-em space
+    "\u2007",  # figure space
+    "\u2008",  # punctuation space
+    "\u2009",  # thin space
+    "\u200A",  # hair space
+    "\u2028",  # line separator
+    "\u2029",  # paragraph separator
+    "\u202f",  # narrow no-break space
+    "\u205f",  # medium mathematical space
+    "\u3000",  # ideographic space
+]
+
+for ch in UNICODE_WHITESPACE_CHARACTERS:
+    print(f'blank --> {list(ch.encode("utf_8"))}.')
diff --git a/implementations/GNUProlog/parser.prolog b/implementations/GNUProlog/parser.prolog
new file mode 100644
index 0000000..6ba902f
--- /dev/null
+++ b/implementations/GNUProlog/parser.prolog
@@ -0,0 +1,68 @@
+
+
+joy_lex([tok(Token)|Ls]) --> chars(TokenCodes), !, {atom_codes(Token, TokenCodes)}, joy_lex(Ls).
+joy_lex([ lbracket|Ls]) --> "[", !, joy_lex(Ls).
+joy_lex([ rbracket|Ls]) --> "]", !, joy_lex(Ls).
+
+joy_lex(Ls) --> blank, !, joy_lex(Ls).
+
+joy_lex([]) --> [].
+
+
+% Then parse the tokens converting them to Prolog values and building up
+% the list structures (if any.)
+
+%joy_parse([J|Js]) --> joy_term(J), !, joy_parse(Js).
+%joy_parse([]) --> [].
+%
+%joy_term(list(J)) --> [lbracket], !, joy_parse(J), [rbracket].
+%joy_term(Token) --> [tok(Codes)], {joy_token(Token, Codes)}.
+%
+%joy_token(int(I), Codes) :- number(I, Codes, []), !. % See dcg/basics.
+%joy_token(bool(true), `true`) :- !.
+%joy_token(bool(false), `false`) :- !.
+%joy_token(symbol(S), Codes) :- atom_codes(S, Codes).
+%
+%
+%text_to_expression(Text, Expression) :-
+%    phrase(joy_lex(Tokens), Text), !,
+%    phrase(joy_parse(Expression), Tokens).
+
+% Apologies for all the (green, I hope) cuts. The strength of the Joy
+% syntax is that it's uninteresting.
+
+chars([Ch|Rest]) --> char(Ch), chars(Rest).
+chars([Ch]) --> char(Ch).
+
+char(Ch) --> \+ blank, [Ch], { Ch \== 0'[, Ch \== 0'] }.
+
+
+blank --> [9].
+blank --> [10].
+blank --> [11].
+blank --> [12].
+blank --> [13].
+blank --> [32].
+blank --> [194, 133].
+blank --> [194, 160].
+blank --> [225, 154, 128].
+blank --> [226, 128, 128].
+blank --> [226, 128, 129].
+blank --> [226, 128, 130].
+blank --> [226, 128, 131].
+blank --> [226, 128, 132].
+blank --> [226, 128, 133].
+blank --> [226, 128, 134].
+blank --> [226, 128, 135].
+blank --> [226, 128, 136].
+blank --> [226, 128, 137].
+blank --> [226, 128, 138].
+blank --> [226, 128, 168].
+blank --> [226, 128, 169].
+blank --> [226, 128, 175].
+blank --> [226, 129, 159].
+blank --> [227, 128, 128].
+
+
+
+do :- phrase(joy_lex(Tokens), "23[15]").
\ No newline at end of file