Parser in GNU Prolog.

Wonky handling of UTF_8 blankspace.
2022-09-20 16:29:35 -07:00 · 2022-09-20 16:29:35 -07:00 · c8f67f02f9
parent 3f4e9d9fe2
commit c8f67f02f9
2 changed files with 101 additions and 0 deletions
--- a/implementations/GNUProlog/blanks.py
+++ b/implementations/GNUProlog/blanks.py
@ -0,0 +1,33 @@
+
+
+# https://www.lesinskis.com/python-unicode-whitespace.html
+UNICODE_WHITESPACE_CHARACTERS = [
+    "\u0009", # character tabulation
+    "\u000a", # line feed
+    "\u000b", # line tabulation
+    "\u000c", # form feed
+    "\u000d", # carriage return
+    "\u0020", # space
+    "\u0085", # next line
+    "\u00a0", # no-break space
+    "\u1680", # ogham space mark
+    "\u2000", # en quad
+    "\u2001", # em quad
+    "\u2002", # en space
+    "\u2003", # em space
+    "\u2004", # three-per-em space
+    "\u2005", # four-per-em space
+    "\u2006", # six-per-em space
+    "\u2007", # figure space
+    "\u2008", # punctuation space
+    "\u2009", # thin space
+    "\u200A", # hair space
+    "\u2028", # line separator
+    "\u2029", # paragraph separator
+    "\u202f", # narrow no-break space
+    "\u205f", # medium mathematical space
+    "\u3000", # ideographic space
+]
+
+for ch in UNICODE_WHITESPACE_CHARACTERS:
+    print(f'blank --> {list(ch.encode("utf_8"))}.')
--- a/implementations/GNUProlog/parser.prolog
+++ b/implementations/GNUProlog/parser.prolog
@ -0,0 +1,68 @@
+
+
+% joy_lex(Tokens)// splits a list of codes into a list of tokens:
+%   tok(Atom)  for a run of codes accepted by chars//1,
+%   lbracket   for "[",
+%   rbracket   for "]".
+% Blanks produce no token.  Each clause commits with a cut as soon as its
+% leading item has been recognised.
+joy_lex([tok(Atom)|Tokens]) -->
+    chars(Codes), !,
+    { atom_codes(Atom, Codes) },
+    joy_lex(Tokens).
+joy_lex([lbracket|Tokens]) --> "[", !, joy_lex(Tokens).
+joy_lex([rbracket|Tokens]) --> "]", !, joy_lex(Tokens).
+
+% Skip a blank without emitting anything.
+joy_lex(Tokens) --> blank, !, joy_lex(Tokens).
+
+% Base case: consume nothing, produce no tokens.
+joy_lex([]) --> [].
+
+
+% Then parse the tokens, converting them to Prolog values and building up
+% the list structures (if any).
+
+%joy_parse([J|Js]) --> joy_term(J), !, joy_parse(Js).
+%joy_parse([]) --> [].
+%
+%joy_term(list(J)) --> [lbracket], !, joy_parse(J), [rbracket].
+%joy_term(Token) --> [tok(Codes)], {joy_token(Token, Codes)}.
+%
+%joy_token(int(I), Codes) :- number(I, Codes, []), !.  % See dcg/basics.
+%joy_token(bool(true), `true`) :- !.
+%joy_token(bool(false), `false`) :- !.
+%joy_token(symbol(S), Codes) :- atom_codes(S, Codes).
+%
+%
+%text_to_expression(Text, Expression) :-
+%    phrase(joy_lex(Tokens), Text), !,
+%    phrase(joy_parse(Expression), Tokens).
+
+% Apologies for all the (green, I hope) cuts.  The strength of the Joy
+% syntax is that it's uninteresting.
+
+% chars(Codes)// matches one or more consecutive codes accepted by char//1.
+% The recursive clause comes first, so the first solution is the longest run.
+chars([Code|Codes]) -->
+    char(Code),
+    chars(Codes).
+chars([Code]) -->
+    char(Code).
+
+% char(Code)// consumes one code, provided no blank//0 sequence starts at
+% the current position and the code is not a square bracket.
+char(Code) -->
+    \+ blank,
+    [Code],
+    { Code \== 0'[, Code \== 0'] }.
+
+
+% blank//0 matches a single Unicode whitespace character, written as the
+% sequence of its UTF-8 byte values.  The clauses have the form printed by
+% the loop in blanks.py; each is annotated with its code point and name.
+blank --> [9].                  % U+0009 character tabulation
+blank --> [10].                 % U+000A line feed
+blank --> [11].                 % U+000B line tabulation
+blank --> [12].                 % U+000C form feed
+blank --> [13].                 % U+000D carriage return
+blank --> [32].                 % U+0020 space
+blank --> [194, 133].           % U+0085 next line
+blank --> [194, 160].           % U+00A0 no-break space
+blank --> [225, 154, 128].      % U+1680 ogham space mark
+blank --> [226, 128, 128].      % U+2000 en quad
+blank --> [226, 128, 129].      % U+2001 em quad
+blank --> [226, 128, 130].      % U+2002 en space
+blank --> [226, 128, 131].      % U+2003 em space
+blank --> [226, 128, 132].      % U+2004 three-per-em space
+blank --> [226, 128, 133].      % U+2005 four-per-em space
+blank --> [226, 128, 134].      % U+2006 six-per-em space
+blank --> [226, 128, 135].      % U+2007 figure space
+blank --> [226, 128, 136].      % U+2008 punctuation space
+blank --> [226, 128, 137].      % U+2009 thin space
+blank --> [226, 128, 138].      % U+200A hair space
+blank --> [226, 128, 168].      % U+2028 line separator
+blank --> [226, 128, 169].      % U+2029 paragraph separator
+blank --> [226, 128, 175].      % U+202F narrow no-break space
+blank --> [226, 129, 159].      % U+205F medium mathematical space
+blank --> [227, 128, 128].      % U+3000 ideographic space
+
+
+
+% do/0: smoke test.  Lexes a small sample input and prints the resulting
+% token list.  Printing also puts Tokens to use; left unused it would be
+% a singleton variable and the lexing result would be thrown away.
+do :-
+    phrase(joy_lex(Tokens), "23[15]"),
+    writeq(Tokens), nl.