Parser in GNU Prolog.

Wonky handling of UTF_8 blankspace.
2022-09-20 16:29:35 -07:00 · 2022-09-20 16:29:35 -07:00 · c8f67f02f9
parent 3f4e9d9fe2
commit c8f67f02f9
2 changed files with 101 additions and 0 deletions
--- a/implementations/GNUProlog/blanks.py
+++ b/implementations/GNUProlog/blanks.py
@ -0,0 +1,33 @@
 # Generator script: prints one `blank --> [...].` grammar rule per Unicode
 # whitespace character, with the character written as its UTF-8 byte values.
 #
 # Character list source:
 # https://www.lesinskis.com/python-unicode-whitespace.html
 UNICODE_WHITESPACE_CHARACTERS = [
     # Single-byte (ASCII) whitespace.
     "\u0009", # character tabulation
     "\u000a", # line feed
     "\u000b", # line tabulation
     "\u000c", # form feed
     "\u000d", # carriage return
     "\u0020", # space
     # Two-byte UTF-8 sequences.
     "\u0085", # next line
     "\u00a0", # no-break space
     # Three-byte UTF-8 sequences.
     "\u1680", # ogham space mark
     "\u2000", # en quad
     "\u2001", # em quad
     "\u2002", # en space
     "\u2003", # em space
     "\u2004", # three-per-em space
     "\u2005", # four-per-em space
     "\u2006", # six-per-em space
     "\u2007", # figure space
     "\u2008", # punctuation space
     "\u2009", # thin space
     "\u200A", # hair space
     "\u2028", # line separator
     "\u2029", # paragraph separator
     "\u202f", # narrow no-break space
     "\u205f", # medium mathematical space
     "\u3000", # ideographic space
 ]
 for whitespace_char in UNICODE_WHITESPACE_CHARACTERS:
     # list() over a bytes object yields the integer byte values; its default
     # string form ("[194, 133]") is exactly the list syntax the rule needs.
     utf8_bytes = list(whitespace_char.encode("utf_8"))
     print("blank --> {}.".format(utf8_bytes))
--- a/implementations/GNUProlog/parser.prolog
+++ b/implementations/GNUProlog/parser.prolog
@ -0,0 +1,68 @@
 % joy_lex(?Tokens)//
 % Lexer: turns a list of character codes into a flat list of tokens.
 %   tok(Atom)  - a maximal run of chars//1 codes, converted to an atom
 %   lbracket   - the code for "["
 %   rbracket   - the code for "]"
 % Input matched by blank//0 is consumed without producing a token.
 % Each cut commits to a clause as soon as its leading item has matched.
 joy_lex([tok(Atom)|Tokens]) -->
     chars(Codes), !,
     { atom_codes(Atom, Codes) },
     joy_lex(Tokens).
 joy_lex([lbracket|Tokens]) -->
     "[", !,
     joy_lex(Tokens).
 joy_lex([rbracket|Tokens]) -->
     "]", !,
     joy_lex(Tokens).
 joy_lex(Tokens) -->
     blank, !,
     joy_lex(Tokens).
 joy_lex([]) -->
     [].
 % Then parse the tokens, converting them to Prolog values and building up
 % the list structures (if any).
 %joy_parse([J|Js]) --> joy_term(J), !, joy_parse(Js).
 %joy_parse([]) --> [].
 %
 %joy_term(list(J)) --> [lbracket], !, joy_parse(J), [rbracket].
 %joy_term(Token) --> [tok(Codes)], {joy_token(Token, Codes)}.
 %
 %joy_token(int(I), Codes) :- number(I, Codes, []), !.  % See dcg/basics.
 %joy_token(bool(true), `true`) :- !.
 %joy_token(bool(false), `false`) :- !.
 %joy_token(symbol(S), Codes) :- atom_codes(S, Codes).
 %
 %
 %text_to_expression(Text, Expression) :-
 %    phrase(joy_lex(Tokens), Text), !,
 %    phrase(joy_parse(Expression), Tokens).
 % Apologies for all the (green, I hope) cuts.  The strength of the Joy
 % syntax is that it's uninteresting.
 % chars(?Codes)//
 % One or more consecutive codes accepted by char//1.  The recursive
 % alternative is tried first, so the first solution is the longest run;
 % shorter runs are only produced on backtracking.
 chars([Code|Codes]) -->
     char(Code),
     (   chars(Codes)
     ;   { Codes = [] }
     ).
 % char(?Code)//
 % Consumes a single code that can be part of a token: the input at this
 % point must not start with a blank//0 sequence (checked by negative
 % lookahead, which consumes nothing), and the code itself must be neither
 % "[" nor "]".
 char(Code) -->
     \+ blank,
     [Code],
     { \+ ( Code == 0'[ ; Code == 0'] ) }.
 % blank//0
 % Consumes exactly one whitespace character, expressed as the byte
 % sequence of its UTF-8 encoding (one to three bytes).
 blank --> { blank_utf8_bytes(Bytes) }, blank_literal(Bytes).
 %
 % blank_literal(+Bytes)//
 % Matches the given list of bytes verbatim.
 blank_literal([]) --> [].
 blank_literal([Byte|Bytes]) --> [Byte], blank_literal(Bytes).
 %
 % blank_utf8_bytes(?Bytes)
 % Table of the byte sequences accepted by blank//0, one fact per character.
 blank_utf8_bytes([9]).               % U+0009 character tabulation
 blank_utf8_bytes([10]).              % U+000A line feed
 blank_utf8_bytes([11]).              % U+000B line tabulation
 blank_utf8_bytes([12]).              % U+000C form feed
 blank_utf8_bytes([13]).              % U+000D carriage return
 blank_utf8_bytes([32]).              % U+0020 space
 blank_utf8_bytes([194, 133]).        % U+0085 next line
 blank_utf8_bytes([194, 160]).        % U+00A0 no-break space
 blank_utf8_bytes([225, 154, 128]).   % U+1680 ogham space mark
 blank_utf8_bytes([226, 128, 128]).   % U+2000 en quad
 blank_utf8_bytes([226, 128, 129]).   % U+2001 em quad
 blank_utf8_bytes([226, 128, 130]).   % U+2002 en space
 blank_utf8_bytes([226, 128, 131]).   % U+2003 em space
 blank_utf8_bytes([226, 128, 132]).   % U+2004 three-per-em space
 blank_utf8_bytes([226, 128, 133]).   % U+2005 four-per-em space
 blank_utf8_bytes([226, 128, 134]).   % U+2006 six-per-em space
 blank_utf8_bytes([226, 128, 135]).   % U+2007 figure space
 blank_utf8_bytes([226, 128, 136]).   % U+2008 punctuation space
 blank_utf8_bytes([226, 128, 137]).   % U+2009 thin space
 blank_utf8_bytes([226, 128, 138]).   % U+200A hair space
 blank_utf8_bytes([226, 128, 168]).   % U+2028 line separator
 blank_utf8_bytes([226, 128, 169]).   % U+2029 paragraph separator
 blank_utf8_bytes([226, 128, 175]).   % U+202F narrow no-break space
 blank_utf8_bytes([226, 129, 159]).   % U+205F medium mathematical space
 blank_utf8_bytes([227, 128, 128]).   % U+3000 ideographic space
 % do/0
 % Smoke test: succeeds if joy_lex//1 can lex the sample text "23[15]".
 % The resulting token list is discarded; the variable is written with a
 % leading underscore to mark it as intentionally unused, which avoids a
 % singleton-variable warning when the file is consulted.
 do :- phrase(joy_lex(_Tokens), "23[15]").