Parser in GNU Prolog.

Wonky handling of UTF-8 whitespace.
This commit is contained in:
Simon Forman 2022-09-20 16:29:35 -07:00
parent 3f4e9d9fe2
commit c8f67f02f9
2 changed files with 101 additions and 0 deletions

View File

@ -0,0 +1,33 @@
# Generator for the `blank` DCG rules used by the Prolog lexer: for every
# Unicode whitespace character, print one rule whose body is the list of
# that character's UTF-8 byte values.
#
# Character list taken from:
# https://www.lesinskis.com/python-unicode-whitespace.html
UNICODE_WHITESPACE_CHARACTERS = [
    "\u0009",  # character tabulation
    "\u000a",  # line feed
    "\u000b",  # line tabulation
    "\u000c",  # form feed
    "\u000d",  # carriage return
    "\u0020",  # space
    "\u0085",  # next line
    "\u00a0",  # no-break space
    "\u1680",  # ogham space mark
    "\u2000",  # en quad
    "\u2001",  # em quad
    "\u2002",  # en space
    "\u2003",  # em space
    "\u2004",  # three-per-em space
    "\u2005",  # four-per-em space
    "\u2006",  # six-per-em space
    "\u2007",  # figure space
    "\u2008",  # punctuation space
    "\u2009",  # thin space
    "\u200A",  # hair space
    "\u2028",  # line separator
    "\u2029",  # paragraph separator
    "\u202f",  # narrow no-break space
    "\u205f",  # medium mathematical space
    "\u3000",  # ideographic space
]


def blank_rule(ch):
    """Return the text of one `blank` DCG rule for the character ``ch``.

    The rule body is the list of integer byte values of ``ch`` encoded as
    UTF-8, e.g. ``blank --> [194, 160].`` for U+00A0.
    """
    return f'blank --> {list(ch.encode("utf_8"))}.'


# Emit one rule per character, in list order.  The loop body must be
# indented; at column 0 the script does not even parse.
for ch in UNICODE_WHITESPACE_CHARACTERS:
    print(blank_rule(ch))

View File

@ -0,0 +1,68 @@
% joy_lex(-Tokens)//
%
% Lexer: consume a list of codes and produce a flat token list.
%   tok(Atom)  - the run of codes accepted by chars//1, converted to an
%                atom with atom_codes/2
%   lbracket   - the code for "["
%   rbracket   - the code for "]"
% Anything matched by blank//0 is skipped and produces no token.
%
% Every alternative commits with a cut right after its leading item has
% matched, so the lexer never revisits an earlier decision.
joy_lex([tok(Word)|Tokens]) -->
    chars(WordCodes), !,
    { atom_codes(Word, WordCodes) },
    joy_lex(Tokens).
joy_lex([lbracket|Tokens]) -->
    "[", !,
    joy_lex(Tokens).
joy_lex([rbracket|Tokens]) -->
    "]", !,
    joy_lex(Tokens).
joy_lex(Tokens) -->
    blank, !,
    joy_lex(Tokens).
joy_lex([]) -->
    [].
% Then parse the tokens, converting them to Prolog values and building up
% the list structures (if any).  NOTE: the parser code below is currently
% commented out.
%joy_parse([J|Js]) --> joy_term(J), !, joy_parse(Js).
%joy_parse([]) --> [].
%
%joy_term(list(J)) --> [lbracket], !, joy_parse(J), [rbracket].
%joy_term(Token) --> [tok(Codes)], {joy_token(Token, Codes)}.
%
%joy_token(int(I), Codes) :- number(I, Codes, []), !. % See dcg/basics.
%joy_token(bool(true), `true`) :- !.
%joy_token(bool(false), `false`) :- !.
%joy_token(symbol(S), Codes) :- atom_codes(S, Codes).
%
%
%text_to_expression(Text, Expression) :-
% phrase(joy_lex(Tokens), Text), !,
% phrase(joy_parse(Expression), Tokens).
% Apologies for all the (green, I hope) cuts. The strength of the Joy
% syntax is that it's uninteresting.
% chars(-Codes)//
%
% One or more consecutive codes, each accepted by char//1.  The longest
% run is produced first; shorter runs are only offered on backtracking.
chars([Code|Codes]) -->
    char(Code),
    chars_tail(Codes).

% chars_tail(-Codes)//
%
% Zero or more further char//1 codes, longest match first.
chars_tail([Code|Codes]) -->
    char(Code),
    chars_tail(Codes).
chars_tail([]) -->
    [].
% char(-Code)//
%
% A single code that can be part of a token: the input at this position
% must not start a blank//0 sequence (negative lookahead, consumes
% nothing), and the code itself must be neither "[" nor "]".
char(Code) -->
    \+ blank,
    [Code],
    { Code \== 0'[,
      Code \== 0'] }.
% blank//0
%
% Matches exactly one whitespace character, written as the sequence of
% its UTF-8 byte values (so the input is treated as a list of bytes, not
% of Unicode code points).  One clause per character; the clauses match,
% in order, the rules printed by the Python generator script that
% accompanies this file.
blank --> [9].                  % U+0009 character tabulation
blank --> [10].                 % U+000A line feed
blank --> [11].                 % U+000B line tabulation
blank --> [12].                 % U+000C form feed
blank --> [13].                 % U+000D carriage return
blank --> [32].                 % U+0020 space
blank --> [194, 133].           % U+0085 next line
blank --> [194, 160].           % U+00A0 no-break space
blank --> [225, 154, 128].      % U+1680 ogham space mark
blank --> [226, 128, 128].      % U+2000 en quad
blank --> [226, 128, 129].      % U+2001 em quad
blank --> [226, 128, 130].      % U+2002 en space
blank --> [226, 128, 131].      % U+2003 em space
blank --> [226, 128, 132].      % U+2004 three-per-em space
blank --> [226, 128, 133].      % U+2005 four-per-em space
blank --> [226, 128, 134].      % U+2006 six-per-em space
blank --> [226, 128, 135].      % U+2007 figure space
blank --> [226, 128, 136].      % U+2008 punctuation space
blank --> [226, 128, 137].      % U+2009 thin space
blank --> [226, 128, 138].      % U+200A hair space
blank --> [226, 128, 168].      % U+2028 line separator
blank --> [226, 128, 169].      % U+2029 paragraph separator
blank --> [226, 128, 175].      % U+202F narrow no-break space
blank --> [226, 129, 159].      % U+205F medium mathematical space
blank --> [227, 128, 128].      % U+3000 ideographic space
% do/0
%
% Smoke test: run the lexer over the sample text "23[15]".  Succeeds iff
% joy_lex//1 can consume the whole text; the resulting token list is
% discarded, hence the underscore-prefixed variable (a plain `Tokens`
% here is a singleton and triggers a warning when the file is loaded).
% NOTE(review): relies on double-quoted text being read as a list of
% character codes (the GNU Prolog default for the double_quotes flag).
do :- phrase(joy_lex(_Tokens), "23[15]").