Parser in GNU Prolog.
Wonky handling of UTF-8 whitespace.
This commit is contained in:
parent
3f4e9d9fe2
commit
c8f67f02f9
|
|
@ -0,0 +1,33 @@
|
||||||
|
|
||||||
|
|
||||||
|
# https://www.lesinskis.com/python-unicode-whitespace.html
# Unicode whitespace characters; the loop at the bottom prints one Prolog
# `blank` DCG clause per character, spelled as its UTF-8 byte values.
UNICODE_WHITESPACE_CHARACTERS = [
    "\u0009",  # character tabulation
    "\u000a",  # line feed
    "\u000b",  # line tabulation
    "\u000c",  # form feed
    "\u000d",  # carriage return
    "\u0020",  # space
    "\u0085",  # next line
    "\u00a0",  # no-break space
    "\u1680",  # ogham space mark
    "\u2000",  # en quad
    "\u2001",  # em quad
    "\u2002",  # en space
    "\u2003",  # em space
    "\u2004",  # three-per-em space
    "\u2005",  # four-per-em space
    "\u2006",  # six-per-em space
    "\u2007",  # figure space
    "\u2008",  # punctuation space
    "\u2009",  # thin space
    "\u200a",  # hair space
    "\u2028",  # line separator
    "\u2029",  # paragraph separator
    "\u202f",  # narrow no-break space
    "\u205f",  # medium mathematical space
    "\u3000",  # ideographic space
]


def blank_rule(ch):
    """Return the Prolog DCG clause that matches the UTF-8 bytes of ``ch``.

    For example, U+00A0 (no-break space) encodes to the bytes 194, 160 and
    yields ``'blank --> [194, 160].'``.
    """
    return f'blank --> {list(ch.encode("utf_8"))}.'


# Emit the clauses on stdout, one per line, in list order.
for ch in UNICODE_WHITESPACE_CHARACTERS:
    print(blank_rule(ch))
|
||||||
|
|
@ -0,0 +1,68 @@
|
||||||
|
|
||||||
|
|
||||||
|
% joy_lex(Tokens)//
%
% Lexer pass: turns a list of character codes into a list of tokens
%   tok(Atom)  - a run of codes accepted by chars//1, as an atom
%   lbracket   - the code for "["
%   rbracket   - the code for "]"
% Blanks are consumed and produce no token.  Every clause except the last
% commits with a cut as soon as its leading item has been recognised.

% A run of token codes (chars//1 tries its recursive clause first, so the
% first answer is the longest run).
joy_lex([tok(Atom)|Tokens]) -->
    chars(Codes), !,
    { atom_codes(Atom, Codes) },
    joy_lex(Tokens).

% Opening bracket.
joy_lex([lbracket|Tokens]) -->
    "[", !,
    joy_lex(Tokens).

% Closing bracket.
joy_lex([rbracket|Tokens]) -->
    "]", !,
    joy_lex(Tokens).

% Whitespace: skip it and keep lexing into the same token list.
joy_lex(Tokens) -->
    blank, !,
    joy_lex(Tokens).

% Nothing consumed, no tokens.
joy_lex([]) -->
    [].
|
||||||
|
|
||||||
|
|
||||||
|
% Then parse the tokens converting them to Prolog values and building up
|
||||||
|
% the list structures (if any).
|
||||||
|
|
||||||
|
%joy_parse([J|Js]) --> joy_term(J), !, joy_parse(Js).
|
||||||
|
%joy_parse([]) --> [].
|
||||||
|
%
|
||||||
|
%joy_term(list(J)) --> [lbracket], !, joy_parse(J), [rbracket].
|
||||||
|
%joy_term(Token) --> [tok(Codes)], {joy_token(Token, Codes)}.
|
||||||
|
%
|
||||||
|
%joy_token(int(I), Codes) :- number(I, Codes, []), !. % See dcg/basics.
|
||||||
|
%joy_token(bool(true), `true`) :- !.
|
||||||
|
%joy_token(bool(false), `false`) :- !.
|
||||||
|
%joy_token(symbol(S), Codes) :- atom_codes(S, Codes).
|
||||||
|
%
|
||||||
|
%
|
||||||
|
%text_to_expression(Text, Expression) :-
|
||||||
|
% phrase(joy_lex(Tokens), Text), !,
|
||||||
|
% phrase(joy_parse(Expression), Tokens).
|
||||||
|
|
||||||
|
% Apologies for all the (green, I hope) cuts. The strength of the Joy
|
||||||
|
% syntax is that it's uninteresting.
|
||||||
|
|
||||||
|
% chars(Codes)//
%
% One or more consecutive codes accepted by char//1.  The longest run is
% offered first; on backtracking shorter prefixes follow, in the same
% order as the two-clause formulation this replaces.
%
% The rule is left-factored: char//1 runs once per code, and only then do
% we choose between "more codes follow" and "the run ends here".  The
% previous version re-ran char//1 (including its negative lookahead) on
% the last code of every run after its recursive clause had failed.
chars([Ch|Rest]) -->
    char(Ch),
    (   chars(Rest)
    ;   { Rest = [] }
    ).
|
||||||
|
|
||||||
|
% char(Code)//
%
% Consumes a single code that can be part of a token: the input must not
% start with a blank//0 sequence at this point, and the code must be
% neither "[" nor "]".
char(Code) -->
    \+ blank,
    [Code],
    { Code \== 0'[ },
    { Code \== 0'] }.
|
||||||
|
|
||||||
|
|
||||||
|
% blank//0
%
% Matches exactly one whitespace character given as its UTF-8 byte
% sequence (one, two or three list elements).  Sequences that share a
% prefix are grouped; member/2 keeps plain unification semantics and the
% same answer order as one clause per sequence.

% U+0009..U+000D (tab, LF, VT, FF, CR) and U+0020 (space).
blank --> [Code], { member(Code, [9, 10, 11, 12, 13, 32]) }.

% U+0085 next line, U+00A0 no-break space.
blank --> [194, Code], { member(Code, [133, 160]) }.

% U+1680 ogham space mark.
blank --> [225, 154, 128].

% U+2000..U+200A (en quad .. hair space), U+2028 line separator,
% U+2029 paragraph separator, U+202F narrow no-break space.
blank -->
    [226, 128, Code],
    { member(Code, [128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
                    168, 169, 175]) }.

% U+205F medium mathematical space.
blank --> [226, 129, 159].

% U+3000 ideographic space.
blank --> [227, 128, 128].
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
% do/0
%
% Smoke test: lexes the sample text "23[15]" with joy_lex//1 and prints
% the resulting token list.  Printing the list also removes the singleton
% variable that previously discarded the result.
do :-
    phrase(joy_lex(Tokens), "23[15]"),
    writeq(Tokens),
    nl.
|
||||||
Loading…
Reference in New Issue