/* 

  Tokenization using DCG rules in Picat.

  From  Michael A. Covington: "Tokenization using DCG Rules"
  http://www.covingtoninnovations.com/mc/projpaper.pdf

  This model was created by Hakan Kjellerstrand, hakank@gmail.com
  See also my Picat page: http://www.hakank.org/picat/

*/

import v3_utils.

% Picat's default entry point: runs the first demo, go.
main => go.

% Small demo: tokenizes a sentence containing a number with thousands
% separators and prints the token list.
%
% When the numeral rules match the separators with the string literals ","
% and "." (as in the paper's listing), this prints
%   [we,owe,$,1,',',0,',',0,to,agent,7]
% because a Picat string literal is a list of chars and therefore never
% matches the integer codes in the input. When the separators are matched
% by character code instead, the result is [we,owe,$,1000000,to,agent,7].
%
go ?=>
  Text = "We owe $1,000,000 to Agent 007",
  Codes = [ord(Ch) : Ch in Text],  % the DCG works on ASCII codes, not chars
  token_list(Tokens,Codes,[]),
  writeln(Tokens),
  nl.
go => true.

% Runs the test from the paper (see test below).
%
% With the separators in the numeral rules written as the string literals
% "," and ".", the Picat run prints:
% """
% [we,owe,$,1,',',48,',',576,'.',24,to,agent,7,for,version,3,'.',14159,!]
% '[There should not be any alternatives here...]'
% """
% The numeral rules are written to absorb the thousands separators and the
% decimal part, i.e. 1048576.24 and 3.14159 are meant to be single tokens.
%
% test always ends in fail, so the first rule never succeeds (its nl is
% never reached); the second rule makes go2 succeed anyway.
go2 ?=>
  test,
  nl.
go2 => true.


%
% hakank: Program listing from the paper below. All
%         comments (except the one by "hakank") are
%         from the paper.
%

%
% File sampleproj.pl - M. Covington - 2001 April 21
% % A tokenizer using DCG rules.
% :- use_module(library(lists)).
% provides append/3 in SICStus Prolog

% Demonstration predicate from the paper: tokenizes a sample sentence and
% prints the result. The closing fail forces backtracking into token_list,
% so any alternative tokenization would be printed too (none is expected).
% As a consequence test itself always fails.
test :-
  % The DCG below works on ASCII values, so the string is first
  % converted to a list of character codes.
  Text = " We owe $1,048,576.24 to Agent 007 for Version 3.14159! ",
  % Text = "We owe $1,048,576.24 to Agent 007 for Version 3.14159!",  % without leading/trailing blanks
  Codes = [ord(Ch) : Ch in Text],

  token_list(Tokens,Codes,[]),
  writeln(Tokens),
  writeln('[There should not be any alternatives here...]'),
  fail.

% token_list(-Tokens): a series of zero or more tokens, each optionally
% preceded by blanks. Tokens is the resulting list of atoms and numbers.
% The cut commits to the first (longest) reading of each token, so no
% shorter alternative is tried on backtracking.
%
% The grammar consumes character codes, so in Picat a string has to be
% converted first:
%   Cs = [ord(C) : C in " the string "], token_list(Result,Cs,[])
%

token_list([Tok|Toks]) --> blank0,token(Tok),!,token_list(Toks).
token_list([]) --> blank0. % nothing but (zero or more) blanks is left

% blank0: skips a series of zero or more blank characters. The cut makes
% the skipping greedy: once a blank is consumed it is never given back.
blank0 --> [Code],{char_type(Code,blank)},!,blank0.
blank0 --> [].

% token(-Tok): one token, tried in the order special character, word,
% numeral. Here the matched character codes are turned into an atom
% (special characters and words) or a number (numerals).
token(Tok) --> special(Codes), {atom_codes(Tok,Codes)}.
token(Tok) --> word(Codes),    {atom_codes(Tok,Codes)}.
token(Tok) --> numeral(Codes), {number_codes(Tok,Codes)}.

% word(-Codes): a series of one or more letters; Codes holds the codes as
% delivered by letter (i.e. already lowercased).
% The recursive rule comes first so that as many characters as possible
% are gathered into one word before the single-letter rule is tried.
word([Ch|Chs]) --> letter(Ch),word(Chs).
word([Ch]) --> letter(Ch).

% A numeral is a list of characters that constitute a number.
% The argument of numeral(...) is the list of character codes: thousands
% separators are consumed but dropped, the decimal point is kept, so the
% list can be handed directly to number_codes/2.
%
% NOTE: the paper writes the separators as the string literals "," and ".".
% In Picat a string literal is a list of chars, which can never unify with
% the integer codes this grammar parses, so those rules would silently
% never fire (giving 1 ',' 0 ',' 0 for "1,000,000"). The separators are
% therefore matched by code here: 44 is ',' and 46 is '.'.
numeral([C1,C2,C3|N]) --> [44], digit(C1), digit(C2), digit(C3), numeral(N).
numeral([C1,C2,C3])   --> [44], digit(C1), digit(C2), digit(C3).
numeral([C|N]) --> digit(C), numeral(N). % multiple digits
numeral([C])   --> digit(C).             % single digit
numeral(N)     --> decimal_part(N).      % decimal point and more digits

% decimal_part(-Codes): a decimal point followed by at least one digit.
% The point (code 46) is kept in Codes.
decimal_part([46|Rest]) --> [46], digit_string(Rest).

% digit_string(-Codes): one or more digits, longest match tried first.
digit_string([D|N]) --> digit(D),digit_string(N).
digit_string([D]) --> digit(D).

% Single-character classes; the classification itself is done by char_type.
%   digit(-Code)     one numeric character
%   special(-[Code]) one special character, returned as a one-element list
%   letter(-Lower)   one letter, always returned as its lowercase code
digit(Code)     --> [Code], {char_type(Code,numeric)}.
special([Code]) --> [Code], {char_type(Code,special)}.
letter(Lower)   --> [Lower], {char_type(Lower,lowercase)}.
% 32 is the distance between the uppercase and lowercase ASCII ranges.
letter(Lower)   --> [Upper], {char_type(Upper,uppercase), Lower is Upper+32}.

%
% char_type(+Code,?Type)
%   Classifies a character (ASCII code) as
%   blank, numeric, uppercase, lowercase, or special.
%   The first matching range decides; once a range has matched, no other
%   class is tried, so char_type(Code,special) fails for e.g. a digit.
%   Adapted from Covington 1994.
%
char_type(Code,Type) :-
  ( Code =< 32 ->                 % blanks, other ctrl codes
      Type = blank
  ; Code >= 48, Code =< 57 ->     % digits
      Type = numeric
  ; Code >= 97, Code =< 122 ->    % lowercase letters
      Type = lowercase
  ; Code >= 65, Code =< 90 ->     % uppercase letters
      Type = uppercase
  ; Type = special                % all others
  ).

% End of sampleproj.pl