/*

  Tokenization using DCG rules in Picat.

  From Michael A. Covington: "Tokenization using DCG Rules"
  http://www.covingtoninnovations.com/mc/projpaper.pdf

  This model was created by Hakan Kjellerstrand, hakank@gmail.com
  See also my Picat page: http://www.hakank.org/picat/

*/

import v3_utils.

main => go.

%
% go: tokenize a short sample sentence and print the resulting token list.
%
% The original author observed this run to produce
%   [we,owe,$,1,',',0,',',0,to,agent,7]
% and noted that this might not be what we want.
%
go ?=>
  Sentence = "We owe $1,000,000 to Agent 007",
  Codes = [ord(Ch) : Ch in Sentence],   % the grammar works on ASCII codes
  token_list(Tokens,Codes,[]),
  write(Tokens),
  nl,
  nl.

go => true.


%
% go2: run the test predicate from the paper.
%
% Output recorded by the original author:
% """
% [we,owe,$,1,',',48,',',576,'.',24,to,agent,7,for,version,3,'.',14159,!]
% '[There should not be any alternatives here...]'
% """
%
% test/0 always ends in fail, so the second go2 rule is what finally succeeds.
%
go2 ?=>
  test,
  nl.

go2 => true.


%
% hakank: Program listing from the paper below. All
%         comments (except the ones by "hakank") are
%         from the paper.
%

%
% File sampleproj.pl - M. Covington - 2001 April 21
%
% A tokenizer using DCG rules.
% :- use_module(library(lists)). % provides append/3 in SICStus Prolog

% A test predicate to demonstrate that it works.
% It prints every tokenization found on backtracking; the trailing fail
% forces the search for alternatives (there should be none).
test :-
  % hakank: Note that the DCG below works on ASCII values, so
  %         we first convert the string to a list of ASCII values.
  Sentence = " We owe $1,048,576.24 to Agent 007 for Version 3.14159! ",
  % Sentence = "We owe $1,048,576.24 to Agent 007 for Version 3.14159!", % without leading/trailing blanks
  Codes = [ord(Ch) : Ch in Sentence],   % convert to ASCII
  token_list(Tokens,Codes,[]),
  write(Tokens),
  nl,
  write('[There should not be any alternatives here...]'),
  nl,
  fail.


% A token list is a series of zero or more tokens.
% Its argument is the list of tokens, as atoms and numbers.
% The cut commits to each token once it has been read, which ensures
% that the maximum number of characters is gathered into each token.
%
% To tokenize a string, do this:
%   ?- token_list(Result," the string ",[]).
%
token_list([Tok|Toks]) -->
  blank0,
  token(Tok),
  !,
  token_list(Toks).
token_list([]) -->
  blank0.


% blank0 is a series of zero or more blanks.
blank0 -->
  [Code],
  {char_type(Code,blank)},
  !,
  blank0.
blank0 -->
  [].


% Several kinds of tokens.
% This is where lists of characters get converted into atoms or numbers.
% Each token is recognized as a list of character codes and then turned
% into an atom (special characters, words) or a number (numerals).
token(T) --> special(L), {atom_codes(T,L)}.
token(T) --> word(W),    {atom_codes(T,W)}.
token(T) --> numeral(N), {number_codes(T,N)}.


% A word is a series of one or more letters.
% The rules are ordered so that we first try to gather as many
% characters into one word as possible.
% The argument is the list of (lowercased) letter codes.
word([L|Rest]) --> letter(L), word(Rest).
word([L])      --> letter(L).


% A numeral is a list of characters that constitute a number.
% The argument of numeral(...) is the list of character codes,
% with thousands-separator commas left out.
%
% hakank/review note: the input is a list of integer character codes
% (the callers convert the string with ord/1), whereas a double-quoted
% literal such as "," is a list of characters in Picat and therefore never
% matches a code. The terminals are thus written as explicit code lists:
% [44] is the comma and [46] is the decimal point.
% NOTE(review): a numeral with a decimal part is passed to number_codes/2
% including the code 46 - confirm that it yields a float as intended.
numeral([C1,C2,C3|N]) --> [44], digit(C1), digit(C2), digit(C3), numeral(N).
numeral([C1,C2,C3])   --> [44], digit(C1), digit(C2), digit(C3).
numeral([C|N])        --> digit(C), numeral(N).   % multiple digits
numeral([C])          --> digit(C).               % single digit
numeral(N)            --> decimal_part(N).        % decimal point and more digits

% A decimal part is a decimal point (code 46) followed by one or more digits.
decimal_part([46|Rest]) --> [46], digit_string(Rest).

% A digit string is a series of one or more digits, longest match first.
digit_string([D|N]) --> digit(D), digit_string(N).
digit_string([D])   --> digit(D).


% Various kinds of characters...
digit(C)     --> [C], {char_type(C,numeric)}.
special([C]) --> [C], {char_type(C,special)}.
letter(C)    --> [C], {char_type(C,lowercase)}.
letter(C)    --> [U], {char_type(U,uppercase), C is U+32}.  % Conversion to lowercase


% char_type(+Code,?Type)
% Classifies a character (ASCII code) as
% blank, numeric, uppercase, lowercase, or special.
% Adapted from Covington 1994.
%
% Each clause cuts as soon as the code range matches and only then unifies
% the type, so exactly one classification is ever produced for a code.
char_type(Code,Y) :-          % blanks, other ctrl codes
  Code =< 32, !,
  Y = blank.
char_type(Code,Y) :-          % digits
  48 =< Code, Code =< 57, !,
  Y = numeric.
char_type(Code,Y) :-          % lowercase letters
  97 =< Code, Code =< 122, !,
  Y = lowercase.
char_type(Code,Y) :-          % uppercase letters
  65 =< Code, Code =< 90, !,
  Y = uppercase.
char_type(_,special).         % all others

% End of sampleproj.pl