/* Rosalind DNA problem in Picat. https://rosalind.info/problems/dna/ """ A string is simply an ordered collection of symbols selected from some alphabet and formed into a word; the length of a string is the number of symbols that it contains. An example of a length 21 DNA string (whose alphabet contains the symbols 'A', 'C', 'G', and 'T') is "ATGCTTCAGAAAGGTCTTACG." Given: A DNA string s of length at most 1000 nt. Return: Four integers (separated by spaces) counting the respective number of times that the symbols 'A', 'C', 'G', and 'T' occur in s Sample Dataset AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC Sample Output 20 12 17 21 """ This model was created by Hakan Kjellerstrand, hakank@gmail.com See also my Picat page: http://www.hakank.org/picat/ */ import rosalind_utils. main => go. go ?=> DNA = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC", println(dna=DNA), dna(DNA,[]), println(dcg_parse_ok), time(dna1(DNA) = M1), println(m1=M1), P2Init = $p(0,0,0,0), time(dna2(DNA,P2Init,P2)), println(ps2=P2), P3Init = $p(0,0,0,0), time(dna2(DNA,P3Init,P3)), println(ps3=P3), time(dna4(DNA) = L4), println(l4=L4), time(dna5(DNA,L5)), println(l5=L5), time(L6=dna6(DNA)), println(l6=L6), time(L7=dna7(DNA)), println(l7=L7), nl. go => true. % % Test some larger random DNA strings. % % - dna1/1: foreach loop and map: slow % - dna2/3: recursion of list, each character in a separate clause. using cut: fast % - dna3/3: recursion of list, one clause: fast % - dna4/1: foreach loop and list: quite fast % - dna5/2: recursion and map: slow % - dna6/1: list comprehension: not very fast % - dna7/1: list comprehension: not very fast % % For sequences of length 1_111_150: % - dna1/1: about 0.122s % - dna2/3: (w/o cut: about 0.425s) with cut: 0.03s fastest! % - dna3/3: about 0.034s (almost as fast as dna2/1) % - dna4/1: about 0.053s % - dna5/2: about 0.120s % - dna6/1: about 0.062s % - dna6/1: about 0.065s % % With length 11_111_150 % - dna1/1: about 1.245s % - dna2/3: with cut: 0.323s fastest! % - dna3/3: about 0.334s (almost as fast as dna2/1) % - dna4/1: about 0.536s % - dna5/2: about 1.236s (slightly faster than dna1/1) % - dna6/1: about 0.634s % - dna7/1: about 0.661s % Summary: % - Using maps is quite slow for large strings. % - Using cuts speeds up things.. % go2 ?=> garbage_collect(300_000_000), _ = random2(), % DNALen = 150, DNALen = 1_111_150, % DNALen = 11_111_150, N = 10, test2(DNALen,N), nl. go2 => true. % % Generate all possible DNA strings below a certain length via DCG. % go3 ?=> garbage_collect(300_000_000), N = 10, member(Len,1..N), println(len=Len), DNA = new_list(Len), dna(DNA,[]), println(DNA), fail, nl. go3 => true. go4 => DNA = "ACAGAGGTTGCATAAACAATTGGTCACGCTTCCAATCTGGCTTCGGTGTGCAGGTTTTGTCACGTAATGACGAAGGTACTCCCAGTTTGAATACCTAAACACAGACATTGTGTGACAGGCGCTCTCGTTGAACTCATCCAGCTTTAGTGAGTCAGCAGGGCAGCCCACAACTCTAATACAGCTGCTCCGCGCCAAAGATAGCTCGAAACTGTCGAACTCAAACCAAAGGCATTTTTCGTATCCTAAGTTACATATTGACTCAACGCAATCGAGAAAGTAGCTACGAACAGAGCCCTGGGATATACTCGAGCCGATCTTTCCGACGAGGATCTTTACTACTGGCTGCATTGCACGGGTGTAGTTACTATCACGTACATCTAGCGGCAAAGACGAGCCTACTTTCCTACTTTAGTCAGACCGCAAGCAGACATTATCGCCCTCACATGCCTCGATTGTTTGACGTACATTTCTTGCACGACGTATGACCTGGCCAATAATGACCTACGTGTCCAGATGTGCAGAGCTCCTACGAGTGGTTATTACGCTGCTCTTGTCCATCAGTACCATGGCCCCGACATATTGCAATTTCGGCCAGTGCGTCAATCCCTGTGTCAGGAGAGAAAATGACCCCGTGCGGCTAACAATGGCGACCAATACGAGCCAAACCTGCGTTGGGTACGGTGTCGAATATAGTGTCTTCGTGCACTAGAAGCAAGGCCCACTCCGTATATTACGTCGCACATTCGATCCCAAGTATGCGGGGGGAGTACGAGCGGGGTTGGGGCACCATGACAAACATCGTGTACCAACCCGTGAGTCCGGGACATGTTCTTGTACCTTGGGTAACCTCAAGTCAATTTCTACGTCAAGCGACACGTCACGTTACCAATCGACACCCTATATGCTGGCTCAAACCTGTCTCAGTGCAAGGCGGCCCGGTTACGCGTAAAGTTGCGCTAGTGGCCATCCGA", P2Init = $p(0,0,0,0), time(dna2(DNA,P2Init,P2)), println(ps2=P2), nl. go4 => true. % % Test random DNA strings. % test2(_DNALen,0). test2(DNALen, N) :- println(loop=N), println(dna_len=DNALen), DNA = new_list(DNALen), println(dna_gen1), time(dna_gen1(DNA,"",[])), % dna1 garbage_collect(300_000_000), println(dna1), time(dna1(DNA) = M1), println(m1=M1), % dna2 garbage_collect(300_000_000), println(dna2), P2Init = $p(0,0,0,0), time(dna2(DNA,P2Init,P2)), println(ps2=P2), % dna3 println(dna3), garbage_collect(300_000_000), P3Init = $p(0,0,0,0), time(dna3(DNA,P3Init,P3)), println(ps3=P3), % dna4 println(dna4), garbage_collect(300_000_000), time(P4 = dna4(DNA)), println(ps4=P4), % dna5 println(dna5), garbage_collect(300_000_000), time(dna5(DNA,L5)), println(l5=L5), % dna6 println(dna6), garbage_collect(300_000_000), time(L6=dna6(DNA)), println(l6=L6), % dna7 println(dna7), garbage_collect(300_000_000), time(L7=dna7(DNA)), println(l7=L7), test2(DNALen,N-1). % % dna1/1: Map + foreach loop % dna1(DNA) = [M.get(C,0) : C in "ACGT"] => M = new_map(['A'=0,'C'=0,'G'=0,'T'=0]), foreach(S in DNA) M.put(S,M.get(S)+1) end. % % The third (Ps) is just a placeholder for the return value. % Without cut: the slowest, % with cut it's the fastest, slighly faster than dna3/2 % dna2([],Ps,Ps). dna2(['A'|Cs],p(AA,CC,GG,TT),Ps) :- AA2 = AA + 1, !, dna2(Cs,$p(AA2,CC,GG,TT),Ps). dna2(['C'|Cs],p(AA,CC,GG,TT),Ps) :- CC2 = CC + 1, !, dna2(Cs,$p(AA,CC2,GG,TT),Ps). dna2(['G'|Cs],p(AA,CC,GG,TT),Ps) :- GG2 = GG + 1, !, dna2(Cs,$p(AA,CC,GG2,TT),Ps). dna2(['T'|Cs],p(AA,CC,GG,TT),Ps) :- TT2 = TT + 1, !, dna2(Cs,$p(AA,CC,GG,TT2),Ps). % % Same idea as dna2/1 but collected in one clause. % Quite faster than dna2/3 and a little faster than dna1/1, % and slightly faster than dna4/1. % However, if using cut then dna2 it's fastest % dna3([],Ps,Ps). dna3([C|Cs],p(AA,CC,GG,TT),Ps) :- (C == 'A' -> AA2 = AA + 1, dna3(Cs,$p(AA2,CC,GG,TT),Ps) ; C == 'C' -> CC2 = CC + 1, dna3(Cs,$p(AA,CC2,GG,TT),Ps) ; C == 'G' -> GG2 = GG + 1, dna3(Cs,$p(AA,CC,GG2,TT),Ps) ; C == 'T' -> TT2 = TT + 1, dna3(Cs,$p(AA,CC,GG,TT2),Ps) ; true ). % % Foreach loop % dna4(DNA) = Counts => Counts1 = [0,0,0,0], foreach(S in DNA) if S == 'A' then Counts1[1] := Counts1[1] + 1 elseif S == 'C' then Counts1[2] := Counts1[2] + 1 elseif S == 'G' then Counts1[3] := Counts1[3] + 1 elseif S == 'T' then Counts1[4] := Counts1[4] + 1 end end, Counts = Counts1. % % Combination of dna2/3 and dna1/1, using Map and recursion. % Quite slow, about as dna1/1. % dna5(DNA,L) :- M0 = new_map(['A'=0,'C'=0,'G'=0,'T'=0]), dna5(DNA,M0,M), L = [M.get(C,0) : C in "ACGT"]. dna5([],M,M). dna5([C|Cs],M0,M) :- M0.put(C,M0.get(C)+1), dna5(Cs,M0,M). dna6(DNA) = [As,Cs,Gs,Ts] => As = sum([1 : C in DNA, C == 'A']), Cs = sum([1 : C in DNA, C == 'C']), Gs = sum([1 : C in DNA, C == 'G']), Ts = sum([1 : C in DNA, C == 'T']). dna7(DNA) = [sum([1 : D in DNA, D == C]) : C in "ACGT"].