/*
  Probabilistic programming (PPL) utils in Picat.

  Utility functions for probabilistic programming (ppl_*).

  Configurations:
  * Setting a configuration parameter: set_config(Parameter,Value)
  * Getting a configuration parameter: get_config(Parameter,Default)

  Instead of explicitly using set_config/2 (which is used internally),
  it's recommended to add configuration values in the parameter list of
  run_model/3. For example:

    run_model(10_000,$model(Obs),[mean,show_probs_rat_trunc,
                                  truncate_size=5,
                                  presentation=["c","b","a"],
                                  hpd_intervals=[0.84,0.9999],
                                  random_seed=42])

  Supported configurations:
  - truncate_size (for show_probs_trunc/1 and show_probs_rat_trunc/1)
    Default: 4
  - presentation: The presentation order of the random variables.
    Default: the order in which the variables appear in the model
  - hpd_intervals: The HPD intervals for show_hpd_intervals
    Default: [0.5,0.84,0.90,0.95,0.99,0.999,0.9999,0.99999]
  - percentiles: The percentiles for show_percentiles
    Default: [0.001,0.01,0.025,0.05,0.10,0.25,0.50,0.75,0.84,0.90,0.95,0.975,0.99,0.999,0.9999,0.99999]
  - histogram_limit: Sets the upper limit of unique values for integer/discrete
    data. Above that the data is treated as continuous.
    Default: 40
  - random_seed: Sets the random seed
    Default: none, i.e. random2() is used instead
  - min_accepted_samples=MinAcceptedSamples: This is an alternative to the
    number of runs given by run_model/2-3. If set, the model is run until
    the number of accepted samples >= MinAcceptedSamples.
    Default: no restriction (i.e. run the model the given number of runs)
  - show_accepted_samples=[true|false]: If set to true and
    min_accepted_samples=MinAcceptedSamples is set, then show the number
    of accepted samples.
  - use_local_tabling=[false|true] (experimental)
    Default: false
    If true, each call to the model clears the table area. This is for
    memoizing recursive problems, especially multi-function recursion
    (one function calls another function, which in turn calls the first
    function). See ppl_hurricane.pi and ppl_italian_murder.pi (go3/0)
    for examples.
    Note: This should be considered experimental since it has effects on
    other parts of PPL that use tabling.
  - garbage_collect_when: N
    Default: 1000
    Runs garbage_collect() each N'th iteration. This might be useful when
    a lot of data is handled/transferred, and/or Picat reports that
    memory is exhausted.
  - scatter_plot_size: [Width,Height]
    Size of show_scatter_plot.
    Default: [40,20]

  The function index below was generated with:

  $ perl -nle 'print $1 if m!^([a-z][^\s]+?) =!'
ppl_utils.pi | sort add_all(L) add(Name,Value) add_post(Data,Y) all_different(L) analyse_post(Data) analyse_post(Data,Post) analyse_post_simple(Data) analyse_post_simple(Data,Post) analyze_post(Data) analyze_post(Data,Post) analyze_post_simple(Data) analyze_post_simple(Data,Post) and_ratf(X) and_rat(X) argmax(L) argmax_random_ties(L) argmin(L) argmin_random_ties(L) argsort(L) b2i(V) but_last(L) cases(L) case(Val,List) cdf_from_data(Data) cdf_from_pdf(PDF) check1(Cond) check(Cond) check_duplicates(L) check_format(V) check_parameter(Parameter=Var,Pres) collect(List) comb_sum(Len,N,Sum) condt(Cond,Then,Else) count_occurrences_list(What,L) count_occurrences_sublist(Sub,L) count_occurrences(What,L) credible_intervals_exact(Dist,HPD) data_iqr(Data) data_mad(Data) data_percentiles(Data,Pcts) data_range(Data) debug(S) debug(Text,S) draw_until_one_pattern(Lst,Patterns,Choices) draw_until_pattern(Lst,Pattern,Choices) draw_without_replacement(A) draw_without_replacement(N,A) draw_without_replacement(N,A,Ns) equivalence(S1,S2) filter_runs(Runs,Val) find_first(L,Val,Default) flip_until_n_successes(P,N) flip_until_pattern(Patterns,Lst) float_to_rational(X) generate_01_runs(N,P) get_config_map() get_config(Parameter,Default) get_freq(List) get_percentiles(Data) get_probs(L) get_records(A) get_records(A,MaxVal,Aux,Op) get_records(A,Op) get_records_geq(A) get_records_ix(A) get_records_ix(A,Op) get_records_ix_geq(A) get_records_ix(Ix,A,MaxVal,Aux,Op) get_runs1(As,Aux) get_runs(As) get_runs_lens(Xs) get_runs_op(As,Aux,Op) get_runs_op(Xs,Op) get_store() implication(S1,S2) index_of_sub(List,Sub) interpret_fit_score(S) kurtosis(Data) kurtosis_excess(Data) markov_chain_loop(Transitions,N,A) markov_chain_n(Transitions,InitState,N,Num) markov_chain(Transitions,InitState,N) mean(Data) median(Data) median_sorted(Sorted) mod_replace(Val,Mod) normalized_rmse(Data,Post) normalized_rmse_explained(NRMSE) num_runs([]) num_runs(A) num_runs(T,A,Acc) observe_abc_adj(Data,Ys) observe_abc_adj(Data,Ys,Scale) observe_abc(Data,Ys) observe_abc(Data,Ys,Scale) observe_abc(Data,Ys,Scale,Methods) observe_abc_discrete(Data,Ys) observe_abc_discrete(Data,Ys,Scale) observed_ok() observe(Obs) observe(Obs1,Obs2) observe(Obs1,Obs2,Obs3) observe(Obs1,Obs2,Obs3,Obs4) one_hot(N,I) ones(N,Val) online_moments(Data) pdf_from_data(Data) plot_ascii_points([],Grid,_,_,_,_,_,_) plot_ascii_points([[X,Y]|Rest],Grid0,Width,Height,MinX,MinY,DX,DY) plot_ascii_scatter(Data) post_summary(Data) printf_list(L) print_list(L) print_matrix(M) print_matrix(M,Name) q1(Data) q3(Data) quantile_from_cdf(CDF,Q) quantile_from_data(Data,Q) random_transition_matrix(N) random_transition_matrix(N,V) recommend_abc_stats(Data) recommend_abc_stats_explained(Data) repeat_string(S,N) replace_nth([H|Rest],N,Val) replace_nth([_|Rest],1,Val) rep(N,V) resample_all(Data) resample(Data) resample(N,Data) reset_store() rest(L) rmse_percentiles(Data,Post) roundf(V,Prec) run_model(Runs,Model) run_model(Runs,Model,Pres) set_config(Parameter,Value) show_all(Map) show_all(Map,Ps) show_credible_intervals_exact(Dist,Qs) show(Data) show(Data,Ps) show_freq(List) show_freq_prob(List) show_histogram(Data) show_hpd_intervals(Data) show_matrix(M) show_matrix(M,Name) show_percentiles(Data) show_probs(L) show_probs_rat(L) show_probs_rat_trunc(L) show_probs_trunc(L) show_scatter_plot(Data) show_scatter_plot(Data,Width,Height) show_simple_stats(Data) show_stats(Data) show_store_lengths() show_summary_stats(Data) shuffle1(List) shuffle(L) simplex(L) skewness(Data) stdev(Xs) std_population(Data) 
sublists(L,N) swap(L,I,J) take_last(A,N) to_probabilities(L) to_rat(X) update_all(Map,L) update_list_all(Map,L) update_list(Map,Key,Value) update(Map,Key) variance_population(Data) variance_sample(Data) variance(Xs) xor(A,B)

  This program was created by Hakan Kjellerstrand, hakank@gmail.com
  See also my Picat page: http://www.hakank.org/picat/

*/
module ppl_utils.
import ppl_distributions.
import ppl_common_utils.
import util.
import cp.

/*
  Updating a map.
*/
% Update the count of Key
update(Map,Key) =>
   Map.put(Key,Map.get(Key,0)+1).

% Add Value to the list stored under Key
update_list(Map,Key,Value) =>
   % Map.put(Key,Map.get(Key,[])++[Value]).
   Map.put(Key,[Value|Map.get(Key,[])]).

update_all(Map,L) =>
   foreach(V in L)
      Map.update(V)
   end.

update_list_all(Map,L) =>
   foreach([V,K] in L)
      Map.update_list(V,K)
   end.

/*
  Handling the data / samples / trace store

  - reset_store(): Resets the store. Should be called before a new call
    to run_model/2-3. Note: this is only needed if there is more than
    one call to run_model/2-3, but it's a good habit.
  - add(Name,Value)
    add_all(List)
    Add the element(s) to the data store
  - get_store(): returns the data store
*/
reset_store() =>
   get_global_map(store).clear().

get_store() = get_global_map(store).

show_store_lengths() =>
   println("Variable lengths:"),
   Store = get_store(),
   foreach(K in Store.keys.sort)
      println(K=Store.get(K).len)
   end.

% Add the value Value to the store list with the name Name
add(Name,Value) =>
   AddOrder = get_global_map(add_order),
   Order = AddOrder.get(add_order),
   if not membchk(Name,Order) then
      AddOrder.put(add_order,AddOrder.get(add_order)++[Name])
   end,
   Map = get_global_map(store),
   % Map.put(Name,Map.get(Name,[])++[Value]).
   Map.put(Name,[Value|Map.get(Name,[])]). % A little faster, but reversed order

% Add all pairs [[Name1,Value1],[Name2,Value2],...]
% to the store
add_all(L) =>
   foreach([Name,Value] in L)
      add(Name,Value)
   end.

/*
  add_post(Data,Y)

  This is a convenience function when using BDA for adding the posterior
  data set (Y). For each index I in Data and Y it adds
  - "post I": the value of Y[I]
  - "post I diff": the difference between Y[I] and Data[I]

  This can then be used together with the (posterior) summary function
  post_summary(Data), see below.
*/
add_post(Data,Y) =>
   Len = min(Data.len,Y.len),
   foreach(I in 1..Len)
      IS = I.to_string,
      add("post " ++ IS,Y[I]),
      add("post "++ IS ++ " diff",Y[I]-Data[I])
   end.

/*
  Configuration

  The configuration contains some parameters that apply globally
  to the model.
*/
% Return the configuration map
get_config_map() = get_global_map(config).

% Get the value of configuration parameter Parameter,
% with the default value Default
get_config(Parameter,Default) = get_global_map(config).get(Parameter,Default).

% Set the configuration parameter Parameter to Value
set_config(Parameter,Value) =>
   Config = get_config_map(),
   Config.put(Parameter,Value).

%
% Round the number V with a precision of Prec.
% E.g. roundf(100.1234567,0.001) -> 100.123
%
roundf(V,Prec) = round(V/Prec)*Prec.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% float_to_rational(X) -> (Num,Den)  % default: MaxDen=10**10, Eps=1e-10
% float_to_rational(X, MaxDen, Eps) -> (Num,Den)  % customize bounds/precision
%
% Uses continued fractions with a final truncation step to respect MaxDen.
% Always returns lowest terms (gcd-reduced). Handles negatives.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
to_rat(X) = float_to_rational(X).
and_rat(X) = to_fstring("%w (%w)", X, to_rat(X)).
and_ratf(X) = to_fstring("%.17f (%w)", X, to_rat(X)). % float_to_rational(X) = float_to_rational(X, 1000000000, 1.0e-17). % float_to_rational(X) = float_to_rational(X, 10**17, 1.0e-17). float_to_rational(X) = float_to_rational(X, 10**10, 1.0e-10). float_to_rational(X, MaxDen, Eps) = $(Num'/'Den) => % basic checks Sign = cond(X < 0.0, -1, 1), Xabs = abs(X), if Xabs =:= 0.0 then Num = 0, Den = 1 else % Continued-fraction convergents: % p[-2]=0,q[-2]=1 ; p[-1]=1,q[-1]=0 Pm2 = 0, Qm2 = 1, Pm1 = 1, Qm1 = 0, Xcur = Xabs, Num = 0, Den = 1, Done = false, while (Done == false) A = floor(Xcur).to_int(), P = A*Pm1 + Pm2, Q = A*Qm1 + Qm2, if Q > MaxDen then % Truncate last partial quotient to fit MaxDen K = (MaxDen - Qm2) // Qm1, if K < 0 then K := 0 end, P := K*Pm1 + Pm2, Q := K*Qm1 + Qm2, Num := P, Den := Q, Done := true else % Accept this convergent if within tolerance or exact Err = abs(P / Q - Xabs), Frac = Xcur - A, if (Err =< Eps) ; (Frac =:= 0.0) then Num := P, Den := Q, Done := true else % advance Pm2 := Pm1, Qm2 := Qm1, Pm1 := P, Qm1 := Q, Xcur := 1.0 / Frac end end end, % apply sign and reduce to lowest terms Num := Sign * Num, G = gcd(abs(Num), Den), if G > 1 then Num := Num // G, Den := Den // G end end. % Repeat a string S N times. repeat_string(S,N) = [S : _ in 1..N].flatten. /* Wrapper for showing different procedures and functions. Example: show(Data,[show_histogram,show_hpd_intervals],[mean]) */ show(Data) => show(Data,[show_probs_trunc,mean,variance,show_percentiles,show_hpd_intervals,show_histogram,show_summary_stats]). show(Data,Ps) => foreach(P in Ps,not (P = (_ = _))) if once(P.to_string.find("show",_,_)) then % This is a procedure show_* catch(call(P,Data),_,_) else % This is a function catch(T=apply(P,Data),E,_), if var(E) then println(P=T) end end end, nl. show_all(Map) => AddOrder = get_global_map(add_order).get(add_order,[]), % Presentation = get_config(presentation,Map.keys.sort), Presentation = get_config(presentation,AddOrder), foreach(K in Presentation) if not Map.has_key(K) then Error=to_fstring("Key %w is not a variable name. Check the presentation list.",K).to_atom, throw($error(Error)) end, printf("var : %w\n", K), show(Map.get(K)) end. show_all(Map,Ps) => AddOrder = get_global_map(add_order).get(add_order,[]), % Presentation = get_config(presentation,Map.keys.sort), Presentation = get_config(presentation,AddOrder), foreach(K in Presentation) if not Map.has_key(K) then Error=to_fstring("Key %w is not a variable name. Check the presentation list.",K).to_atom, throw($error(Error)) end, printf("var : %w\n", K), show(Map.get(K),Ps) end. /* Run a model: run_model(Runs,Model) run_model(Runs,Model,Pres) Example: The model can have parameters, here ObservedHeight (which must be instantiated) - run_model(20_000,$model(ObservedHeight),[show_probs,presentation=["z","a","t"]]) - run_model(20_000,$model(ObservedHeight)) Showing the default parameters for show_all/0-1. Note: The model name must be $ escaped if there's a parameter. Without parameters, no $ escape is needed (and it should not end with '()' since it will then be evaluated directly as a function). - run_model(20_000,model) */ run_model(Runs,Model) => run_model(Runs,Model,default). 
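/*
  A minimal end-to-end sketch of the run_model/observe/add workflow.
  Note: this is only an illustration, not part of the module. coin_model
  is a hypothetical model, and uniform_dist/2 is assumed to be the
  continuous uniform generator in ppl_distributions (bern/1 is used
  elsewhere in this module).

     coin_model =>
        P = uniform_dist(0.0,1.0),            % prior on the coin's bias
        Heads = sum([bern(P) : _ in 1..10]),  % simulate 10 tosses
        observe(Heads == 7),                  % condition on seeing 7 heads
        add("p",P).                           % trace the parameter

     go => run_model(10_000,coin_model,[mean,show_hpd_intervals]).
*/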
run_model(Runs,Model,Pres) =>
   garbage_collect(400_000_000),
   Observations = get_global_map(observations),
   Store = get_global_map(store),
   Store.clear,
   Config = get_config_map(),
   Config.clear,
   VariableOrder = get_global_map(variable_order),
   VariableOrder.clear,
   VariableOrder.put(order,[]),
   AddOrder = get_global_map(add_order),
   AddOrder.put(add_order,[]),
   if check_parameter(random_seed=RandomSeed,Pres) then
      println(randomSeed=RandomSeed),
      _ = random(RandomSeed)
   else
      _ = random2()
   end,
   InitializeTable = false,
   if check_parameter(use_local_tabling=true,Pres) then
      % Using a "local" table, i.e. one for each run.
      println(use_local_tabling=true),
      InitializeTable := true
   end,
   GarbageCollectWhen = 1000,
   if check_parameter(garbage_collect_when=When,Pres) then
      println(garbage_collect_when=When),
      GarbageCollectWhen := When
   end,
   if check_parameter(min_accepted_samples=MinAcceptedSamples,Pres) then
      % println(min_accepted_samples=MinAcceptedSamples),
      NumAcceptedSamples = get_global_map(num_accepted_samples),
      NumAcceptedSamples.put(num_accepted_samples,0),
      NumAcceptedSamples.put(count_accepted_samples,true),
      NumAcceptedSamples.put(total_num_samples,0),
      if check_parameter(show_accepted_samples=true,Pres) then
         NumAcceptedSamples.put(show_accepted_samples,true)
      else
         NumAcceptedSamples.put(show_accepted_samples,false)
      end,
      OK = false,
      NumSamples = 0,
      while(OK == false)
         if InitializeTable then
            initialize_table
         end,
         NumSamples := NumSamples + 1,
         if NumSamples mod GarbageCollectWhen == 0 then
            garbage_collect()
         end,
         Observations.clear(),
         call(Model),
         if NumAcceptedSamples.get(num_accepted_samples) >= MinAcceptedSamples then
            OK := true
         end
      end
   else
      foreach(C in 1..Runs)
         if InitializeTable then
            initialize_table
         end,
         % Clear the observations for this new run
         Observations.clear(),
         if C mod GarbageCollectWhen == 0 then
            garbage_collect()
         end,
         call(Model)
      end
   end,
   if Pres == default then
      show_all(Store)
   else
      Pres2 = [],
      foreach(P in Pres)
         % This is a configuration parameter: Parameter=Value
         if P = (Parameter=Value) then
            set_config(Parameter,Value)
         else
            Pres2 := Pres2 ++ [P]
         end
      end,
      show_all(Store,Pres2)
   end.

check_parameter(Parameter=Var,Pres) =>
   once(member(Parameter=Var,Pres)).

/*
  Observe something.

  Note: If there are multiple observations, they must be collected into
  one single expression using ()'s, passed as a list, or passed via the
  observe/2-4 wrappers below.
*/
% A list of observations. Note: this clause must come before the general
% clause below, since the general clause matches lists too.
observe(L), list(L) =>
   foreach(Obs in L)
      observe(Obs)
   end.
observe(Obs) =>
   Map = get_global_map(observations),
   if cond(Obs,true,false) == true then
      Map.update_list("observations",true)
   else
      Map.update_list("observations",false)
   end.

% Some wrappers that might be useful.
observe(Obs1,Obs2) =>
   observe(Obs1),
   observe(Obs2).
observe(Obs1,Obs2,Obs3) =>
   observe(Obs1,Obs2),
   observe(Obs3).
observe(Obs1,Obs2,Obs3,Obs4) =>
   observe(Obs1,Obs2,Obs3),
   observe(Obs4).

/*
  observe_abc(Data,Ys)
  observe_abc(Data,Ys,Scale)
  observe_abc(Data,Ys,Scale,Methods)

  This is a convenience procedure for observing that a generated dataset
  (Ys) is "the same" as the original dataset Data. It uses a simple ABC
  (Approximate Bayesian Computation) inspired method for this.

  Scale is for scaling the limit of the accepted interval (e.g. 1/4).
  Default Scale is 1.

  Methods is a list of the statistics to use. Default is to use both
  mean and standard deviation (stdev).
*/
observe_abc(Data,Ys) =>
   observe_abc(Data,Ys,1.0,[mean,stdev]).
observe_abc(Data,Ys,Scale) =>
   observe_abc(Data,Ys,Scale,[mean,stdev]).
observe_abc(Data,Ys,Scale,Methods) =>
   Mean = Data.mean,
   Stdev = Data.stdev,
   % Fix for very small standard deviation and discrete Data
   % if Stdev <= 0.01, integer(Data[1]) then
   %    observe_abc_discrete(Data,Ys)
   % else
   if Stdev <= 0.01 then
      WarningMap = get_global_map(observe_abs_warning),
      HasSeenWarning = WarningMap.get(has_seen_warning,false),
      if HasSeenWarning == false then
         printf("Stdev is very small: %.5f and might give spurious results. Please adjust the scale parameter\n", Stdev),
         WarningMap.put(has_seen_warning,true)
      end
   end,
   YsMean = Ys.mean,
   YsStdev = Ys.stdev,
   if membchk(mean,Methods) then
      observe(abs(Mean-YsMean)<=Stdev*Scale)
   end,
   if membchk(stdev,Methods) then
      observe(abs(Stdev-YsStdev)<=Stdev*Scale)
   end,
   if membchk(variance,Methods) then
      Variance = Data.variance,
      YsVariance = Ys.variance,
      observe(abs(Variance-YsVariance)<=Variance*Scale)
   end,
   if membchk(skewness,Methods) then
      Skewness = Data.skewness,
      YsSkewness = Ys.skewness,
      observe(abs(Skewness-YsSkewness)<=Stdev*Scale)
   end,
   if membchk(kurtosis,Methods) then
      Kurtosis = Data.kurtosis,
      YsKurtosis = Ys.kurtosis,
      observe(abs(Kurtosis-YsKurtosis)<=Stdev*Scale)
   end,
   if membchk(median,Methods) then
      Median = Data.median,
      YsMedian = Ys.median,
      observe(abs(Median-YsMedian)<=Stdev*Scale)
   end,
   % General version of quantiles.
   % Note: these must be $-escaped in the Methods list, e.g.
   %    observe_abc(Data,Y,1,[mean,stdev,$q(01),$q(90),$q(99)])
   Qs = [ Q : V in Methods, V=$q(Q)],
   foreach(Q in Qs)
      DataQ = data_quantile(Data,Q/100),
      YsQ = data_quantile(Ys,Q/100),
      observe(abs(DataQ - YsQ) =< Stdev*Scale)
   end,
   % And some standard versions
   if membchk(q10,Methods) then
      Q10 = data_quantile(Data,0.10),
      YsQ10 = data_quantile(Ys,0.10),
      observe(abs(Q10 - YsQ10) =< Stdev*Scale)
   end,
   if membchk(q25,Methods) then
      Q25 = data_quantile(Data,0.25),
      YsQ25 = data_quantile(Ys,0.25),
      observe(abs(Q25 - YsQ25) =< Stdev*Scale)
   end,
   if membchk(q50,Methods) then
      Q50 = data_quantile(Data,0.50),
      YsQ50 = data_quantile(Ys,0.50),
      observe(abs(Q50 - YsQ50) =< Stdev*Scale)
   end,
   if membchk(q75,Methods) then
      Q75 = data_quantile(Data,0.75),
      YsQ75 = data_quantile(Ys,0.75),
      observe(abs(Q75 - YsQ75) =< Stdev*Scale)
   end,
   if membchk(q90,Methods) then
      Q90 = data_quantile(Data,0.90),
      YsQ90 = data_quantile(Ys,0.90),
      observe(abs(Q90 - YsQ90) =< Stdev*Scale)
   end,
   % Moments
   Moments = [ K : V in Methods, V=$moment(K)],
   foreach(K in Moments)
      MData = data_moment(Data,K),
      MYs = data_moment(Ys,K),
      observe(abs(MData - MYs) =< Stdev*Scale)
   end,
   CentralMoments = [ K : V in Methods, V=$central_moment(K)],
   foreach(K in CentralMoments)
      MData = data_central_moment(Data,K),
      MYs = data_central_moment(Ys,K),
      observe(abs(MData - MYs) =< Stdev*Scale)
   end,
   StandardizedMoments = [ K : V in Methods, V=$standardized_moment(K)],
   foreach(K in StandardizedMoments)
      MData = data_standardized_moment(Data,K),
      MYs = data_standardized_moment(Ys,K),
      % dimensionless
      observe(abs(MData - MYs) =< Scale)
   end,
   if membchk(range,Methods) then
      Range = data_range(Data),
      YRange = data_range(Ys),
      observe(abs(Range - YRange) =< Stdev*Scale)
   end,
   if membchk(iqr,Methods) then
      IQR = data_iqr(Data),
      YIQR = data_iqr(Ys),
      observe(abs(IQR - YIQR) =< Stdev*Scale)
   end,
   if membchk(mad,Methods) then
      MAD = data_mad(Data),
      YMAD = data_mad(Ys),
      observe(abs(MAD - YMAD) =< Stdev*Scale)
   end,
   if membchk(sum,Methods) then
      Sum = Data.sum,
      YSum = Ys.sum,
      observe(abs(Sum - YSum) =< Stdev*Scale)
   end.

observe_abc_discrete(Data,Ys) =>
   observe_abc_discrete(Data,Ys,1).
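/*
  Example (a sketch): observing that a generated dataset Ys matches Data
  on mean, stdev, and the 10th/90th percentiles, with a tightened scale:

     observe_abc(Data,Ys,0.25,[mean,stdev,$q(10),$q(90)])

  As noted above, the parameterized statistics ($q(N), $moment(K), etc.)
  must be $-escaped in the Methods list.
*/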
observe_abc_discrete(Data,Ys,Scale) => Mean = Data.mean, IsDiscrete = cond(integer(Data[1]),true,false), if IsDiscrete then Stdev = max(Data.stdev,1) else Stdev = Data.stdev end, if Stdev <= 0.01 then WarningMap = get_global_map(observe_abs_warning), HasSeenWarning = WarningMap.get(has_seen_warning,false), if HasSeenWarning == false then printf("Stdev is very small: %.5f and might give spurious results. Please adjust the scale parameter\n", Stdev), WarningMap.put(has_seen_warning,true) end end, YsMean = Ys.mean, YsStdev = Ys.stdev, observe(abs(Mean-YsMean)<=Stdev*Scale), observe(abs(Stdev-YsStdev)<=Stdev*Scale). /* ------------------------------------------------------------ recommend_abc_stats(Data) ------------------------------------------------------------ Analyzes the dataset Data and recommends a list of ABC summary statistics to use in observe_abc/4. The goal is to automatically choose statistics that are informative and robust for the given data distribution. Logic: - Checks skewness, kurtosis, and basic spread. - Uses heuristic thresholds to classify the data as: * roughly normal * mildly skewed * strongly skewed or heavy-tailed - Adds robust or shape-sensitive statistics accordingly. Returns: Methods : list of atoms and/or $q(N) terms suitable for observe_abc/4 Example: Picat> Data = normal_dist_n(100,15,1000000), Picat> recommend_abc_stats(Data) [mean,stdev] Picat> Data = pareto_dist_n(1,3,1000000), Picat> recommend_abc_stats(Data) [median,iqr,mad,q10,q90,kurtosis] ------------------------------------------------------------ */ recommend_abc_stats(Data) = Methods => if not list(Data) then throw($error('domain_error(list,Data),recommend_abc_stats/1')) elseif length(Data) < 10 then % Too small sample: keep it simple Methods = [mean, stdev] else Skew = skewness(Data), Kurt = kurtosis(Data), MinVal = min(Data), MaxVal = max(Data), Range = MaxVal - MinVal, SD = stdev(Data), Ratio = cond(SD =:= 0.0, 0.0, Range / SD), if abs(Skew) < 0.1, abs(Kurt) < 0.5 then % Roughly normal-like Methods = [mean, stdev] elseif abs(Skew) < 0.5, abs(Kurt) < 2.0 then % Mildly skewed, moderate tails Methods = [mean, stdev, q25, q75] elseif abs(Skew) >= 0.5, abs(Kurt) < 5.0 then % Clearly skewed Methods = [median, iqr, mad, skewness] elseif abs(Skew) >= 0.5, abs(Kurt) >= 5.0 then % Strongly skewed and heavy-tailed Methods = [median, iqr, mad, q10, q90, kurtosis] else % Catch-all fallback Methods = [mean, stdev, median] end, % Optional: add range for data with extreme spread if Ratio > 10.0 then Methods := Methods ++ [range] end end. /* ------------------------------------------------------------ recommend_abc_stats_explained(Data) ------------------------------------------------------------ Analyzes a dataset and recommends a set of ABC summary statistics to use in observe_abc/4, together with a short text explanation. Returns: [Methods, Comment] Methods : list of recommended statistics Comment : short plain-text explanation of reasoning Example: Picat> Data = normal_dist_n(100,15,1000000), Picat> [Methods,Comment] = recommend_abc_stats_explained(Data), Picat> println(Methods), Picat> println(Comment). Output: [mean,stdev] "Data appears roughly symmetric with light tails. Using mean and stdev." 
  ------------------------------------------------------------
*/
recommend_abc_stats_explained(Data) = [Methods, Comment] =>
   if not list(Data) then
      throw($error('domain_error(list,Data),recommend_abc_stats_explained/1'))
   elseif length(Data) < 10 then
      Methods = [mean, stdev],
      Comment = "Sample size too small for reliable shape analysis. Using mean and stdev only."
   else
      Skew = skewness(Data),
      Kurt = kurtosis(Data),
      MinVal = min(Data),
      MaxVal = max(Data),
      Range = MaxVal - MinVal,
      SD = stdev(Data),
      Ratio = cond(SD =:= 0.0, 0.0, Range / SD),
      if abs(Skew) < 0.1, abs(Kurt) < 0.5 then
         Methods = [mean, stdev],
         Comment = "Data appears roughly symmetric with light tails. Using mean and stdev."
      elseif abs(Skew) < 0.5, abs(Kurt) < 2.0 then
         Methods = [mean, stdev, q25, q75],
         Comment = "Data shows mild skewness or moderate tails. Adding quantiles for shape."
      elseif abs(Skew) >= 0.5, abs(Kurt) < 5.0 then
         Methods = [median, iqr, mad, skewness],
         Comment = "Data is clearly skewed. Using robust statistics (median, IQR, MAD, skewness)."
      elseif abs(Skew) >= 0.5, abs(Kurt) >= 5.0 then
         Methods = [median, iqr, mad, q10, q90, kurtosis],
         Comment = "Data appears strongly skewed and heavy-tailed. Using robust shape statistics."
      else
         Methods = [mean, stdev, median],
         Comment = "Data shows irregular structure. Using combined central and robust summaries."
      end,
      % Optional: detect extreme range vs. spread
      if Ratio > 10.0 then
         Methods := Methods ++ [range],
         Comment := Comment ++ " Large range detected; added range statistic."
      end
   end.

/*
  Adjustable observe_abc. Very experimental!
*/
observe_abc_adj(Data,Ys) =>
   observe_abc_adj(Data,Ys,1.0).
observe_abc_adj(Data,Ys,Scale) =>
   AdjScaleMap = get_global_map(abc_scale),
   AdjScale = AdjScaleMap.get(abc_scale,Scale),
   % println(adjScale=AdjScale),
   Mean = Data.mean,
   Stdev = Data.stdev,
   % Fix for very small standard deviation and discrete Data
   % if Stdev <= 0.01, integer(Data[1]) then
   %    observe_abc_discrete(Data,Ys)
   % else
   if Stdev <= 0.01 then
      WarningMap = get_global_map(observe_abs_warning),
      HasSeenWarning = WarningMap.get(has_seen_warning,false),
      if HasSeenWarning == false then
         printf("Stdev is very small: %.5f and might give spurious results. Please adjust the scale parameter\n", Stdev),
         WarningMap.put(has_seen_warning,true)
      end
   end,
   YsMean = Ys.mean,
   YsStdev = Ys.stdev,
   Len = min(Data.len,Ys.len),
   DiffSum = [Ys[I]-Data[I] : I in 1..Len].sum,
   StdevScaled = Stdev*AdjScale,
   if DiffSum > 10 then
      NewAdjScale := AdjScale / 1.01,
      NewStdevScaled = Stdev*NewAdjScale,
      if NewAdjScale > 0.1, NewStdevScaled > 0.5 then
         println([data_mean=Mean,data_stdev=Stdev,ys_mean=YsMean,ys_stdev=YsStdev,diff_sum=DiffSum,diff_sum_div_n=(DiffSum/Len)]),
         AdjScale := NewAdjScale,
         AdjScaleMap.put(abc_scale,AdjScale),
         StdevScaled := NewStdevScaled,
         println("Adjusting adj scale. New"=AdjScale=(1/AdjScale)=StdevScaled)
      end
   end,
   observe(abs(Mean-YsMean)<=StdevScaled),
   observe(abs(Stdev-YsStdev)<=StdevScaled).

/*
  Check that all observations evaluate to true.
*/
observed_ok() =>
   Map = get_global_map(observations),
   if not Map.has_key("observations") then
      throw($error('observed_ok: There are no observations. Please check the model.'))
   else
      % If the user has set min_accepted_samples=...
      NumAcceptedSamples = get_global_map(num_accepted_samples),
      if NumAcceptedSamples.get(count_accepted_samples,false) == true then
         DoAcceptedSamples = true,
         NumAcceptedSamples.put(total_num_samples,NumAcceptedSamples.get(total_num_samples)+1)
      else
         DoAcceptedSamples = false
      end,
      if Map.get("observations").remove_dups == [true] then
         if DoAcceptedSamples then
            NumAcceptedSamples.put(num_accepted_samples,NumAcceptedSamples.get(num_accepted_samples,0)+1),
            if NumAcceptedSamples.get(show_accepted_samples) == true then
               AcceptedSamples = NumAcceptedSamples.get(num_accepted_samples),
               TotalSamples = NumAcceptedSamples.get(total_num_samples),
               printf("Num accepted samples: %d Total samples: %d (%.7f%%)\n", AcceptedSamples, TotalSamples, 100*AcceptedSamples/TotalSamples)
            end
         end,
         true
      else
         false
      end
   end.

data_range(Data) = max(Data) - min(Data).
data_iqr(Data) = data_quantile(Data,0.75) - data_quantile(Data,0.25).
data_mad(Data) = data_quantile([abs(X - data_quantile(Data,0.5)) : X in Data], 0.5).

/*
  ------------------------------------------------------------
  Moment Functions for Datasets
  ------------------------------------------------------------
  These functions compute the k-th raw, central, and standardized
  moments of a numeric dataset Data = [X1, X2, ..., Xn].

  ------------------------------------------------------------
  1. data_moment(Data, K)
  ------------------------------------------------------------
  Computes the raw (uncentered) k-th moment:

      m_k = (1/n) * sum( X_i^k )

  Meaning:
      Measures overall magnitude. Sensitive to both scale and
      location (shift). For example:
          m1 = mean
          m2 = mean of squares

  Parameters:
      Data : list of numeric values (length >= 1)
      K    : integer >= 1

  Returns:
      The raw moment value (float)

  Throws:
      domain_error if Data is not a list or is empty
      domain_error if K is not a positive integer

  Units:
      Same as (data unit)^K

  Example:
      data_moment([1,2,3], 3) -> 12.0

  ------------------------------------------------------------
  2. data_central_moment(Data, K)
  ------------------------------------------------------------
  Computes the central (about the mean) k-th moment:

      μ_k = (1/n) * sum( (X_i - mean)^k )

  Meaning:
      Measures shape around the mean, independent of shift.
      Examples:
          μ2 = variance
          μ3 = third central moment (raw skewness numerator)
          μ4 = fourth central moment (raw kurtosis numerator)

  Parameters:
      Data : list of numeric values (length >= 1)
      K    : integer >= 1

  Returns:
      The k-th central moment (float)

  Throws:
      domain_error if Data is not a list or is empty
      domain_error if K is not a positive integer

  Units:
      Same as (data unit)^K

  Example:
      data_central_moment([1,2,3,4,5], 2) -> 2.0

  ------------------------------------------------------------
  3. data_standardized_moment(Data, K)
  ------------------------------------------------------------
  Computes the standardized (dimensionless) k-th moment:

      γ_k = μ_k / (σ^k)

  where μ_k is the central moment and σ is the standard deviation.

  Meaning:
      Provides a scale-free measure of shape. Useful for comparing
      distributions with different units or scales.
      Examples:
          γ3 = skewness
          γ4 = kurtosis

  Parameters:
      Data : list of numeric values (length >= 2)
      K    : integer >= 1

  Returns:
      The standardized moment (float)

  Throws:
      domain_error if Data length < 2
      domain_error if variance is zero
      domain_error if K is not a positive integer

  Units:
      Dimensionless

  Example:
      data_standardized_moment([1,2,3,4,5], 4) -> 1.7

  ------------------------------------------------------------
  Implementation notes:
  ------------------------------------------------------------
  * Uses float-first arithmetic.
  * Throws explicit $error(domain_error(...)) on invalid inputs.
  * For variance or higher moments, division by N (population) is used.
    Replace with (N-1) if you prefer sample moments.
  * When variance = 0.0, standardized moments are undefined and a
    domain_error is thrown.
  * All computations use pow/2; replace with safe_pow/2 for additional
    underflow protection if desired.

  ------------------------------------------------------------
  Integration in ABC framework:
  ------------------------------------------------------------
  These functions can be used as ABC summary statistics, via
  parameterized functors such as:

      $moment(K)
      $central_moment(K)
      $standardized_moment(K)

  Example usage in observe_abc:

      elseif M = $standardized_moment(K) then
         MData = data_standardized_moment(Data,K),
         MYs = data_standardized_moment(Ys,K),
         observe(abs(MData - MYs) =< Scale)

  ------------------------------------------------------------
  Named and commonly used moments
  ------------------------------------------------------------
  Order (k)  | Moment type   | Common name | Symbol / Relation | Interpretation
  -----------+---------------+-------------+-------------------+-----------------------------
  1          | Raw / Central | Mean        | μ = E[X]          | Location (center of mass)
  2          | Central       | Variance    | σ² = E[(X - μ)²]  | Spread or dispersion
  3          | Standardized  | Skewness    | γ₁ = μ₃ / σ³      | Asymmetry (left/right tail)
  4          | Standardized  | Kurtosis    | γ₂ = μ₄ / σ⁴      | Tail heaviness / peakedness

  Notes:
  * The zeroth moment (k = 0) is always 1 for normalized distributions.
  * Moments of order higher than 4 rarely have names or practical use.
  * Higher moments (k ≥ 5) mainly emphasize rare outliers and are
    numerically unstable for finite datasets.

  ------------------------------------------------------------
  Interpretation summary of statistical moments
  ------------------------------------------------------------
  k  | What it measures        | Reliability from data
  ---+-------------------------+-----------------------------
  1  | Location (mean)         | Very stable
  2  | Scale / variability     | Stable
  3  | Asymmetry (skewness)    | Needs moderate sample size
  4  | Tail weight (kurtosis)  | Needs large sample size
  ≥5 | Finer shape details     | Highly unstable / rarely useful

  Notes:
  * The first two moments (mean and variance) are almost always reliable.
  * The 3rd and 4th moments (skewness, kurtosis) require larger samples
    to estimate accurately.
  * Moments beyond 4 are seldom useful in practice; they are dominated
    by outliers and numerical noise.
  ------------------------------------------------------------
*/

/*
  Raw k-th moment
*/
data_moment(Data, K) = M =>
   if not list(Data) then
      throw($error('domain_error(list,Data),data_moment/2'))
   elseif length(Data) == 0 then
      throw($error('domain_error(length,Data),data_moment/2'))
   elseif not integer(K) ; K < 1 then
      throw($error('domain_error(number,K),data_moment/2'))
   else
      N = length(Data),
      M = sum([pow(X, K) : X in Data]) / N
   end.

/*
  Central k-th moment
*/
data_central_moment(Data, K) = M =>
   if not list(Data) then
      throw($error('domain_error(list,Data),data_central_moment/2'))
   elseif length(Data) == 0 then
      throw($error('domain_error(length,Data),data_central_moment/2'))
   elseif not integer(K) ; K < 1 then
      throw($error('domain_error(number,K),data_central_moment/2'))
   else
      N = length(Data),
      Mean = sum(Data) / N,
      M = sum([safe_pow(X - Mean, K) : X in Data]) / N
   end.
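/*
  A small worked example (a sketch), using the values from the
  documentation above:

  Picat> M = data_central_moment([1,2,3,4,5],2)    % population variance
  M = 2.0
  Picat> G = data_standardized_moment([1,2,3,4,5],3)
  G = 0.0                                          % symmetric data: zero skewness
*/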
/*
  Standardized k-th moment
*/
data_standardized_moment(Data, K) = M =>
   N = length(Data),
   if N < 2 then
      throw($error('domain_error(length,Data),data_standardized_moment/2'))
   else
      Mean = sum(Data) / N,
      Devs = [X - Mean : X in Data],
      Var = sum([D * D : D in Devs]) / N,
      SD = sqrt(Var),
      if SD =:= 0.0 then
         throw($error('domain_error(number,SD),data_standardized_moment/2'))
      else
         M = sum([pow(D / SD, K) : D in Devs]) / N
      end
   end.

/*
  post_summary(Data)

  Prints a summary of the differences between Data and the posterior
  samples that were generated with add_post(Data,Y).
*/
post_summary(Data) =>
   println("Post summary:"),
   Store = get_store(),
   Post = [],
   Diffs = [],
   Vs = [],
   foreach(I in 1..Data.len)
      IS = I.to_string,
      P = Store.get("post "++IS).mean,
      Post := Post ++ [P],
      D = Store.get("post "++IS++" diff").mean,
      Diffs := Diffs ++ [D],
      Vs := Vs ++ [[Data[I],P,D]]
   end,
   foreach([V,P,D] in Vs)
      println([orig=V,post=P,diff=D])
   end,
   println([sum_diffs=Diffs.sum,sum_abs_diffs=Diffs.map(abs).sum]),
   nl.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% show_histogram(Data)
% ------------------------------------------------------------
% Prints an ASCII histogram for numeric (continuous/discrete)
% or symbolic data. Format:
%    value: count ### (probability / cumulative)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
show_histogram(Data) =>
   if Data == [] then
      println("Empty dataset."),
      fail
   end,
   Limit = get_config(histogram_limit,40),
   N = length(Data),
   Unique = sort(remove_dups(Data)),
   UniqueLen = Unique.len,
   % Determine data type
   if number(Data[1]), UniqueLen > 1 then
      Min = min(Data),
      Max = max(Data),
      % Heuristic: if all values are integers and the number of unique
      % values is <= Limit (default 40), treat the data as discrete
      if integer(Data[1]), length(Unique) =< Limit then
         show_histogram_discrete(Unique, Data, N)
      else
         show_histogram_continuous(Min, Max, Data, N, min(UniqueLen,Limit))
      end
   else
      show_histogram_symbolic(Data, N)
   end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Continuous numeric histogram
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
show_histogram_continuous(Min, Max, Data, N, Bins) =>
   Step = (Max - Min) / Bins,
   Counts = new_list(Bins, 0),
   foreach(X in Data)
      % Bin I covers [Min + (I-1)*Step, Min + I*Step)
      I = min(Bins, max(1, floor((X - Min) / Step).to_int() + 1)),
      Counts[I] := Counts[I] + 1
   end,
   MaxCount = max(Counts),
   Cum = 0.0,
   printf("Histogram (total %d)\n", Data.len),
   foreach(I in 1..Bins)
      X0 = Min + (I-1)*Step,
      C = Counts[I],
      P = C / N,
      Cum := Cum + P,
      BarLen = round(60 * C / MaxCount),
      Bar = repeat_string("#", BarLen),
      printf("%7.3f: %5d %-60s (%.3f / %.3f)\n", X0, C, Bar, P, Cum)
   end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Discrete numeric histogram (integers)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
show_histogram_discrete(Unique, Data, N) =>
   printf("Histogram (total %d)\n", Data.len),
   MaxCount = max([count_eq(Data, X) : X in Unique]),
   Cum = 0.0,
   foreach(X in Unique)
      C = count_eq(Data, X),
      P = C / N,
      Cum := Cum + P,
      BarLen = round(60 * C / MaxCount),
      Bar = repeat_string("#", BarLen),
      printf("%7d: %5d %-60s (%.3f / %.3f)\n", X, C, Bar, P, Cum)
   end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Symbolic histogram
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
show_histogram_symbolic(Data, N) =>
   Symbols = sort(remove_dups(Data)),
   printf("Histogram (total %d)\n", Data.len),
   MaxCount = max([count_eq(Data, S) : S in Symbols]),
   Cum = 0.0,
   foreach(S in Symbols)
      C = count_eq(Data, S),
      P = C / N,
      Cum := Cum + P,
      BarLen = round(60 * C / MaxCount),
      Bar = repeat_string("#", BarLen),
      printf("%-10w: %5d %-60s (%.3f / %.3f)\n", S, C, Bar, P, Cum)
   end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Helper: count occurrences of X in list
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
count_eq(List, X) = length([Y : Y in List, Y == X]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% show_percentiles(Data)       % default grid
% show_percentiles(Data, Ps)   % custom percentiles in (0,1)
%
% Output format:
%   Percentiles:
%   (0.01 64.50...)
%   ...
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
show_percentiles(Data) =>
   DefaultPs = [0.001,0.01,0.025,0.05,0.10,0.25,0.50,0.75,0.84,0.90,0.95,0.975,0.99,0.999,0.9999,0.99999],
   show_percentiles(Data, DefaultPs).

show_percentiles(Data, Ps1) =>
   if Data == [] then
      println("Empty dataset."),
      fail
   end,
   Ps = get_config(percentiles,Ps1),
   println("Percentiles:"),
   if number(Data[1]) then
      % ---- Numeric: R's Type-7 interpolation ----
      Sorted = sort(Data),
      foreach(P in Ps)
         Q = percentile_numeric_type7(Sorted, P),
         % Use maximum double precision (up to 17 significant digits)
         printf("(%.15g %.17g)\n", P, Q)
      end
   else
      % ---- Discrete (integers or symbols): step CDF ----
      Syms = sort(remove_dups(Data)),
      Counts = [count_eq(Data, V) : V in Syms],
      N = length(Data),
      Cum = 0.0,
      CDFPairs = [],
      foreach(I in 1..length(Syms))
         Cum := Cum + Counts[I]/N,
         CDFPairs := CDFPairs ++ [(Syms[I], Cum)]
      end,
      foreach(P in Ps)
         V = quantile_step(CDFPairs, P),
         % P at high precision; V printed exactly (int/symbol)
         printf("(%.15g %w)\n", P, V)
      end
   end.

%
% Returns the percentiles as a list of [P,Value]
%
get_percentiles(Data) = get_percentiles(Data, DefaultPs) =>
   DefaultPs = [0.001,0.01,0.025,0.05,0.10,0.25,0.50,0.75,0.84,0.90,0.95,0.975,0.99,0.999,0.9999,0.99999].

get_percentiles(Data, Ps1) = Percentiles =>
   if Data == [] then
      println("Empty dataset."),
      fail
   end,
   Ps = get_config(percentiles,Ps1),
   Percentiles = [],
   if number(Data[1]) then
      % ---- Numeric: R's Type-7 interpolation ----
      Sorted = sort(Data),
      foreach(P in Ps)
         Q = percentile_numeric_type7(Sorted, P),
         Percentiles := Percentiles ++ [[P,Q]]
      end
   else
      % ---- Discrete (integers or symbols): step CDF ----
      Syms = sort(remove_dups(Data)),
      Counts = [count_eq(Data, V) : V in Syms],
      N = length(Data),
      Cum = 0.0,
      CDFPairs = [],
      foreach(I in 1..length(Syms))
         Cum := Cum + Counts[I]/N,
         CDFPairs := CDFPairs ++ [(Syms[I], Cum)]
      end,
      foreach(P in Ps)
         V = quantile_step(CDFPairs, P),
         Percentiles := Percentiles ++ [[P,V]]
      end
   end.
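/*
  Example (a sketch): programmatic access with a custom percentile grid.
  (Assumes the percentiles configuration parameter has not been set,
  since it would override the second argument.)

  Picat> Ps = get_percentiles([1,2,3,4,5],[0.25,0.50,0.75])
  Ps = [[0.25,2.0],[0.5,3.0],[0.75,4.0]]
*/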
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Numeric percentile — R's Type-7
%   h = 1 + (n-1)p, q = x_[floor h] + (h-floor h)(x_[ceil h] - x_[floor h])
% Assumes Sorted is ascending and non-empty, p in (0,1)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
percentile_numeric_type7(Sorted, P) = Q =>
   N = length(Sorted),
   if P =< 0.0 then
      Q = Sorted[1]
   elseif P >= 1.0 then
      Q = Sorted[N]
   else
      H = 1.0 + (N - 1.0) * P,
      L = floor(H).to_int(),
      U = ceiling(H).to_int(),
      Gamma = H - L,
      Xl = Sorted[L],
      Xu = Sorted[U],
      Q = Xl + Gamma * (Xu - Xl)
   end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% quantile_step(CDFPairs, P)
% ------------------------------------------------------------
% Return the smallest Value such that cumulative >= P.
% CDFPairs = [(Value1,Cum1), (Value2,Cum2), ...]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
quantile_step(CDFPairs, P) = V =>
   V = find_quantile_step(CDFPairs, P).

find_quantile_step([], _P) = none. % should not happen if P<=1
find_quantile_step([(Val, Cum) | Rest], P) = cond(P =< Cum, Val, find_quantile_step(Rest, P)).

/*
  pdf_from_data(Data)

  Returns a list of Val=Prob for the values in Data, which
  corresponds to the PDF.
*/
pdf_from_data(Data) = Data.get_probs().sort.

/*
  cdf_from_data(Data)

  Returns a list of cumulative probabilities Val=CumProb for the
  values in Data.
*/
cdf_from_data(Data) = CDF =>
   PDF = pdf_from_data(Data),
   CDF = [],
   Cum = 0,
   foreach(V=F in PDF)
      Cum := Cum + F,
      CDF := CDF ++ [V=Cum]
   end.

/*
  cdf_from_pdf(PDF)

  Returns a list of cumulative probabilities Val=CumProb for the
  values in PDF.
*/
cdf_from_pdf(PDF) = CDF =>
   CDF = [],
   Cum = 0,
   foreach(V=F in PDF)
      Cum := Cum + F,
      CDF := CDF ++ [V=Cum]
   end.

/*
  quantile_from_cdf(CDF,Q)

  Returns the value that corresponds to the Q'th quantile in the CDF
  (generated by cdf_from_data/1 or cdf_from_pdf/1).
*/
quantile_from_cdf(CDF,Q) = Res =>
   OK = false,
   I = 1,
   while(OK == false)
      V=Cum = CDF[I],
      if Cum >= Q then
         OK := V
      else
         I := I + 1
      end
   end,
   Res = cond(OK==false,0,OK).

/*
  quantile_from_data(Data,Q)

  Returns the value that corresponds to the Q'th quantile in Data
  (via cdf_from_data/1 and quantile_from_cdf/2).
*/
quantile_from_data(Data,Q) = Res =>
   CDF = cdf_from_data(Data),
   Res = quantile_from_cdf(CDF,Q).

/*
  data_sigma(Data, Sigma)

  Given a dataset Data = [X1,X2,...,Xn] and a multiple Sigma, compute:

     Mean      = mean(Data)
     Stdev     = stdev(Data)
     Val       = Mean + Sigma * Stdev
     CDF       = fraction of data <= Val
     UpperTail = 1.0 - CDF

  Returns [Val, UpperTail].

  Parameter constraints:
  * Data must be a list of numbers (length >= 2)
  * Sigma must be numeric

  Throws explicit domain_error on invalid input.

  Cf dist_sigma/2 (in ppl_distributions):

  Picat> D=data_sigma(normal_dist_n(100,15,1000000),4)
  D = [160.05431345860336,0.000027]
  Picat> D=dist_sigma($normal_dist(100,15),4)
  D = [160.0,0.000031671241833]
*/
data_sigma(Data, Sigma) = [Val, UpperTail] =>
   if not list(Data) then
      throw($error('domain_error(list,Data),data_sigma/2'))
   elseif length(Data) < 2 then
      throw($error('domain_error(length,Data),data_sigma/2'))
   elseif not number(Sigma) then
      throw($error('domain_error(number,Sigma),data_sigma/2'))
   else
      N = length(Data),
      Mean = Data.mean,
      Stdev = Data.stdev,
      Val = Mean + Sigma * Stdev,
      % Empirical CDF: fraction of points <= Val
      CountLe = length([X : X in Data, X =< Val]),
      CDF = CountLe / N,
      UpperTail = 1.0 - CDF
   end.
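/*
  Example (a sketch) of the empirical CDF/quantile helpers:

  Picat> CDF = cdf_from_data([1,1,2,3])
  CDF = [1=0.5,2=0.75,3=1.0]
  Picat> V = quantile_from_data([1,1,2,3],0.6)
  V = 2
*/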
/* ------------------------------------------------------------ data_quantile(Data, P) ------------------------------------------------------------ Computes the empirical quantile of a dataset. Given: Data : list of numeric values P : quantile probability in [0.0, 1.0] Returns: The empirical quantile value q such that fraction of Data <= q ≈ P Method: * Sorts Data in ascending order. * Uses linear interpolation between order statistics: position = (N - 1) * P + 1 lower index = floor(position) upper index = ceiling(position) interpolates if needed. Parameter constraints: * Data must be a list of numbers, length >= 1 * P must be between 0.0 and 1.0 Throws explicit domain_error on invalid input. Note: This is a variant of get_percentiles/2 but returns only the value. */ data_quantile(Data, P) = Q => if not list(Data) then throw($error('domain_error(list,Data),data_quantile/2')) elseif length(Data) == 0 then throw($error('domain_error(length,Data),data_quantile/2')) elseif P < 0.0 ; P > 1.0 then throw($error('domain_error(number,P),data_quantile/2')) else Sorted = sort(Data), N = length(Sorted), if N == 1 then Q = Sorted[1] else Pos = (N - 1) * P + 1.0, I = floor(Pos), F = Pos - I, if I < 1 then Q = Sorted[1] elseif I >= N then Q = Sorted[N] else Lower = Sorted[I], Upper = Sorted[I + 1], Q = Lower + F * (Upper - Lower) end end end. /* A version of data_quantile/2 data_percentiles(Data,1..99) I.e. the percentiles are written as 1..99 instead of 0.01 and 0.99 */ data_percentiles(Data,Pcts) = [data_quantile(Data,Q/100) : Q in Pcts]. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % show_hpd_intervals(Data) % default masses % show_hpd_intervals(Data, Masses :: list(float)) % e.g. [0.84,0.9,0.95,0.99] % Prints: HPD interval (Mass): Lower..Upper for each Mass in list %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% show_hpd_intervals(Data) => Default = [0.5,0.84,0.90,0.95,0.99,0.999,0.9999,0.99999], show_hpd_intervals(Data, Default). show_hpd_intervals(Data, Masses1) => println("HPD intervals:"), Masses = get_config(hpd_intervals,Masses1), if Data == [] then throw("show_hpd_intervals: empty dataset") elseif not is_all_numbers(Data) then % throw("show_hpd_intervals: data must be numeric") % println("show_hpd_intervals: data is not numeric") println("") else foreach(M in Masses) (M =< 0.0 ; M > 1.0) -> throw("show_hpd_intervals: each Mass must be in (0,1]") ; (L,U) = hpd_interval(Data, M), printf("HPD interval (%w): %.14f..%.14f\n", M, L, U) end end. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % hpd_interval(Data, Mass) = (Lower, Upper) % Narrowest interval covering 'Mass' fraction of samples (numeric only). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% hpd_interval(Data, Mass) = (Lower, Upper) => (Data == []) -> throw("hpd_interval: empty dataset") ; (not is_all_numbers(Data)) -> throw("hpd_interval: data must be numeric") ; (Mass =< 0.0 ; Mass > 1.0) -> throw("hpd_interval: Mass must be in (0,1]") ; Sorted = sort(Data), N = length(Sorted), K0 = round(ceiling(Mass * N)), K = cond(K0 < 1, 1, cond(K0 > N, N, K0)), BestI = 1, BestW = 1.0e300, MaxI = N - K + 1, foreach(I in 1..MaxI) J = I + K - 1, W = Sorted[J] - Sorted[I], if W < BestW then BestW := W, BestI := I end end, Lower = Sorted[BestI], Upper = Sorted[BestI + K - 1]. /* (Equally tailed) credible intervals for a probability distribution. 
  credible_intervals_exact(Dist,Q)

  Returns the [Low,High] values of the credible interval Q.

  Note:
  * These are NOT HPD intervals, but they are simpler to calculate
    exactly for a probability distribution. In general, credible
    intervals are wider than HPD intervals.
  * I.e. this cannot be used for randomly generated data (which
    show_hpd_intervals handles).
*/
credible_intervals_exact(Dist,HPD) = [V1,V2] =>
   T = HPD / 2,
   Low = 0.5 - T,
   High = 0.5 + T,
   V1 = quantile(Dist,Low),
   V2 = quantile(Dist,High).

show_credible_intervals_exact(Dist,Qs) =>
   println("Credible intervals (exact):"),
   foreach(Q in Qs)
      [V1,V2] = credible_intervals_exact(Dist,Q),
      if integer(V1), integer(V2) then
         printf("%w: %d..%d\n",Q,V1,V2)
      else
         printf("%w: %.15f .. %.15f\n",Q,V1,V2)
      end
   end,
   nl.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Helper: ensure list is all numeric
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
is_all_numbers([]).
is_all_numbers([X|Xs]) :- number(X), is_all_numbers(Xs).

%
% Variance of Xs
%
variance(Xs) = Variance =>
   if number(Xs[1]) then
      Mu = avg(Xs),
      N = Xs.len,
      % Variance = sum([ (X-Mu)**2 : X in Xs ]) / N
      Variance = sum([ safe_pow(X-Mu,2) : X in Xs ]) / N
   else
      if Xs[1] == false ; Xs[1] == true then
         Nums = [cond(V==true,1,0) : V in Xs],
         Variance = variance(Nums)
      else
         Variance = "data is not numeric"
      end
   end.

% Standard deviation
stdev(Xs) = sqrt(variance(Xs)).

/*
  Return the frequencies of the elements in List.
*/
get_freq(List) = Map =>
   Map = new_map(),
   foreach(E in List)
      Map.put(E,Map.get(E,0)+1)
   end.

% alias for get_freq/1
collect(List) = get_freq(List).

show_freq(List) =>
   println("Show freqs:"),
   Freqs = get_freq(List),
   foreach(K in Freqs.keys.sort)
      println(K=Freqs.get(K))
   end.

show_freq_prob(List) =>
   println("Show freqs:"),
   Freqs = get_freq(List),
   Total = Freqs.values.sum,
   foreach(K in Freqs.keys.sort)
      F = Freqs.get(K),
      FT = F/Total,
      printf("%w: %d (%.16f)\n",K,F,FT)
   end.

%
% Print the values in list L and their normalized probabilities
% in decreasing order.
%
show_probs(L) =>
   println("Probabilities:"),
   foreach(K=V in get_probs(L))
      printf("%w: %.16f\n",K,V)
   end.

% Truncated version.
% Configure the size by setting the truncate_size configuration parameter.
show_probs_trunc(L) =>
   Size = get_config(truncate_size,4),
   Probs = get_probs(L),
   Len = Probs.len,
   if Len > Size * 2 + 2 then
      println("Probabilities (truncated):"),
      Probs1 = Probs[1..Size],
      Probs2 = Probs[Len-Size+1..Len],
      S = "",
      foreach(K=V in Probs1)
         S := S ++ to_fstring("%w: %.16f\n",K,V)
      end,
      S := S ++ ".........\n",
      foreach(K=V in Probs2)
         S := S ++ to_fstring("%w: %.16f\n",K,V)
      end,
      print(S)
   else
      show_probs(L)
   end.

show_probs_rat(L) =>
   println("Probabilities:"),
   foreach(K=V in get_probs(L))
      printf("%w: %w\n",K,V.and_ratf)
   end.

% Truncated version
show_probs_rat_trunc(L) =>
   Size = get_config(truncate_size,4),
   Probs = get_probs(L),
   Len = Probs.len,
   if Len > Size*2 + 2 then
      println("Probabilities (truncated):"),
      Probs1 = Probs[1..Size],
      Probs2 = Probs[Len-Size+1..Len],
      S = "",
      foreach(K=V in Probs1)
         S := S ++ to_fstring("%w: %w\n",K,V.and_ratf)
      end,
      S := S ++ ".........\n",
      foreach(K=V in Probs2)
         S := S ++ to_fstring("%w: %w\n",K,V.and_ratf)
      end,
      print(S)
   else
      show_probs_rat(L)
   end.

% Return the values of list L and their normalized probabilities
% in decreasing order.
get_probs(L) = Probs =>
   Len = L.len,
   Freq = L.get_freq.to_list.sort_down(2),
   Probs = [],
   foreach(K=V in Freq)
      Probs := Probs ++ [K=V/Len]
   end.
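/*
  Example (a sketch) of get_probs/1. Note that the order among entries
  with equal probability may vary:

  Picat> Ps = get_probs([a,b,a,c])
  Ps = [a=0.5,b=0.25,c=0.25]
*/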
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Summary statistics for numeric datasets
% ----------------------------------------------
% show_summary_stats(Data) -> prints a stats report
% Also exposes individual functions if you want programmatic access.
%
% Includes:
%  - Count, Min, Max, Sum, Mean
%  - Population/Sample Variance & Std Dev
%  - Skewness (Fisher's unbiased), Kurtosis Excess (unbiased)
%  - Median, Q1, Q3 (R Type-7 percentiles)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
show_stats(Data) => show_summary_stats(Data).
show_summary_stats(Data) =>
   (Data == []) -> throw("show_summary_stats: empty dataset") ;
   (not is_all_numbers(Data)) -> throw("show_summary_stats: data must be numeric") ;
   N = length(Data),
   Sorted = sort(Data),
   (Mean,M2,M3,M4) = online_moments(Data),
   VarPop = cond(N > 0, M2/N, 0.0),
   VarSmp = cond(N > 1, M2/(N-1), 0.0),
   StdPop = sqrt(VarPop),
   StdSmp = sqrt(VarSmp),
   Skew  = skewness_unbiased(N, M2, M3),        % throws if N<3
   KurtE = kurtosis_excess_unbiased(N, M2, M4), % throws if N<4
   Med = median_sorted(Sorted),
   Q1 = percentile_numeric_type7(Sorted, 0.25),
   Q3 = percentile_numeric_type7(Sorted, 0.75),
   println("Summary statistics"),
   printf("Count:               %d\n", N),
   printf("Min:                 %.17g\n", Sorted[1]),
   printf("Max:                 %.17g\n", Sorted[N]),
   printf("Sum:                 %.17g\n", sum(Data)),
   printf("Mean:                %.17g\n", Mean),
   printf("Variance (pop):      %.17g\n", VarPop),
   printf("Std dev (pop):       %.17g\n", StdPop),
   printf("Variance (sample):   %.17g\n", VarSmp),
   printf("Std dev (sample):    %.17g\n", StdSmp),
   printf("Skewness (unbiased): %.17g\n", Skew),
   printf("Kurtosis excess:     %.17g\n", KurtE),
   printf("Median:              %.17g\n", Med),
   printf("Q1 (25%%):            %.17g\n", Q1),
   printf("Q3 (75%%):            %.17g\n", Q3).

show_simple_stats(Data) =>
   if not number(Data[1]) then
      println("show_simple_stats: no numeric data")
   else
      println([len=Data.len,min=Data.min,mean=Data.mean,median=Data.median,max=Data.max,variance=Data.variance,stdev=Data.stdev])
   end.

% Mean value of the data
mean(Data) = Res =>
   % Convert booleans to 0/1
   if Data[1] == true ; Data[1] == false then
      Data := [cond(D==true,1,0) : D in Data]
   end,
   if number(Data[1]) then
      Res = sum(Data) / length(Data)
   else
      Res = ""
   end.

% Variance
variance_population(Data) = (M2/length(Data)) =>
   (_,M2,_,_) = online_moments(Data).

variance_sample(Data) = (M2/(length(Data)-1)) =>
   (length(Data) > 1 -> true ; throw("variance_sample: need N>1")),
   (_,M2,_,_) = online_moments(Data).

std_population(Data) = sqrt(variance_population(Data)).
std_sample(Data) = sqrt(variance_sample(Data)).

% skewness(Data) = skewness_unbiased(N,M2,M3) =>
%    N = length(Data),
%    (_, M2, M3, _) = online_moments(Data).

kurtosis_excess(Data) = kurtosis_excess_unbiased(N,M2,M4) =>
   N = length(Data),
   (_, M2, _, M4) = online_moments(Data).

% kurtosis(Data) = kurtosis_excess(Data).

/*
  ------------------------------------------------------------
  skewness(Data)
  ------------------------------------------------------------
  Computes the skewness of a numeric dataset, returning 0.0
  if the variance is zero (i.e., all values are identical).
Formula: skewness = E[(X - mean)^3] / (stdev^3) Safe behavior: * Returns 0.0 when variance = 0.0 * Throws explicit domain_error for non-list or empty Data Parameters: Data : list of numeric values (length >= 1) Returns: Skewness : float value (0.0 for constant data) Example: skewness([1,2,3,4,5]) -> 0.0 skewness([1,1,1,1]) -> 0.0 ------------------------------------------------------------ */ skewness(Data) = S => if not list(Data) then throw($error('domain_error(list,Data),safe_skewness/1')) elseif length(Data) == 0 then throw($error('domain_error(length,Data),safe_skewness/1')) else N = length(Data), Mean = sum(Data) / N, Devs = [X - Mean : X in Data], Var = sum([D * D : D in Devs]) / N, if Var =:= 0.0 then S = 0.0 else SD = sqrt(Var), % Num = sum([pow(D,3) : D in Devs]) / N, Num = sum([safe_pow(D,3) : D in Devs]) / N, S = Num / pow(SD, 3) end end. /* ------------------------------------------------------------ kurtosis(Data) ------------------------------------------------------------ Computes the excess kurtosis of a numeric dataset, returning 0.0 if the variance is zero (i.e., all values are identical). Formula: kurtosis = E[(X - mean)^4] / (stdev^4) - 3 Safe behavior: * Returns 0.0 when variance = 0.0 * Throws explicit domain_error for non-list or empty Data Parameters: Data : list of numeric values (length >= 1) Returns: Kurtosis : float value (0.0 for constant data) Example: kurtosis([1,2,3,4,5]) -> -1.3 kurtosis([1,1,1,1]) -> 0.0 kurtosis(normal_dist_n(0,1,1000000)) -> approx 0.0 ------------------------------------------------------------ */ kurtosis(Data) = K => if not list(Data) then throw($error('domain_error(list,Data),safe_kurtosis/1')) elseif length(Data) == 0 then throw($error('domain_error(length,Data),safe_kurtosis/1')) else N = length(Data), Mean = sum(Data) / N, Devs = [X - Mean : X in Data], Var = sum([D * D : D in Devs]) / N, if Var =:= 0.0 then K = 0.0 else SD = sqrt(Var), Num = sum([pow(D,4) : D in Devs]) / N, K = Num / pow(SD, 4) - 3.0 end end. median(Data) = median_sorted(sort(Data)). q1(Data) = percentile_numeric_type7(sort(Data), 0.25). q3(Data) = percentile_numeric_type7(sort(Data), 0.75). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Online one-pass moments (mean, M2, M3, M4) — Pébay/West/Chan update % Returns (Mean, M2, M3, M4); also returns N via length(Data) when needed. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% online_moments(Data) = (Mean, M2, M3, M4) => N = 0, Mean = 0.0, M2 = 0.0, M3 = 0.0, M4 = 0.0, foreach(X in Data) N1 = N + 1, Delta = X - Mean, DeltaN = Delta / N1, Term1 = Delta * DeltaN * N, M4 := M4 + Term1*DeltaN*DeltaN*(N1*N1 - 3*N1 + 3) + 6.0*DeltaN*DeltaN*M2 + 4.0*DeltaN*M3, M3 := M3 + Term1*DeltaN*(N1 - 2) - 3.0*DeltaN*M2, M2 := M2 + Term1, Mean := Mean + DeltaN, N := N1 end. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Unbiased skewness (Fisher-Pearson) and unbiased kurtosis excess %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% skewness_unbiased(N, M2, M3) = S => if N < 3 ; M2 =:= 0.0 then throw($error('skewness_unbiased: N >= 3, M2 != 0.0')) % S = 0.0/0.0 % NaN else G = M3 / pow(M2, 1.5), S = sqrt(N*(N-1)) / (N-2) * G end. 
kurtosis_excess_unbiased(N, M2, M4) = K =>
   if N < 4 ; M2 =:= 0.0 then
      throw($error('kurtosis_excess_unbiased: N >= 4, M2 != 0.0'))
      % K = 0.0/0.0 % NaN
   else
      G2 = M4 / (M2*M2) - 3.0, % excess kurtosis (biased)
      % Unbiased adjustment:
      K = ((N-1.0)/((N-2.0)*(N-3.0))) * ((N+1.0)*G2 + 6.0)
   end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Median for a sorted vector
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
median_sorted(Sorted) = Med =>
   N = length(Sorted),
   Mid = N // 2,
   Med = cond(N mod 2 == 1, Sorted[Mid+1], 0.5*(Sorted[Mid] + Sorted[Mid+1])).

/*
  draw_without_replacement(N,A)

  Return N elements from the list A, drawn without replacement.
*/
draw_without_replacement(N,A) = draw_without_replacement(N,A,[]).
draw_without_replacement(N,A,Ns) = Res =>
   if N == 0 ; A == [] then
      Res = Ns
   else
      D = uniform_draw(A),
      NewA = delete(A,D),
      Res = draw_without_replacement(N-1,NewA,Ns++[D])
   end.

% Draw all
draw_without_replacement(A) = draw_without_replacement(A.len,A).

/*
  resample(N,Data)

  Draw N elements from the list Data, with replacement.
*/
resample(N,Data) = L =>
   L = [],
   foreach(_ in 1..N)
      L := L ++ [uniform_draw(Data)]
   end.

% Draw Data.len elements from Data
resample(Data) = resample(Data.len,Data).
resample_all(Data) = resample(Data.len,Data).

/*
  draw_until_pattern(Lst,Pattern,Choices)

  Draw from Choices until the Pattern occurs.
  Return the list of the drawn elements.
*/
draw_until_pattern(Lst,Pattern,Choices) = Res =>
   if append(_,Pattern,Lst) then
      Res = Lst
   else
      Res = draw_until_pattern(Lst ++ [uniform_draw(Choices)],Pattern,Choices)
   end.

/*
  flip_until_pattern(Patterns,Lst)

  Toss a coin until one of the patterns in Patterns occurs.
  Return the generated list and the successful pattern.
*/
flip_until_pattern(Patterns,Lst) = Res =>
   MinPatternLen = [Pt.len : Pt in Patterns].min,
   LstLen = Lst.len,
   New = bern(1/2),
   if LstLen >= MinPatternLen then
      PCheck = [],
      foreach(P in Patterns, break(PCheck != []))
         if append(_,P,Lst) then
            PCheck := P
         end
      end,
      if PCheck != [] then
         Res = [Lst,PCheck]
      else
         Res = flip_until_pattern(Patterns,Lst ++ [New])
      end
   else
      Res = flip_until_pattern(Patterns,Lst ++ [New])
   end.

/*
  draw_until_one_pattern(Lst,Patterns,Choices)

  Draw from Choices until one of the patterns in Patterns occurs.
  Return the generated list and the winning pattern.
  (This is a generalization of flip_until_pattern/2.)
*/
draw_until_one_pattern(Lst,Patterns,Choices) = Res =>
   MinPatternLen = [Pt.len : Pt in Patterns].min,
   LstLen = Lst.len,
   New = uniform_draw(Choices),
   if LstLen >= MinPatternLen then
      PCheck = [],
      foreach(P in Patterns, break(PCheck != []))
         if once(append(_,P,Lst)) then
            PCheck := P
         end
      end,
      if PCheck != [] then
         Res = [Lst,PCheck]
      else
         Res = draw_until_one_pattern(Lst++[New],Patterns,Choices)
      end
   else
      Res = draw_until_one_pattern(Lst++[New],Patterns,Choices)
   end.

/*
  flip_until_n_successes(P,N)

  Returns the number of draws until N successes.
  P is the probability of success.
*/
flip_until_n_successes(P,N) = Res =>
   C = 0,
   OK = false,
   NumSuccesses = 0,
   while (OK == false)
      C := C + 1,
      B = bern(P),
      if B == 1 then
         NumSuccesses := NumSuccesses + 1
      end,
      if NumSuccesses == N then
         OK := true
      end
   end,
   Res = C.

/*
  count_occurrences(What,L)

  Returns the number of occurrences of the element What in the list L.

  Note: For many calls to count_occurrences/2 it might be better to use
  get_freq(L), which returns a map of the counts in L.
*/
count_occurrences(What,L) = [1 : T in L, T == What].sum.
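/*
  Example (a sketch):

  Picat> N = count_occurrences(head,[head,tail,head,head])
  N = 3
*/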
/*
  count_occurrences_list(What,L)

  Return the number of occurrences in L for each element in What.
*/
count_occurrences_list(What,L) = [count_occurrences(W,L) : W in What].

%
% Count the number of occurrences of the sublist Sub in the list L.
%
count_occurrences_sublist(Sub,L) = [S : S in sublists(L,Sub.len), S == Sub].len.

%
% simplex(L)
% to_probabilities(L)
%
% Normalize the values in L to sum to 1.
%
simplex(L) = [V / Sum : V in L] =>
    Sum = L.sum.
to_probabilities(L) = simplex(L).

/*
  case(Val, List)

  Given a list List of alternative [V,R] pairs, case/2 returns the
  value R for which Val == V (or the R of the default entry).

  Example:
  Toss = case(Coin,[["double-head","head"],
                    ["double-tail","tail"],
                    [default,categorical([1/2,1/2],["head","tail"])]
                   ])
*/
case(Val,List) = R =>
    if member([Val,R],List) then
        true
    else
        member([default,R],List)
    end.

/*
  cases(L)

  The list L contains pairs of [Condition,ReturnValue]; cases/2 returns
  the first return value whose condition holds.

  Example:
  FlipCoin = cases([[Coin=="double-head","head"],
                    [Coin=="double-tail","tail"],
                    [Toss=="tail","head"],
                    [Toss=="head","tail"]])
*/
cases(L) = Ret =>
    Ret = _,
    Found = false,
    foreach([V,T] in L, break(Found != false))
        if cond(V,true,false) == true then
            Ret = T,
            Found := true
        end
    end.

/*
  Wrapper for cond(... == true, Then, Else), i.e. skipping the
  "== true" part.
*/
condt(Cond,Then,Else) = cond(Cond==true,Then,Else).

/*
  check(Cond)

  Return true or false depending on the truthfulness of the condition
  Cond. (An even shorter variant of cond/3.)
*/
check(Cond) = cond(Cond, true, false).
check1(Cond) = cond(Cond, 1, 0).

/*
  mod_replace(Val,Mod)

  Return Mod if Val mod Mod == 0, else Val mod Mod.
*/
mod_replace(Val,Mod) = cond(Res==0,Mod,Res) =>
    Res = Val mod Mod.

/*
  xor(A,B)

  Exactly one of A or B is true, but not both.
*/
xor(A,B) = cond( ((A,not B) ; (not A, B)),true,false).

% Boolean -> Integer
b2i(V) = cond(V==true,1,0).

%
% Swap positions I <=> J in list L
%
swap(L,I,J) = L2, list(L) =>
    L2 = L,
    T = L2[I],
    L2[I] := L2[J],
    L2[J] := T.

% Returns a shuffled list
shuffle(L) = shuffle1(L).shuffle1.
% One pass does not seem to be enough...
shuffle1(List) = List2 =>
    List2 = List,
    Len = List.length,
    foreach(I in 1..Len)
        R2 = random(1,Len),
        List2 := swap(List2,I,R2)
    end.

/*
  all_different(L)

  True if all values in L are different.
*/
all_different(L) = L.remove_dups.len == L.len.

/*
  debug(S)

  Simple debugging (in function context): prints S and returns the
  value S.
*/
debug(S) = S => println(S).
debug(Text,S) = S => println(Text=S).

/*
  rep(N,V)

  Return N copies of the value V.
*/
rep(N,V) = [V : _ in 1..N].

/*
  print_list(L)

  Print each value in the list on a separate line.
*/
print_list(L) =>
    foreach(E in L)
        println(E)
    end.

/*
  printf_list(L)

  (Trying to) print a little more intelligent output of the list L
  (than print_list/1).
*/
check_format(V) = F =>
    if integer(V) then
        F = "%2d "
    elseif float(V) then
        F = "%.17f "
    else
        F = "%2w "
    end.

printf_list(L) =>
    Format = "",
    First = L[1],
    if list(First) ; compound(First) then
        foreach(E in First)
            Format := Format ++ check_format(E)
        end
    else
        if number(First) then
            Format := Format ++ check_format(First)
        else
            Format := Format ++ "%w "
        end
    end,
    Format := Format ++ "\n",
    foreach(E in L)
        if E = $(V '=' P) then
            call(printf,Format,V,P)
        else
            call(printf,Format,E)
        end
    end.
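
/*
  A small usage sketch (not part of the library API) for some of the
  helpers above: case/2 dispatch with a default entry, 1-based modular
  indexing with mod_replace/2, rep/2, and shuffle/1.
*/
helpers_demo =>
    Coin = "fair",
    Toss = case(Coin,[["double-head","head"],
                      ["double-tail","tail"],
                      [default,"either"]]),
    println(toss=Toss),            % -> either ("fair" only matches default)
    println(mod_replace(5,4)),     % 5 mod 4 = 1 -> 1
    println(mod_replace(8,4)),     % 8 mod 4 = 0 -> 4 (Mod replaces 0)
    println(rep(3,0)),             % -> [0,0,0]
    println(shuffle([1,2,3,4,5])). % a random permutation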
/*
  index_of_sub(List,Sub)

  Return the first position at which the sublist Sub occurs in List.
  If there is no such occurrence, return the length of List.
*/
index_of_sub(List,Sub) = Res =>
    if once(append(B,Sub,_,List)) then
        % Res = B.len+1, % return the position of the start of the sublist
        % This is what my Gamble version returned:
        Res = B.len+Sub.len % return the position of the end of the sublist
    else
        Res = List.len
    end.

/*
  sublists(L,N)

  Returns all N sized sublists of L.
*/
sublists(L,N) = [L[I..I+N-1] : I in 1..Len-N+1] =>
    Len = L.len.

/*
  ones(N,Val)

  Return a list of N Val.
*/
ones(N,Val) = [Val : _ in 1..N].

/*
  num_runs(A)

  Returns the number of runs in the list A.
*/
num_runs([]) = 0.
num_runs(A) = num_runs(A.head,A.tail,0).
num_runs(T,A,Acc) = Res =>
    if A.len == 0 then
        Res = cond(T == A, Acc, Acc+1)
    else
        FirstA = A.head,
        RestA = A.tail,
        Res = num_runs(FirstA,RestA,Acc + cond(T == FirstA,0,1))
    end.

%
% argmin/argmax
% Find the index/indices of the min/max value(s) of L.
%
% Picat> argmin([1,2,3,1000,4,5])=X
% X = [1]
%
argmin(L) = MinIxs =>
    Min = min(L),
    MinIxs = [I : I in 1..L.length, L[I] == Min].
argmax(L) = MaxIxs =>
    Max = max(L),
    MaxIxs = [I : I in 1..L.length, L[I] == Max].

%
% argmin_random_ties/argmax_random_ties
%
% Find the index of the min/max value of L.
% For ties, pick any of the candidates at random.
%
argmin_random_ties(L) = Res =>
    M = argmin(L),
    Res = uniform_draw(M).
argmax_random_ties(L) = Res =>
    M = argmax(L),
    Res = uniform_draw(M).

/*
  argsort(L) = P

  P contains the indices that permute L into a sorted list:
  L[P[1]] is the smallest value in L, L[P[2]] the second smallest, etc.

  Picat> L=[2,3,1,5,4], L.argsort=P
  L = [2,3,1,5,4]
  P = [3,1,2,5,4]
*/
argsort(L) = zip(L,1..L.len).sort.map(second).

/*
  check_duplicates(L)

  Returns true if there are any duplicates in the list L, else false.
*/
check_duplicates(L) = cond(L.remove_dups.len != L.len,true,false).

%
% equivalence(S1,S2)
%
% Equivalence of boolean statements: S1 <=> S2,
% i.e. if S1 is true -> S2 is true
%      if S2 is true -> S1 is true
%
equivalence(S1,S2) =>
    if cond(S1,true,false) == cond(S2,true,false) then
        observe(true)
    else
        observe(false)
    end.

%
% implication(S1,S2)
%
% Implication of boolean statements: S1 => S2,
% i.e. if S1 is true -> S2 is true.
% Supports both "plain" conditions as well as lambda based conditions.
%
implication(S1,S2) =>
    % Convert a lambda based condition to a plain condition
    if cond(S1 == true,true,false) == true then
        observe( cond(S2 == true,true,false) == true)
    end.

%
% Pearson correlation coefficient
% correlation_coefficient(Xs, Ys)
%   Xs, Ys : lists of equal length with numeric values
%
correlation_coefficient(Xs, Ys) = R =>
    LenX = length(Xs),
    LenY = length(Ys),
    if LenX =\= LenY then
        throw($error('correlation_coefficient: lists must have same length'))
    elseif LenX =:= 0 then
        throw($error('correlation_coefficient: empty lists'))
    else
        MeanX = sum(Xs) / LenX,
        MeanY = sum(Ys) / LenY,
        % deviations
        DXs = [X - MeanX : X in Xs],
        DYs = [Y - MeanY : Y in Ys],
        % numerator: sum((X - MeanX)*(Y - MeanY))
        Num = sum([DXs[I] * DYs[I] : I in 1..LenX]),
        % denominator: sqrt(sum(dx^2)) * sqrt(sum(dy^2))
        DenX = sqrt(sum([DX * DX : DX in DXs])),
        DenY = sqrt(sum([DY * DY : DY in DYs])),
        if DenX =:= 0.0 ; DenY =:= 0.0 then
            throw($error('correlation_coefficient: zero variance in one list'))
        else
            R = Num / (DenX * DenY)
        end
    end.
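
/*
  A small usage sketch (not part of the library API): a perfect linear
  relationship gives r = 1.0 and a perfect inverse one gives r = -1.0.
*/
correlation_demo =>
    Xs = [1,2,3,4,5],
    println(r_pos=correlation_coefficient(Xs,[2,4,6,8,10])), % -> 1.0
    println(r_neg=correlation_coefficient(Xs,[10,8,6,4,2])). % -> -1.0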
/*
  % CHECK THIS
  % Sample covariance (population form uses /N; sample uses /(N-1))
  covariance(Xs, Ys) = Cov =>
      Len = length(Xs),
      if Len =\= length(Ys) then
          throw($error('covariance: lists must have same length'))
      elseif Len =:= 0 then
          throw($error('covariance: empty lists'))
      else
          MeanX = sum(Xs)/Len,
          MeanY = sum(Ys)/Len,
          Cov = sum([(Xs[I]-MeanX)*(Ys[I]-MeanY) : I in 1..Len]) / Len
      end.
*/

/*
  % CHECK THIS
  % (Optional) sample correlation uses sample std devs; algebraically,
  % the / (N-1) factors cancel, so it equals the same r as before.
  sample_correlation(Xs, Ys) = R =>
      Len = length(Xs),
      if Len =\= length(Ys) then
          throw($error('sample_correlation: lists must have same length'))
      elseif Len < 2 then
          throw($error('sample_correlation: need at least 2 points'))
      else
          MeanX = sum(Xs)/Len,
          MeanY = sum(Ys)/Len,
          DX = [X-MeanX : X in Xs],
          DY = [Y-MeanY : Y in Ys],
          Num = sum([DX[I]*DY[I] : I in 1..Len]),
          Sxx = sum([DX[I]*DX[I] : I in 1..Len]),
          Syy = sum([DY[I]*DY[I] : I in 1..Len]),
          if Sxx =:= 0.0 ; Syy =:= 0.0 then
              throw($error('sample_correlation: zero variance in one list'))
          else
              R = Num / sqrt(Sxx*Syy) % same as correlation_coefficient/2
          end
      end.
*/

/*
  but_last(L)

  Return all but the last element in L.
*/
but_last(L) = L[1..L.length-1].

/*
  rest(L)

  Returns the rest/tail of the list L.
*/
rest(L) = tail(L).

/*
  take_last(A,N)

  Returns the last N elements of the list A.
*/
take_last(A,N) = Res =>
    if A.len < N then
        Res = []
    else
        Len = A.len,
        Res = A[Len-N+1..Len]
    end.

/*
  get_runs(As)

  Return the runs of the list As.
*/
get_runs(As) = get_runs1(As.rest, [[As.first]]).
get_runs1(As,Aux) = Res =>
    if As == [] then
        Res = Aux
    else
        ButLastAux = but_last(Aux),
        LastAux = last(Aux),
        AuxVal = LastAux.first,
        A = As.first,
        if AuxVal == A then
            Res = get_runs1(As.rest,ButLastAux ++ [[A]++LastAux])
        else
            Res = get_runs1(As.rest,Aux ++ [[A]])
        end
    end.

% Return the lengths of the runs in Xs
get_runs_lens(Xs) = get_runs(Xs).map(len).

/*
  get_runs_op(Xs,Op)

  This is a generalized version of get_runs/1: get_runs_op(Xs,$=) is
  the same as get_runs(Xs). The Op can be about anything, but it must
  adhere to the type of the elements in the list Xs.
*/
get_runs_op(Xs,Op) = get_runs_op(Xs.rest,[[Xs.first]],Op).
get_runs_op(As,Aux,Op) = Res =>
    if As == [] then
        Res = Aux
    else
        ButLastAux = but_last(Aux),
        LastAux = last(Aux),
        AuxVal = first(LastAux),
        A = first(As),
        F =.. [Op,AuxVal,A],
        % If Op holds between the current run and the new element then
        % extend the run, otherwise start a new run (this matches
        % get_runs/1 when Op is '=').
        if cond(F.call,true,false) == true then
            Res = get_runs_op(As.rest,ButLastAux ++ [LastAux++[A]],Op)
        else
            Res = get_runs_op(As.rest,Aux ++ [[A]],Op)
        end
    end.

% Generate N 0/1 runs with probability P of success
generate_01_runs(N,P) = get_runs(bern_n(P,N)).

% Return the runs that have the value Val
filter_runs(Runs,Val) = [Run : Run in Runs, Run[1] == Val].

/* Records */

/*
  get_records(A)
  get_records(A,Op)

  Return all records in the list A.
  * Op: The comparison operator, default $(>)

  Note: This supports the operators <, <=, !=, >, >= (and '=', but that
  only compares against the first element), as well as user defined
  operators.
*/
get_records(A) = get_records(A.rest,A.first,[A.first],$(>)).
get_records(A,Op) = get_records(A.rest,A.first,[A.first],Op).
get_records(A,MaxVal,Aux,Op) = Res =>
    if A == [] then
        Res = Aux
    else
        Val = A.first,
        F =.. [Op,Val,MaxVal],
        if cond(F.call,true,false) == true then
            Res = get_records(A.rest,Val,Aux++[Val],Op)
        else
            Res = get_records(A.rest,MaxVal,Aux,Op)
        end
    end.

/*
  get_records_geq(A)

  Weak records, i.e. a record is defined as >= instead of >.
*/
get_records_geq(A) = get_records(A,($>=)).
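
/*
  A small usage sketch (not part of the library API) for the run and
  record helpers above.
*/
runs_records_demo =>
    println(runs=get_runs([1,1,2,2,2,3])),          % -> [[1,1],[2,2,2],[3]]
    println(run_lens=get_runs_lens([1,1,2,2,2,3])), % -> [2,3,1]
    A = [2,1,3,3,5,4,7],
    println(records=get_records(A)),                % strict (>) records: [2,3,5,7]
    println(weak_records=get_records_geq(A)).       % weak (>=) records: [2,3,3,5,7]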
/*
  get_records_ix(A)
  get_records_ix(A,Op)

  Return all indices of the records in the list A.
  * Op: The comparison operator, default $(>)

  Note: This supports the operators <, <=, !=, >, >= (and '=', but that
  only compares against the first element), as well as user defined
  operators.
*/
get_records_ix(A) = get_records_ix(2,A.rest,A.first,[1],$(>)).
get_records_ix(A,Op) = get_records_ix(2,A.rest,A.first,[1],Op).
get_records_ix(Ix,A,MaxVal,Aux,Op) = Res =>
    if A == [] then
        Res = Aux
    else
        Val = A.first,
        F =.. [Op,Val,MaxVal],
        if cond(F.call,true,false) == true then
            Res = get_records_ix(Ix+1,A.rest,Val,Aux++[Ix],Op)
        else
            Res = get_records_ix(Ix+1,A.rest,MaxVal,Aux,Op)
        end
    end.

% >= instead of >
get_records_ix_geq(A) = get_records_ix(A,$(>=)).

/*
  comb_sum(Len,N,Sum)

  Return a list of all valid combinations of length Len with values in
  0..N that sum to Sum.
*/
comb_sum(Len,N,Sum) = solve_all(X).sort =>
    X = new_list(Len),
    X :: 0..N,
    sum(X) #= Sum.

/*
  Show the plot_ascii_scatter plot.
  Config: scatter_plot_size
*/
show_scatter_plot(Data) =>
    [Width,Height] = get_config(scatter_plot_size,[60,20]),
    show_scatter_plot(Data,Width,Height).
show_scatter_plot(Data,Width,Height) =>
    println("Scatter plot:"),
    nl,
    if (list(Data[1]),not number(Data[1,1])) ; (not list(Data[1]), not number(Data[1])) then
        println("show_scatter_plot: Data is not numeric")
    else
        [Plot,MinX,MaxX,_MinY,_MaxY] = plot_ascii_scatter(Data,Width,Height),
        foreach(P in Plot)
            println(P)
        end,
        println(x=[MinX,MaxX]),
        nl
    end.

/* ------------------------------------------------------------
  plot_ascii_scatter(Data, Width, Height)
  ------------------------------------------------------------
  Creates an ASCII scatter plot from a list of [X,Y] points.

  Parameters:
      Data   : list of [X,Y] pairs, numeric (a plain list of numbers
               is also accepted and is plotted against its indices)
      Width  : number of columns in the plot (e.g., 60)
      Height : number of rows in the plot (e.g., 20)

  Features:
      * Automatic scaling of X and Y axes
      * Uses ' ' (space) for empty cells and '*' for plotted points
      * Points outside the bounding box are ignored
      * Returns [Plot,MinX,MaxX,MinY,MaxY], where Plot is a list of
        strings (one per row)

  Example:
      Data = [[1,2],[2,3],[3,5],[4,4]],
      [Plot|_] = plot_ascii_scatter(Data,60,20),
      foreach(R in Plot) println(R) end
  ------------------------------------------------------------ */
plot_ascii_scatter(Data) = plot_ascii_scatter(Data, 60, 20).
plot_ascii_scatter(Data, Width, Height) = [Plot,MinX,MaxX,MinY,MaxY] =>
    if not list(Data) then
        throw($error('domain_error(list,Data),plot_ascii_scatter/3'))
    elseif Width =< 1 ; Height =< 1 then
        throw($error('domain_error(size,[Width,Height]),plot_ascii_scatter/3'))
    else
        % A plain list of numbers: plot the values against their indices
        if number(Data[1]) then
            Data := [[I,Data[I]] : I in 1..Data.len]
        end,
        % Extract X and Y
        % Xs = [X : [X,_] in Data],
        % Ys = [Y : [_,Y] in Data],
        [Xs,Ys] = Data.transpose,
        MinX = min(Xs), MaxX = max(Xs),
        MinY = min(Ys), MaxY = max(Ys),
        DX = MaxX - MinX,
        DY = MaxY - MinY,
        % Avoid division by zero by expanding degenerate ranges
        if DX =:= 0.0 then DX := 1.0 end,
        if DY =:= 0.0 then DY := 1.0 end,
        % Initialize blank grid as list of lists
        Grid0 = [[" " : _ in 1..Width] : _ in 1..Height],
        % Plot points
        Grid = plot_ascii_points(Data,Grid0,Width,Height,MinX,MinY,DX,DY),
        % Convert rows to strings
        Plot = [ymax=MaxY],
        Plot := Plot ++ ["|" ++ Row.join('') : Row in Grid],
        Plot := Plot ++ [ymin=MinY]
    end.
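
/*
  A small usage sketch (not part of the library API): plot a sine curve
  with the scatter plot helpers above.
*/
scatter_plot_demo =>
    Data = [[I, sin(I/5.0)] : I in 1..60],
    show_scatter_plot(Data,60,20).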
/* ------------------------------------------------------------
  plot_ascii_points(Data,Grid,Width,Height,MinX,MinY,DX,DY)
  ------------------------------------------------------------
  Internal helper: places '*' into the grid.
  ------------------------------------------------------------ */
plot_ascii_points([],Grid,_,_,_,_,_,_) = Grid.
plot_ascii_points([[X,Y]|Rest],Grid0,Width,Height,MinX,MinY,DX,DY) = Grid =>
    % Convert numeric coordinates to grid coordinates
    GXfloat = (X - MinX) * (Width - 1) / DX,
    GYfloat = (Y - MinY) * (Height - 1) / DY,
    GX = round(GXfloat),
    GY = Height - 1 - round(GYfloat), % Flip Y-axis (top=0)
    if GX >= 0, GX < Width, GY >= 0, GY < Height then
        % Place '*' at (GY,GX)
        Row = Grid0[GY+1],
        NewRow = replace_nth(Row,GX+1,"*"),
        Grid1 = replace_nth(Grid0,GY+1,NewRow),
        Grid = plot_ascii_points(Rest,Grid1,Width,Height,MinX,MinY,DX,DY)
    else
        % Point outside the plot -> ignore it
        Grid = plot_ascii_points(Rest,Grid0,Width,Height,MinX,MinY,DX,DY)
    end.

/* ------------------------------------------------------------
  replace_nth(List,N,Val)
  ------------------------------------------------------------
  Replaces the Nth element (1-based) of a list.
  ------------------------------------------------------------ */
replace_nth([_|Rest],1,Val) = [Val|Rest] => true.
replace_nth([H|Rest],N,Val) = [H|replace_nth(Rest,N-1,Val)] => N > 1.

/*
  Notes on the effect of dataset size in ABC distance metrics

  The ABC distance measures used here (Wasserstein, normalized RMSE,
  KS, and the combined abc_fit_score) behave differently depending on
  the sample sizes of the observed data and the posterior predictive
  data. The following summarizes their sensitivity.

  1. Wasserstein distance (mean absolute percentile difference)
     - Very robust to differences in sample size.
     - Depends mainly on the underlying distribution, not on how many
       samples were drawn.
     - Works well even when the two datasets have different lengths.
     - Becomes noisy only when a dataset is very small (< 50 samples).

  2. Normalized RMSE over percentiles
     - Behaves similarly to the Wasserstein distance.
     - Mostly insensitive to unequal dataset sizes.
     - Good global measure of mismatch.

  3. Kolmogorov-Smirnov (KS) statistic
     - Moderately sensitive to sample-size differences.
     - KS uses the maximum difference between empirical CDFs; the CDF
       of a smaller dataset is more "steppy", which increases noise.
     - Still reliable unless the smaller dataset is very small (< 30).

  4. Combined abc_fit_score (Wnorm + KS)
     - Dominated by the normalized Wasserstein part (Wnorm).
     - Only slightly affected by KS noise when dataset sizes differ.
     - Typical variation due to sample-size differences is small
       (often around 0.02 to 0.05 in the score).
     - Remains stable for practical ABC usage.

  Rule of thumb:
  * Data size >= 50 gives reasonable stability.
  * Data size >= 100 gives very good stability.
  * Posterior predictive size >= 1000 is ideal.

  In summary: Wasserstein and normalized RMSE are highly robust to
  unequal sample sizes, KS is moderately sensitive, and the
  abc_fit_score remains stable because the Wasserstein component
  dominates.
*/

/*
  The RMSE over percentiles measures the typical difference between the
  empirical percentiles of the observed data and those of the posterior
  predictive sample. The value is always expressed in the same units as
  the data. Small RMSE means the two distributions are similar.

  As a rule of thumb:
  - RMSE < 0.5 * stdev(data): very good agreement
  - RMSE ≈ stdev(data): moderate mismatch
  - RMSE > stdev(data): poor match
  - RMSE << IQR/10: excellent match across the entire distribution

  To compare across different scales, use RMSE / stdev(data), which
  behaves like an effect size.
*/
rmse_percentiles(Data,Post) = RMSE =>
    % Compare the two distributions on the percentile grid 1..99,
    % using data_percentiles/2 (as in abc_fit_score_explained/2)
    PData = data_percentiles(Data,1..99),
    PPost = data_percentiles(Post,1..99),
    RMSE = sqrt( sum([(Pi - Qi)**2 : {Pi,Qi} in zip(PData,PPost)]) / length(PData) ).

/*
  Relationship to "effect size"

  RMSE can be interpreted similarly to Cohen's d or other effect size
  measures:

      Normalized_RMSE = RMSE / stdev(data)

  Meaning:
      < 0.2      -> tiny difference
      0.2 - 0.5  -> small
      0.5 - 0.8  -> moderate
      0.8 - 1.2  -> large
      1.2 - 2.0  -> very large
      > 2.0      -> distributions do not match at all

  This helps compare RMSE across different scales.
*/
normalized_rmse(Data,Post) = rmse_percentiles(Data,Post) / stdev(Data).

/* --------------------------------------------------------------------
  normalized_rmse_explained(NRMSE)
  --------------------------------------------------------------------
  Given a normalized RMSE (RMSE / stdev(data)), returns a textual
  interpretation.

  Thresholds (interpreting effect size):
      NRMSE < 0.20  -> excellent match
      0.20–0.50     -> good
      0.50–0.80     -> moderate mismatch
      0.80–1.20     -> strong mismatch
      >1.20         -> poor match / distributions differ a lot

  Returns a string with a brief explanation.

  Example:
      normalized_rmse_explained(0.09)
      "Excellent match (very small difference between distributions)."
      normalized_rmse_explained(0.75)
      "Moderate mismatch; distributions differ noticeably."
  -------------------------------------------------------------------- */
normalized_rmse_explained(NRMSE) = Msg =>
    if NRMSE < 0.0 then
        throw($error('domain_error(non_negative,NRMSE),normalized_rmse_explained/1'))
    elseif NRMSE < 0.20 then
        Msg = "Excellent match (very small difference between distributions)."
    elseif NRMSE < 0.50 then
        Msg = "Good match; only small differences across the distribution."
    elseif NRMSE < 0.80 then
        Msg = "Moderate mismatch; distributions differ noticeably."
    elseif NRMSE < 1.20 then
        Msg = "Strong mismatch; posterior predictive is not close to the data."
    else
        Msg = "Poor match; distributions are substantially different."
    end.

/* --------------------------------------------------------------------
  abc_fit_score(Data, Post)
  --------------------------------------------------------------------
  Computes a single scalar measuring how well Post (posterior
  predictive) matches Data (observed data).

  Components:
      W     = Wasserstein distance over percentiles (global mismatch)
      KS    = Kolmogorov-Smirnov statistic (max CDF difference)
      Wnorm = W / stdev(Data)   (scale-free)

      Score = Wnorm + KS

  Interpretation:
      0.00–0.10  excellent
      0.10–0.30  good
      0.30–0.60  moderate
      0.60–1.00  poor
      >1.00      very poor
  -------------------------------------------------------------------- */
abc_fit_score(Data, Post) = Score =>
    % Percentile grids (1..99)
    Ps = data_percentiles(Data,1..99),
    Qs = data_percentiles(Post,1..99),
    % Wasserstein distance (global difference)
    W = sum([abs(Pi - Qi) : {Pi,Qi} in zip(Ps,Qs)]) / length(Ps),
    % Normalize by stdev(Data)
    SD = stdev(Data),
    Wnorm = cond(SD =:= 0.0, 0.0, W / SD),
    % KS statistic: max difference between the empirical CDFs
    KS = ks_distance(Data,Post),
    Score = Wnorm + KS.
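
/*
  A usage sketch (not part of the library API): compare an observed
  sample with a posterior predictive sample. normal_dist_n/3 is the
  Gaussian sampler referenced in the kurtosis/1 example above and is
  assumed to be available; the printed values vary with the draws.
*/
fit_score_demo =>
    Data = normal_dist_n(0,1,200),
    Post = normal_dist_n(0.1,1.1,1000),
    NRMSE = normalized_rmse(Data,Post),
    println(normalized_rmse=NRMSE),
    println(normalized_rmse_explained(NRMSE)),
    println(abc_fit_score=abc_fit_score(Data,Post)).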
/* ====================================================================
  abc_fit_score_explained(Data, Ys)
  ====================================================================
  Computes a distribution-agnostic ABC goodness-of-fit score based on:

      Wnorm : normalized Wasserstein distance over percentiles
      KS    : Kolmogorov-Smirnov statistic

      Score = Wnorm + KS

  Returns: [Score, Explanation]

  Interpretation:
      Score < 0.10  : excellent fit
      Score < 0.30  : good fit
      Score < 0.60  : moderate mismatch
      Score < 1.00  : strong mismatch
      Score >= 1.00 : poor fit

  Notes:
  - Wasserstein distance measures global mismatch
  - KS distance measures local maximum mismatch
  - Using percentiles makes it distribution-free
  ==================================================================== */
abc_fit_score_explained(Data, Ys) = [Score, Msg] =>
    if not list(Data) ; not list(Ys) then
        throw($error('domain_error(list,[Data,Ys]),abc_fit_score_explained/2'))
    end,
    % Percentile grid
    Ps = data_percentiles(Data,1..99),
    Qs = data_percentiles(Ys, 1..99),
    % --- Wasserstein distance (global difference)
    W = sum([abs(Pi - Qi) : {Pi,Qi} in zip(Ps,Qs)]) / length(Ps),
    % --- Normalize by stdev(Data)
    SD = stdev(Data),
    Wnorm = cond(SD =:= 0.0, 0.0, W / SD),
    % --- KS statistic (max CDF difference)
    % Using empirical CDFs based on sorted data
    KS = ks_distance(Data,Ys),
    Score = Wnorm + KS,
    Msg = interpret_fit_score(Score).

/* ====================================================================
  ks_distance(Data, Ys)
  ====================================================================
  Computes the empirical Kolmogorov-Smirnov distance.
  This uses a linear-time merge walk over sorted Data and Ys.
  ==================================================================== */
ks_distance(Data, Ys) = KS =>
    SortedD = sort(Data),
    SortedY = sort(Ys),
    ND = length(SortedD),
    NY = length(SortedY),
    I = 1,
    J = 1,
    KS0 = 0.0,
    while (I =< ND ; J =< NY)
        Xd = cond(I =< ND, SortedD[I], 1.0e100),
        Xy = cond(J =< NY, SortedY[J], 1.0e100),
        V = min(Xd,Xy),
        % Advance past all copies of V in both lists so that tied
        % values are handled correctly: both empirical CDFs jump
        % together at a shared value, and comparing them mid-jump
        % would overstate the distance.
        while (I =< ND, SortedD[I] =:= V) I := I + 1 end,
        while (J =< NY, SortedY[J] =:= V) J := J + 1 end,
        Diff = abs((I-1)/ND - (J-1)/NY),
        if Diff > KS0 then KS0 := Diff end
    end,
    KS = KS0.

/* ====================================================================
  interpret_fit_score(Score)
  ====================================================================
  Human-readable interpretation of the ABC fit score.
  ==================================================================== */
interpret_fit_score(S) = Msg =>
    if S < 0.10 then
        Msg = "Excellent fit; posterior predictive matches the data very closely."
    elseif S < 0.30 then
        Msg = "Good fit; only small distributional differences."
    elseif S < 0.60 then
        Msg = "Moderate mismatch; distributions differ in shape."
    elseif S < 1.00 then
        Msg = "Strong mismatch; posterior predictive is noticeably different."
    else
        Msg = "Poor fit; distributions differ substantially."
    end.
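
/*
  A small sanity check (not part of the library API) for ks_distance/2
  and interpret_fit_score/1: identical samples give distance 0,
  disjoint samples give distance 1.
*/
ks_distance_demo =>
    A = [1,2,3,4,5],
    println(ks_same=ks_distance(A,A)),              % -> 0.0
    println(ks_disjoint=ks_distance(A,[10,20,30])), % -> 1
    println(interpret_fit_score(0.25)).             % "Good fit; ..."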
/*
  Analyse the posterior and compare it with the data.
*/
analyse_post(Data) => analyse_post(Data,get_store().get("post")).
analyse_post(Data,Post) =>
    println("Post:"),
    % println(Post),
    show_simple_stats(Post),
    show_histogram(Post),
    show_scatter_plot(Post),
    % println(rmse=rmse_percentiles(Data,Post)),
    RMSENormalized = normalized_rmse(Data,Post),
    println(normalized_rmse=RMSENormalized),
    println(normalized_rmse_explained(RMSENormalized)),
    % println(abc_fit_score=abc_fit_score(Data, Post)),
    println(abc_fit_score_explained=abc_fit_score_explained(Data, Post)),
    nl.

analyse_post_simple(Data) => analyse_post_simple(Data,get_store().get("post")).
analyse_post_simple(Data,Post) =>
    println("Post:"),
    RMSENormalized = normalized_rmse(Data,Post),
    println(normalized_rmse=RMSENormalized),
    println(normalized_rmse_explained(RMSENormalized)),
    println(abc_fit_score_explained=abc_fit_score_explained(Data, Post)),
    nl.

% American spelling aliases
analyze_post(Data) => analyse_post(Data).
analyze_post(Data,Post) => analyse_post(Data,Post).
analyze_post_simple(Data) => analyse_post_simple(Data).
analyze_post_simple(Data,Post) => analyse_post_simple(Data,Post).

%
% Return a random Markov chain of length N, given the transition matrix
% Transitions and the initial state distribution InitState.
%
markov_chain_loop(Transitions,N,A) = Res =>
    if N == 0 then
        Res = A
    else
        State = A.last,
        Len = Transitions.first.len,
        Next = categorical(Transitions[State],1..Len),
        Res = markov_chain_loop(Transitions,N-1,A ++ [Next])
    end.
markov_chain(Transitions,InitState,N) = Res =>
    Init = categorical(InitState,1..Transitions.len),
    Res = markov_chain_loop(Transitions,N-1,[Init]).
markov_chain_n(Transitions,InitState,N,Num) =
    [markov_chain(Transitions,InitState,N) : _ in 1..Num].

/*
  one_hot(N,I)

  Create a one-hot list of length N with the single 1 in position I
  (1-based).

  Example:
  Picat> one_hot(10,5)
  [0,0,0,0,1,0,0,0,0,0]
*/
one_hot(N,I) = Res =>
    A = ones(N,0),
    A[I] := 1,
    Res = A.

/*
  Generate a random transition matrix.
*/
random_transition_matrix(N) = random_transition_matrix(N,1/N).
random_transition_matrix(N,V) = M =>
    Alpha = ones(N,V),
    M = [dirichlet_dist(Alpha) : _ in 1..N].

/*
  Print a matrix.
*/
show_matrix(M) => show_matrix(M,"Matrix").
show_matrix(M,Name) =>
    println(Name),
    foreach(Row in M)
        println(Row)
    end,
    nl.
% Aliases
print_matrix(M) => show_matrix(M).
print_matrix(M,Name) => show_matrix(M,Name).

/*
  find_first(L,Val,Default)

  Returns the position of the first occurrence of Val in the list L.
  If there is no occurrence, return Default.
*/
find_first(L,Val,Default) = cond(Pos > 0, Pos, Default) =>
    Pos = find_first_of(L,Val).

/*
  overlap(Start1,End1,Start2,End2)

  Return true if the intervals [Start1,End1] and [Start2,End2] overlap.
*/
overlap(Start1,End1,Start2,End2) = cond(max(Start1,Start2) <= min(End1,End2),true,false).
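
/*
  A usage sketch (not part of the library API): a two-state Markov
  chain. categorical/2 is the weighted sampler used by markov_chain/3
  and dirichlet_dist/1 is the Dirichlet sampler used by
  random_transition_matrix/2; both are assumed to be defined elsewhere
  in this library.
*/
markov_chain_demo =>
    T = [[0.9,0.1],
         [0.5,0.5]],
    Init = [1.0,0.0], % always start in state 1
    Chain = markov_chain(T,Init,20),
    println(chain=Chain),
    println(run_lens=get_runs_lens(Chain)),
    show_matrix(random_transition_matrix(2),"Random transition matrix").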