#| 

  Order statistics with replacement - discrete distribution in Racket/Gamble 

  
  This is for order statistics with replacement for discrete distributions. 
  For continous distribution, see gamble_order_statistics_continuous_dist.rkt

  From Siegrist "Probability Mathematical Statisics and Stochastic Processes",
  Chapter "6.6: Order Statistics"
  """
  Suppose that x is a real-valued variable for a population and that x = (x1,x2, … , xn ) 
  are the observed values of a sample of size n corresponding to this variable. 
  The order statistic of rank k is the k th smallest value in the data set, and is usually 
  denoted xn:k . To emphasize the dependence on the sample size, another common notation 
  is x(k) .
  """
  And Wikipedia https://en.wikipedia.org/wiki/Order_statistic


  Example:
  4 dice are thrown, what are the probabilities of the second minimum value
  of these dice? 

    4 dice are thrown -> n=4
    2nd smallest value -> k=2
    x is the value in 1..6 for which we want to see the CDF or PDF

    order_statistics_with_replacement_discrete_<=_cdf
               (lambda (x) (discrete_uniform_dist_pdf 1 6 x))
               (lambda (x) (discrete_uniform_dist_cdf 1 6 x))
               4 1 x)

    order_statistics_with_replacement_discrete_pdf
               (lambda (x) (discrete_uniform_dist_pdf 1 6 x))
               (lambda (x) (discrete_uniform_dist_cdf 1 6 x))
               4 1 x)

   The general form is 
    order_statistics_with_replacement_discrete_*
               (PDF of the distribution
                CDF of the distribution
               n k x)

   Here's the CDF
   (1 671/1296 0.5177469135802469)
   (2 65/81 0.8024691358024691)
   (3 15/16 0.9375)
   (4 80/81 0.9876543209876543)
   (5 1295/1296 0.9992283950617284)
   (6 1 1.0)
 
   and the PDF
   (1 671/1296 0.5177469135802469)
   (2 41/144 0.2847222222222222)
   (3 175/1296 0.13503086419753085)
   (4 65/1296 0.05015432098765432)
   (5 5/432 0.011574074074074073)
   (6 1/1296 0.0007716049382716049)

  I.e. the probability that 1 is the second smallest value is
  0.5177469135802469

  This corresponds to Mathematica's OrderDistribution which 
  is for order statistics with replacement, e.g.
     dist = OrderDistribution[{DiscreteUniformDistribution[{1, 6}], 4}, 1]
     PDF[dist, 2]
     -> 0.284722
     CDF[dist, 2]
     -> 0.802469


  In general
   dist = OrderDistribution[{DiscreteUniformDistribution[{a, b}], b}, k]
  i.e. it takes all samples (not a part of the samples as in 
  statistics_without_replacement_dist_*
   

  For order statistics without replacement, see 
  gamble_order_statistics_without_replacement_dist.rkt)


  This program was created by Hakan Kjellerstrand, hakank@gmail.com
  See also my Racket page: http://www.hakank.org/racket/

|#

#lang gamble

; (require gamble/viz)
(require racket)
(require "gamble_utils.rkt")
(require "gamble_distributions.rkt")


; Discrete uniform
(let ([a 1]
      [b 10]
      [n 10]
      [k 10])
  (newline)
  (displayln "Testing with discrete_uniform_dist")
  (displayln (format "a:~a b:~a n:~a k:~a" a b n k))
  
  (displayln "PDF")
  (displayln "order_statistics_with_replacement_discrete_pdf pdf cdf n k x")
  (displayln "Testing with discrete_uniform_dist")
  
  (show "sum" (sum (for/list ([i (range a (add1 b))])
                     (let ([t (order_statistics_with_replacement_discrete_pdf
                               (lambda (x) (discrete_uniform_dist_pdf a b x))
                               (lambda (x) (discrete_uniform_dist_cdf a b x))
                               n k i)])
                       (show2 i t (* 1.0 t ))
                       t))))
  (newline)
  
  (displayln "CDF")
  (displayln "order_statistics_with_replacement_discrete_<=_cdf pdf cdf n k x")
  (for/list ([i (range a (add1 b))])
    (let ([t (order_statistics_with_replacement_discrete_<=_cdf
              (lambda (x) (discrete_uniform_dist_pdf a b x))
              (lambda (x) (discrete_uniform_dist_cdf a b x))
              n k i)])
      (show2 i t (* 1.0 t ))
      t))
  (newline)


  )

;; (newline)
;; (displayln "Testing PDF with poisson 3, 4 samples, k=1")
;; (show "sum" (sum (for/list ([i (range 0 17)])
;;   (let ([t (order_statistics_with_replacement_discrete_pdf
;;             (lambda (x) (poisson_dist_pdf 3 x))
;;             (lambda (x) (poisson_dist_cdf 3 x))
;;             4 1 i)])
;;     (show2 i t (* 1.0 t ))
;;     t))))

;; (newline)
;; (displayln "Testing PDF with binomial 10 0.5, 3 samples, k=2")
;; (show "sum" (sum (for/list ([i (range 0 17)])
;;   (let ([t (order_statistics_with_replacement_discrete_pdf
;;             (lambda (x) (binomial_dist_pdf 10 0.5 x))
;;             (lambda (x) (binomial_dist_cdf 10 0.5 x))
;;             3 2 i)])
;;     (show2 i t (* 1.0 t ))
;;     t))))
;; (newline)

(newline)
(displayln "Testing CDF with binomial 10 0.5, 25 samples, k=25")
(show "sum" (sum (for/list ([i (range 0 11)])
  (let ([t (order_statistics_with_replacement_discrete_<=_cdf
            (lambda (x) (binomial_dist_pdf 10 1/2 x))
            (lambda (x) (binomial_dist_cdf 10 1/2 x))
            25 25 i)])
    (show2 i t (* 1.0 t ))
    t))))
(newline)


#|
 Model for comparing with the theoretical.

 * dice problem, discrete_uniform 1..6, k=4
   theoretical:
  (1 671/1296 0.5177469135802469)
  (2 41/144 0.2847222222222222)
  (3 175/1296 0.13503086419753085)
  (4 65/1296 0.05015432098765432)
  (5 5/432 0.011574074074074073)
  (6 1/1296 0.0007716049382716049)
  sum: 1


  enumerate
  variable : v
  1: 671/1296 (0.5177469135802469)
  2: 41/144 (0.2847222222222222)
  3: 175/1296 (0.13503086419753085)
  4: 65/1296 (0.05015432098765432)
  5: 5/432 (0.011574074074074073)
  6: 1/1296 (0.0007716049382716049)
  mean: 2275/1296 (1.7554012345679013)

  This is a perfect match!


 * poisson 3 with k=1
   Theoretical
  (0 0.18476325541545102 0.18476325541545102)
  (1 0.4038896209941395 0.4038896209941395)
  (2 0.3006513907542001 0.3006513907542001)
  (3 0.09520909011577983 0.09520909011577983)
  (4 0.014321944941076775 0.014321944941076775)
  (5 0.0011151049028135622 0.0011151049028135622)
  (6 4.833214743376517e-5 4.833214743376517e-5)
  (7 1.240645337367781e-6 1.240645337367781e-6)
  (8 1.9874597090069756e-8 1.9874597090069756e-8)
  (9 2.0769378794230991e-10 2.0769378794230991e-10)
  (10 1.4701721908764935e-12 1.4701721908764935e-12)
  (11 7.256888437990838e-15 7.256888437990838e-15)
  (12 -7.542531655543706e-17 -7.542531655543706e-17)
  (13 1.920935136681248e-16 1.920935136681248e-16)
  (14 -1.6182973545380112e-17 -1.6182973545380112e-17)
  (15 7.812398900803163e-17 7.812398900803163e-17)
  (16 -1.8107632007198144e-16 -1.8107632007198144e-16)
  sum: 0.9999999999999998

  The model works quite well up to about 8, after that it differs quite much
  Model (enumerate #:limit 1e-11)
  1: 0.4038896209964249
  2: 0.30065139075571795
  0: 0.18476325541540473
  3: 0.09520909011536116
  4: 0.014321944940108572
  5: 0.001115104901999425
  6: 4.833214679082736e-5
  7: 1.2406448903676498e-6
  8: 1.9874316592297865e-8
  9: 2.0754799170551148e-10
  10: 1.4348995331000185e-12
  11: 2.3832991203146688e-15
  mean: 1.3539818153961933

  With importance-sampler (1 000 000 samples)
  variable : v
  1: 0.4047730000000001
  2: 0.30038200000000004
  0: 0.18451900000000002
  3: 0.09495300000000002
  4: 0.014284000000000003
  5: 0.0010410000000000003
  6: 4.700000000000001e-5
  7: 1.0000000000000002e-6
  mean: 1.3530260000000003

  * binomial 10 0.5 3 samples, k=2
  Theoretical 
(0 2.859160304069519e-6 2.859160304069519e-6)
(1 0.00034084543585777283 0.00034084543585777283)
(2 0.008301353082060814 0.008301353082060814)
(3 0.0698232650756836 0.0698232650756836)
(4 0.24068735539913177 0.24068735539913177)
(5 0.36168864369392395 0.36168864369392395)
(6 0.24068735539913177 0.24068735539913177)
(7 0.0698232650756836 0.0698232650756836)
(8 0.008301353082060814 0.008301353082060814)
(9 0.00034084543585777283 0.00034084543585777283)
(10 2.859160304069519e-6 2.859160304069519e-6)

  enumerate
  variable : v
  5: 0.36168864369392384
  4: 0.2406873553991318
  6: 0.2406873553991318
  3: 0.06982326507568361
  7: 0.06982326507568361
  2: 0.008301353082060842
  8: 0.008301353082060842
  1: 0.00034084543585777174
  9: 0.00034084543585777174
  0: 2.859160304069521e-6
  10: 2.859160304069521e-6
  mean: 5.0

|#

(displayln "\nModel")
(define (model)
  (enumerate ; #:limit 1e-11
   ; rejection-sampler
   ; importance-sampler
   ; mh-sampler

   ; #|
   ; 4 dice are rolled, what is the min (k=1) or max (k=4))
   (define m 6)
   (define n 4)
   (define k 2)

   ; 5 numbers in the range of 1..25 are drawn,
   ; What are the probabilities of the 3rd smallest value?
   ; (This takes about 30s for enumerate)
   ; (define m 25)
   ; (define n 5)
   ; (define k 3)

   ; With replacement
   (define x (for/list ([i n]) (add1 (random-integer m))))
   ; |#


   ; Poisson 3, 4 samples, k=1 (use #:limit 1e-11 or smaller with enumerate)
   ; (define x (for/list ([i 4]) (poisson 3)))
   ; (define k 1)
   
   ; Binomial 10 0.5, 3 samples, k=2
   ;; (define x (for/list ([i 3]) (binomial 10 1/2)))
   ;; (define k 2)

   ; Just testing: 10 values of normal 100 15, what is the 3 smallest value
   ; 
   ; (define x (for/list ([i 10]) (normal 100 15)))
   
   
   (define x_sorted (sort x <))
   ; (show "x" x_sorted)
   ; Order statistics is 1-based 
   (define v (ith-smallest x_sorted (sub1 k) #t))

   (define p (>= v 9))
   (list v
         p
         ; x
         )
   
   )
)

(show-marginals (model)
                (list  "v"
                       "p"
                       "x"
                       
                       )
                #:num-samples 100000
                ; #:truncate-output 5
                ; #:skip-marginals? #t
                ; #:show-stats? #t
                ; #:credible-interval 0.84
                #:hpd-interval (list 0.95)
                ; #:show-histogram? #t
                ; #:show-percentiles? #t
                ; #:burn 0
                ; #:thin 0
                )