Approximation: Sampling¶

Reservoir Sampling: Algorithm R¶

In [1]:
# ported from https://en.wikipedia.org/wiki/Reservoir_sampling
from random import randrange

def reservoir_sample(data, n, k):
  # fill the reservoir array
  r = []
  for i in range(k):
    r.append(data[i])

  # replace elements with gradually decreasing probability
  for i in range(k, n-1):
    # randrange(a) generates a uniform integer in [0, a)
    j = randrange(i+1)
    if j < k:
        r[j] = data[i]
            
  return(r)
     
data = list(range(1000))
n = len(data)
k = 5
r = reservoir_sample(data, n, k)
r
Out[1]:
[654, 891, 526, 753, 224]

Likelihood of each value 1 to 1000 to be in the database:

In [3]:
import numpy as np
import pandas as pd

K = 5
N_SAMPLES = 10000
samples = []

N = 1000
data = list(range(N))
for j in range(N_SAMPLES):
    samples += reservoir_sample(data, N, k)

#unique, counts = np.unique(samples, return_counts=True)
ax = pd.Series(samples).plot.hist(grid=True, bins=20, rwidth=0.9,
                   color='#607c8e')
ax.set_title(f"Distribution of frequencies of all {N} values")
Out[3]:
Text(0.5, 1.0, 'Distribution of frequencies of all 1000 values')

Optimization to Reservoir: AlgorithmL¶

  • Calling the random number generator for every row can be slow.
  • Idea: after each row we choose, could we predict how many rows we'll skip?
  • Let's plot the gaps between chosen values empirically! (The sampling gap distribution).
In [5]:
import numpy as np
import pandas as pd

def plot_gaps(r):
    r.sort()
    gaps = pd.Series(np.diff(r))
    gaps.plot.hist(grid=True, bins=20, rwidth=0.9,
                   color='#607c8e')
data = list(range(100000))
n = len(data)
k = 1000
r = reservoir_sample(data, n, k)
plot_gaps(r)
  • Turns out this is approximately a geometric distribution with a closed form!
    • Won't prove here
  • So we can pick random gaps from geometric distribution
    • "skip over" the to-be-discarded inputs in between
    • only call RNG as many times as there are gaps!
    • i.e. about as many times as the size of the sample!
In [6]:
# This is called Algorithm L
# ported from https://en.wikipedia.org/wiki/Reservoir_sampling
from random import random, randrange
from math import exp, log, floor

def reservoir_sample_L(data, n, k):
  # fill the reservoir array
  r = []
  for i in range(k):
    r.append(data[i])
    
  # random.random() generates a uniform [0,1) random number
  w = exp(log(random())/k)

  while i < n:
      i = i + floor(log(random())/log(1-w)) + 1
      if i < n:
          # replace a random item of the reservoir with item i
          r[randrange(k)] = data[i]  # random index between 0 and k-1, inclusive
          w = w * exp(log(random())/k)
            
  return(r)
     
data = list(range(1000))
n = len(data)
k = 5
r = reservoir_sample_L(data, n, k)
r
Out[6]:
[910, 799, 635, 859, 946]
In [7]:
data = list(range(100000))
n = len(data)
k = 1000
r = reservoir_sample_L(data, n, k)
plot_gaps(r)

Reservoir Sampling as a Table-Valued UDF in PostgreSQL¶

We can implement a reservoir as a table function that returns (rownumber, pos) pairs and join with that to sample.

In [8]:
## replace the database connection with a database of your own!
%reload_ext sql
%sql postgresql://localhost:5432/baseball
In [9]:
%sql CREATE EXTENSION IF NOT EXISTS plpython3u; -- import extension
Running query in 'postgresql://localhost:5432/baseball'
Out[9]:
In [10]:
%%sql
-- create the reservoir_swaps UDF --
DROP TYPE IF EXISTS reservoir_pair CASCADE;
CREATE TYPE reservoir_pair AS (rownum integer, pos integer);
CREATE OR REPLACE FUNCTION reservoir_swaps(k integer, n integer) RETURNS setof reservoir_pair
    AS $$
  # optimized reservoir sampling algorithm, Algorithm L
  from random import random, randrange
  from math import exp, log, floor
  # fill the reservoir array
  r = []

  for i in range(k):
    yield((i,i))
    
  # random.random() generates a uniform [0,1) random number
  w = exp(log(random())/k)

  while i < n:
      i = i + floor(log(random())/log(1-w)) + 1
      if i < n:
          # replace a random item of the reservoir with item i
          w = w * exp(log(random())/k)
          yield(i, randrange(k))  # rand om index between 0 and k-1, inclusive
            
  return(r)
    $$
    LANGUAGE 'plpython3u'
    VOLATILE
    RETURNS NULL ON NULL INPUT;


CREATE OR REPLACE FUNCTION reservoir_rows(k integer, n integer) RETURNS setof integer
  AS $$
     SELECT MAX(rownum) AS rownum
     FROM reservoir_swaps(k, n)
     GROUP by pos $$ 
LANGUAGE 'sql'
VOLATILE;
Running query in 'postgresql://localhost:5432/baseball'
Out[10]:
In [11]:
%%sql
SELECT reservoir_rows(10, count(*)::integer) FROM batting;
Running query in 'postgresql://localhost:5432/baseball'
10 rows affected.
Out[11]:
reservoir_rows
85249
87146
46212
92054
4743
6969
50606
53969
48693
70423
Truncated to displaylimit of 10.
In [12]:
%%sql
WITH rrows AS (SELECT reservoir_rows(10, count(*)::integer) AS rows 
                 FROM batting),
     rbatting AS (SELECT row_number() over(), * 
                    FROM batting)
SELECT *
  FROM rbatting, rrows 
 WHERE row_number = rows;
Running query in 'postgresql://localhost:5432/baseball'
10 rows affected.
Out[12]:
row_number playerid yearid stint teamid lgid g ab r h h2b h3b hr rbi sb cs bb so ibb hbp sh sf gidp rows
9221 mclauby01 1978 1 SEA AL 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9221
14858 waitsri01 1979 1 CLE AL 34 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14858
37583 davieky01 2005 1 ATL NL 21 15 0 3 0 0 0 4 0 0 2 3 0 0 10 1 0 37583
41413 martite01 1975 1 SLN NL 16 21 1 4 2 0 0 2 0 0 0 2 0 0 2 0 2 41413
50420 keefeti01 1880 1 TRN NL 12 43 4 10 3 0 0 3 None None 1 12 None None None None None 50420
50455 malonji01 1962 1 CIN NL 24 43 3 8 1 0 0 3 0 0 1 4 0 0 0 0 1 50455
52302 yoshima01 1998 1 NYN NL 29 48 3 3 1 0 0 3 0 0 3 27 0 0 8 0 0 52302
61659 renkost01 1970 1 MON NL 41 80 8 16 3 0 1 6 0 0 2 18 0 0 3 0 1 61659
83031 rodrise01 2016 1 PIT NL 140 300 49 81 16 1 18 56 2 1 33 102 2 5 1 3 6 83031
100428 frymatr01 1995 1 DET AL 144 567 79 156 21 5 15 81 4 2 63 100 4 3 0 7 18 100428
Truncated to displaylimit of 10.
In [13]:
%%sql
WITH rrows AS (SELECT reservoir_rows(15, count(*)::integer) AS rows 
                 FROM batting),
     rbatting AS (SELECT row_number() over(), * 
                    FROM batting)
SELECT COUNT(*)
  FROM rbatting, rrows 
 WHERE row_number = rows;
Running query in 'postgresql://localhost:5432/baseball'
1 rows affected.
Out[13]:
count
15

Stratified Sampling¶

In [14]:
# See larger display
%config SqlMagic.displaylimit = 80
In [15]:
%%sql
-- Stratified Sampling with Reservoirs
WITH grprows AS (SELECT teamid, reservoir_rows(10, COUNT(*)::integer) AS rows 
                   FROM batting 
                  GROUP BY teamid),
     rbatting AS (SELECT row_number() over(partition by teamid), * 
                    FROM batting)
SELECT *
  FROM rbatting b, grprows g
 WHERE row_number = rows
   AND b.teamid = g.teamid
 ORDER BY b.teamid;
Running query in 'postgresql://localhost:5432/baseball'
1450 rows affected.
Out[15]:
row_number playerid yearid stint teamid lgid g ab r h h2b h3b hr rbi sb cs bb so ibb hbp sh sf gidp teamid_1 rows
7 daisege01 1884 1 ALT UA 1 4 0 0 0 0 0 None None None 0 None None None None None None ALT 7
8 gradyjo01 1884 1 ALT UA 9 36 5 11 3 0 0 None None None 2 None None None None None None ALT 8
9 noftsge01 1884 1 ALT UA 7 25 0 1 0 0 0 None None None 0 None None None None None None ALT 9
10 smithge01 1884 1 ALT UA 25 108 9 34 8 1 0 None None None 1 None None None None None None ALT 10
11 doughch01 1884 1 ALT UA 23 85 6 22 5 0 0 None None None 2 None None None None None None ALT 11
13 harrifr01 1884 1 ALT UA 24 95 10 25 2 1 0 None None None 3 None None None None None None ALT 13
14 brownji01 1884 1 ALT UA 21 88 12 22 2 2 1 None None None 1 None None None None None None ALT 14
15 shaffta01 1884 1 ALT UA 13 55 10 18 2 0 0 None None None 3 None None None None None None ALT 15
16 mooreje01 1884 1 ALT UA 20 80 10 25 3 1 1 None None None 0 None None None None None None ALT 16
56 weberbe01 2003 1 ANA AL 62 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ANA 56
107 watsoal01 1997 1 ANA AL 35 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ANA 107
130 percitr01 1999 1 ANA AL 60 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ANA 130
187 levinal01 2001 1 ANA AL 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ANA 187
239 molinbe01 2000 1 ANA AL 130 473 59 133 20 2 14 71 1 0 23 33 0 6 4 7 17 ANA 239
240 alicelu01 1997 1 ANA AL 128 388 59 98 16 7 5 37 22 8 69 65 3 8 4 2 4 ANA 240
264 erstada01 1998 1 ANA AL 133 537 84 159 39 3 19 82 20 6 43 77 7 6 1 3 2 ANA 264
265 guerrvl01 2004 1 ANA AL 156 612 124 206 39 2 39 126 15 3 52 74 14 8 0 8 19 ANA 265
312 johnske04 2000 1 ANA AL 6 4 2 2 0 0 0 0 0 0 2 0 0 0 1 0 0 ANA 312
335 appieke01 2003 1 ANA AL 19 5 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 ANA 335
128 chacijh01 2015 1 ARI NL 5 8 0 1 0 0 0 0 0 0 0 4 0 0 1 0 0 ARI 128
135 galarar01 2011 1 ARI NL 8 15 0 0 0 0 0 1 0 0 0 7 0 0 1 0 0 ARI 135
228 roberry01 2009 1 ARI NL 110 305 41 85 17 2 7 25 7 3 40 55 1 3 2 1 2 ARI 228
347 uptonju01 2008 1 ARI NL 108 356 52 89 19 6 15 42 1 4 54 121 6 4 0 3 3 ARI 347
353 tracych01 2009 1 ARI NL 98 257 29 61 15 0 8 39 1 0 26 38 7 1 0 4 3 ARI 353
413 chaveer01 2014 1 ARI NL 44 69 6 17 3 1 3 8 2 0 11 19 0 0 0 1 2 ARI 413
460 counscr01 2000 1 ARI NL 67 152 23 48 8 1 2 11 3 3 20 18 0 2 1 1 4 ARI 460
516 mileywa01 2013 1 ARI NL 34 60 2 8 3 0 1 8 0 0 1 16 0 0 8 0 3 ARI 516
577 plesada01 1999 2 ARI NL 34 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 ARI 577
689 holmbda01 2013 1 ARI NL 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 ARI 689
14 matthga01 1979 1 ATL NL 156 631 97 192 34 5 27 90 18 6 60 75 5 0 1 3 6 ATL 14
464 teherju01 2016 1 ATL NL 31 49 1 10 1 0 0 2 0 0 1 16 0 0 11 0 1 ATL 464
643 tarasto01 1993 1 ATL NL 24 35 6 8 2 0 0 2 0 1 0 5 0 1 0 1 1 ATL 643
779 woodal02 2015 1 ATL NL 20 33 1 5 1 0 0 4 0 0 3 21 0 0 5 0 1 ATL 779
907 medlekr01 2010 1 ATL NL 32 27 2 5 1 0 0 2 0 0 3 14 0 1 2 0 0 ATL 907
1016 hodgetr01 2003 1 ATL NL 52 5 0 0 0 0 0 0 0 0 0 4 0 0 1 0 0 ATL 1016
1194 joynewa01 2000 1 ATL NL 119 224 24 63 12 0 5 32 0 0 31 31 3 1 0 4 2 ATL 1194
1397 boyercl02 1969 1 ATL NL 144 496 57 124 16 1 14 57 3 7 55 87 6 4 4 3 16 ATL 1397
1735 vizcaar01 2016 1 ATL NL 43 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ATL 1735
1848 speckcl01 1986 1 ATL NL 13 3 0 0 0 0 0 0 0 0 1 3 0 0 1 0 0 ATL 1848
40 drakeol01 2017 1 BAL AL 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 BAL 40
524 tollias01 2016 1 BAL AL 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 BAL 524
665 willima05 1987 1 BAL AL 61 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 BAL 665
1109 urruthe01 2015 1 BAL AL 10 34 3 9 1 0 1 6 0 0 2 3 0 0 0 0 1 BAL 1109
1185 tillmch01 2013 1 BAL AL 33 5 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 BAL 1185
1492 jonesad01 2010 1 BAL AL 149 581 76 165 25 5 19 69 7 7 23 119 1 13 2 2 17 BAL 1492
1949 salmoch01 1969 1 BAL AL 52 91 18 27 5 0 3 12 0 0 10 22 1 2 0 1 1 BAL 1949
2147 davisch02 2017 1 BAL AL 128 456 65 98 15 1 26 61 1 1 61 195 4 3 0 4 7 BAL 2147
2196 bigbila01 2004 1 BAL AL 139 478 76 134 23 1 15 68 8 3 45 113 0 1 3 4 7 BAL 2196
2518 gibboja01 2006 1 BAL AL 90 343 34 95 23 0 13 46 0 0 32 48 2 2 0 1 12 BAL 2518
15 orourji01 1881 1 BFN NL 83 348 71 105 21 7 0 30 None None 27 18 None None None None None BFN 15
16 roweja01 1884 1 BFN NL 93 400 85 126 14 14 4 61 None None 23 14 None None None None None BFN 16
22 hornujo01 1880 1 BFN NL 85 342 47 91 8 11 1 42 None None 8 29 None None None None None BFN 22
37 eggleda01 1879 1 BFN NL 78 317 41 66 5 7 0 27 None None 11 41 None None None None None BFN 37
40 broutda01 1883 1 BFN NL 98 425 85 159 41 17 3 97 None None 16 17 None None None None None BFN 40
57 whitede01 1884 1 BFN NL 110 452 82 147 16 11 5 74 None None 32 13 None None None None None BFN 57
58 galvipu01 1879 1 BFN NL 67 265 34 66 11 6 0 27 None None 1 56 None None None None None BFN 58
66 libbyst01 1879 1 BFN NL 1 2 0 0 0 0 0 0 None None 0 1 None None None None None BFN 66
81 rittech01 1885 1 BFN NL 2 6 0 1 0 0 0 0 None None 0 2 None None None None None BFN 81
119 sullisl01 1881 1 BFN NL 35 121 13 23 4 0 0 15 None None 1 21 None None None None None BFN 119
6 staffge01 1890 1 BFP PL 15 49 11 7 1 0 0 3 2 None 7 8 None 0 None None None BFP 6
7 rainejo01 1890 1 BFP PL 42 166 29 39 5 1 1 20 12 None 24 15 None 5 None None None BFP 7
8 doefr01 1890 1 BFP PL 1 2 0 0 0 0 0 0 0 None 0 1 None 0 None None None BFP 8
10 beeched01 1890 1 BFP PL 126 536 69 159 22 10 3 90 14 None 29 23 None 7 None None None BFP 10
13 whitede01 1890 1 BFP PL 122 439 62 114 13 4 0 47 3 None 67 30 None 19 None None None BFP 13
17 clarksp01 1890 1 BFP PL 69 260 45 69 11 1 2 25 8 None 20 16 None 3 None None None BFP 17
18 irwinjo01 1890 1 BFP PL 77 308 62 72 11 4 0 34 18 None 43 19 None 4 None None None BFP 18
24 fersoal01 1890 1 BFP PL 11 32 4 7 0 0 0 2 1 None 6 7 None 1 None None None BFP 24
25 lewis01 1890 1 BFP PL 1 5 1 1 0 0 0 0 0 None 0 0 None 0 None None None BFP 25
7 boardfr01 1874 1 BL1 NA 1 4 0 1 0 0 0 0 0 0 0 0 None None None None 0 BL1 7
8 sweasch01 1874 1 BL1 NA 8 33 2 8 0 0 0 4 0 0 2 0 None None None None 0 BL1 8
21 hastisc01 1872 2 BL1 NA 13 62 16 19 3 1 0 4 0 1 1 2 None None None None 0 BL1 21
25 careyto01 1873 1 BL1 NA 56 291 76 98 18 3 1 55 2 3 1 4 None None None None 2 BL1 25
27 mannija01 1874 1 BL1 NA 42 174 32 61 8 2 0 18 0 0 2 2 None None None None 0 BL1 27
33 highadi01 1872 1 BL1 NA 50 245 72 84 10 1 2 37 4 5 2 3 None None None None 2 BL1 33
34 yorkto01 1873 1 BL1 NA 57 278 70 84 11 7 2 50 4 1 3 4 None None None None 0 BL1 34
35 mcveyca01 1873 1 BL1 NA 38 192 49 73 5 5 2 35 2 1 3 2 None None None None 1 BL1 35
42 millsev01 1873 1 BL1 NA 54 262 64 87 20 9 0 56 1 0 2 1 None None None None 2 BL1 42
46 yorkto01 1872 1 BL1 NA 51 250 66 66 10 4 0 40 6 1 3 1 None None None None 0 BL1 46
26 purcebl01 1888 1 BL2 AA 101 406 53 96 9 4 2 39 16 None 27 34 None 3 None None None BL2 26
33 smithph01 1887 1 BL2 AA 64 205 37 48 7 6 1 18 7 None 26 11 None 1 None None None BL2 33
Truncated to displaylimit of 80.