## replace the database connection with a database of your own!
%reload_ext sql
%sql postgresql://localhost:5432/baseball
%sql CREATE EXTENSION IF NOT EXISTS plpython3u; -- import extension
SELECT COUNT(*) ... WHERE col = k
WHERE col BETWEEN k AND l
SELECT COUNT(DISTINCT col)
SELECT COUNT(*) ... WHERE col = x
Incredibly simple idea!
Intuition:
col
to integerscol
, increment the counter at $A[{h(v)}]$WHERE col = x
return $A[{h(x)}]$How bad is our estimate?
Why not stop here?
Easy to write
Scales to arbitrarily large data sets!
In practice, all the hashing at construction time can be slow
PostgreSQL/MADlib example below
## MADlib is only compatible with PostgreSQL 12
## My PostgreSQL 12 installation didn't have Python3
## So let's switch connections now
%reload_ext sql
%sql postgresql://localhost:5433/baseball
RuntimeError: (psycopg2.OperationalError) connection to server at "localhost" (127.0.0.1), port 5433 failed: Connection refused Is the server running on that host and accepting TCP/IP connections? connection to server at "localhost" (::1), port 5433 failed: Cannot assign requested address Is the server running on that host and accepting TCP/IP connections? (Background on this error at: https://sqlalche.me/e/20/e3q8) If you need help solving this issue, send us a message: https://ploomber.io/community
%%sql
WITH data AS (SELECT floor(random()*10)::integer AS class,
floor(exp(log(random()*100)))::integer AS a1 FROM generate_series(1,100000)),
sketch AS (SELECT madlib.cmsketch(class) AS class_cm, madlib.cmsketch(a1) AS a1_cm FROM data)
SELECT 'sketch' as method,
madlib.cmsketch_count(class_cm, 7) as class_7, madlib.cmsketch_count(class_cm, 9) as class_9,
madlib.cmsketch_count(a1_cm, 3) as a1_3, madlib.cmsketch_count(a1_cm, 7) as a1_7
FROM sketch
UNION ALL
SELECT 'actual',
sum(CASE WHEN class = 7 THEN 1 ELSE 0 END), sum(CASE WHEN class = 9 THEN 1 ELSE 0 END),
sum(CASE WHEN a1 = 3 THEN 1 ELSE 0 END), sum(CASE WHEN a1 = 7 THEN 1 ELSE 0 END)
FROM data;
%%sql
WITH sketch AS (SELECT madlib.cmsketch(hr) AS hr_cm FROM batting)
SELECT 'sketch' as method,
madlib.cmsketch_count(hr_cm, 40)
FROM sketch
UNION ALL
SELECT 'actual',
COUNT(*)
FROM batting
WHERE hr = 40;
SELECT COUNT(DISTINCT col) FROM table
Problem:
HyperLogLog is one of many solutions to this problem.
# https://www.geeksforgeeks.org/number-of-leading-zeros-in-binary-representation-of-a-given-number/
def countZeros(x):
# Keep shifting x by one until
# leftmost bit does not become 1.
total_bits = 32
res = 0
while ((x & (1 << (total_bits - 1))) == 0):
x = (x << 1)
res += 1
return res
from random import randint
import pandas as pd
s = pd.Series([countZeros(hash(i)) for i in range(1,1000)])
s.plot.hist(grid=True, bins=20, rwidth=0.9, color='#607c8e')
Run the cell above a few times and you'll see:
1
is $50\%$10
is $25\%$. Etc.COUNT(DISTINCT)
!!%%sql
WITH data AS (SELECT floor(random()*10)::integer AS class,
floor(exp(log(random()*100)))::integer AS a1 FROM generate_series(1,100000)),
approx AS (SELECT madlib.fmsketch_dcount(class) AS class_fm, madlib.fmsketch_dcount(a1) AS a1_fm FROM data)
SELECT 'sketch' as method,
*
FROM approx
UNION ALL
SELECT 'actual',
COUNT(DISTINCT class), COUNT(DISTINCT a1)
FROM data;
Sketches are like materialized views that can approximate the answer to a class of queries.
Sketches are mostly used in high-volume streaming settings