Lecture 15: Imputation and Entity Resolution¶

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)

sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline

GNIS data¶

This is the GNIS dataset from the previous lecture. If you didn't load in the database, run the below cell before connecting.

In [2]:
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis' 
!psql -h localhost -d gnis -f ../lec13/data/gnis.sql
 pg_terminate_backend 
----------------------
(0 rows)

DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 3195
COPY 11533
CREATE INDEX
In [3]:
%reload_ext sql
%sql postgresql://localhost:5432/gnis

Make a "holey" dataset¶

In [4]:
%sql SELECT setseed(0.12345);
Running query in 'postgresql://localhost:5432/gnis'
1 rows affected.
Out[4]:
setseed
In [5]:
%%sql
DROP TABLE IF EXISTS holey CASCADE;
CREATE TABLE holey AS 
SELECT feature_id, feature_name, feature_class, state_alpha, county_name, 
       prim_lat_dec, prim_long_dec, 
       CASE WHEN random() > 0.9 THEN NULL
            ELSE elev_in_m
        END AS elev_in_m
  FROM national;
SELECT count(elev_in_m)::float / count(*) FROM holey;
Running query in 'postgresql://localhost:5432/gnis'
11533 rows affected.
1 rows affected.
Out[5]:
?column?
0.8755744385675887
In [6]:
%%sql
SELECT * FROM holey TABLESAMPLE BERNOULLI(.1);
Running query in 'postgresql://localhost:5432/gnis'
13 rows affected.
Out[6]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m
129624 Jernigan Lake Reservoir AL Escambia 31.0448271 -87.1856044 35.0
542541 Poley Populated Place LA Livingston 30.3382495 -90.7964848 3.0
729853 Burkhart Prairie Area MO Newton 36.9617304 -94.483 348.0
901807 First Baptist Church Church NM Otero 32.8995325 -105.960265 1323.0
1156802 Multnomah Falls Bar Bar OR Multnomah 45.5831737 -122.1234199 4.0
1347909 Stevens Park School School TX Dallas 32.7601302 -96.8650032 176.0
1477093 Barley Populated Place VA Greensville 36.5640382 -77.7241508 86.0
1541653 Left Fork Elk Creek Stream WV Mingo 37.7301032 -82.1334652 269.0
1726318 Teton Mine Mine WY Natrona 42.8896821 -107.4520133 2111.0
2080975 Clay County Courthouse Building MN Clay 46.8841667 -96.7633333 275.0
Truncated to displaylimit of 10.

Schema 1. Default Value Imputation with SQL¶

In [7]:
%%sql
WITH elevavg AS (SELECT avg(elev_in_m) FROM holey)
SELECT h.*, 
       CASE WHEN h.elev_in_m IS NOT NULL THEN h.elev_in_m
            ELSE e.avg
        END AS imputed_elev_in_m
  FROM holey h, elevavg e
LIMIT 10;
Running query in 'postgresql://localhost:5432/gnis'
10 rows affected.
Out[7]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m imputed_elev_in_m
1230 Belmont Mountains Range AZ Maricopa 33.642258 -112.9010129 931.0 931.0
1839 Bootlegger Saddle Gap AZ Cochise 31.8931474 -109.2831176 2707.0 2707.0
2336 Cabeza Prieta Game Range Park AZ Yuma 32.250056 -113.45074 275.0 275.0
2750 Chandler Springs Spring AZ Navajo 35.3766788 -110.4754096 1685.0 1685.0
3032 Cline Well Well AZ Cochise 31.9000849 -110.3428525 None 483.962467815409
3039 Clover Tank Reservoir AZ Gila 33.8509816 -110.2577249 1563.0 1563.0
3060 Coat Spring Spring AZ Navajo 36.12678 -110.3330424 1926.0 1926.0
3143 Comar Spring Spring AZ Navajo 35.5308428 -110.4162424 1732.0 1732.0
3333 Cottonwood Creek Stream AZ Coconino 36.050817 -111.9865535 800.0 800.0
3342 Cottonwood Creek Stream AZ Mohave 36.901931 -112.5632547 1389.0 1389.0
Truncated to displaylimit of 10.

Scheme 2. Correlation across columns¶

In [8]:
# Training is an aggregate function
# Here we'll train the model in SQL just for fun
result = %sql SELECT regr_slope(elev_in_m, prim_long_dec), \
               regr_intercept(elev_in_m, prim_long_dec) FROM holey
slope, intercept = result[0]
slope, intercept
Running query in 'postgresql://localhost:5432/gnis'
1 rows affected.
Out[8]:
(-10.314179001097786, -477.9603219322606)
In [9]:
%%sql
SELECT prim_long_dec, elev_in_m
FROM holey
WHERE prim_long_dec >= -122
ORDER BY prim_long_dec
OFFSET 40;
Running query in 'postgresql://localhost:5432/gnis'
10834 rows affected.
Out[9]:
prim_long_dec elev_in_m
-121.8142546 469.0
-121.8099602 None
-121.8083604 1454.0
-121.8056805 None
-121.803556 1614.0
-121.7974945 1264.0
-121.7960906 219.0
-121.784236 1684.0
-121.7832846 2.0
-121.7785678 46.0
Truncated to displaylimit of 10.
In [10]:
%%sql
SELECT *, 
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE {{slope}}*prim_long_dec + {{intercept}}
        END AS imputed_elev_in_m
  FROM holey
LIMIT 10;
Running query in 'postgresql://localhost:5432/gnis'
10 rows affected.
Out[10]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m imputed_elev_in_m
1230 Belmont Mountains Range AZ Maricopa 33.642258 -112.9010129 931.0 931.0
1839 Bootlegger Saddle Gap AZ Cochise 31.8931474 -109.2831176 2707.0 2707.0
2336 Cabeza Prieta Game Range Park AZ Yuma 32.250056 -113.45074 275.0 275.0
2750 Chandler Springs Spring AZ Navajo 35.3766788 -110.4754096 1685.0 1685.0
3032 Cline Well Well AZ Cochise 31.9000849 -110.3428525 None 660.1356102444698
3039 Clover Tank Reservoir AZ Gila 33.8509816 -110.2577249 1563.0 1563.0
3060 Coat Spring Spring AZ Navajo 36.12678 -110.3330424 1926.0 1926.0
3143 Comar Spring Spring AZ Navajo 35.5308428 -110.4162424 1732.0 1732.0
3333 Cottonwood Creek Stream AZ Coconino 36.050817 -111.9865535 800.0 800.0
3342 Cottonwood Creek Stream AZ Mohave 36.901931 -112.5632547 1389.0 1389.0
Truncated to displaylimit of 10.

Scheme 3. General model-based interpolation¶

We won't show the demo; check slides for the general idea.

In [11]:
%reload_ext sql
%sql postgresql://localhost:5432/gnis
%config SqlMagic.displaylimit = 100

Scheme 4. [simple] Correlation across ordered rows¶

In [12]:
%%sql
-- The following does not work in PostgreSQL!
WITH buggy AS (
SELECT *,
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE lag(elev_in_m, 1)
                 OVER (ORDER BY feature_id)
        END AS imputed_elev_in_m
  FROM holey
)
SELECT feature_id, elev_in_m, imputed_elev_in_m FROM buggy ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[12]:
feature_id elev_in_m imputed_elev_in_m
48638 46.0 46.0
49192 401.0 401.0
49214 194.0 194.0
49350 None 194.0
49578 None None
49802 47.0 47.0
49925 111.0 111.0
50059 71.0 71.0
50309 None 71.0
50661 26.0 26.0
50948 222.0 222.0
51021 70.0 70.0
In [13]:
%%sql
-- Here's a UDA fix from
-- https://stackoverflow.com/questions/18987791/how-do-i-efficiently-select-the-previous-non-null-value
CREATE OR REPLACE FUNCTION coalesce_agg_sfunc(state anyelement, value anyelement) RETURNS anyelement AS
$$
    SELECT coalesce(value, state);
$$ LANGUAGE SQL;

CREATE OR REPLACE AGGREGATE coalesce_agg(anyelement) (
    SFUNC = coalesce_agg_sfunc,
    STYPE  = anyelement);
Running query in 'postgresql://localhost:5432/gnis'
Out[13]:
In [14]:
%%sql
-- Fixed to handle repeated NULLs
WITH fixed AS (
SELECT *,
       coalesce_agg(elev_in_m) OVER (order by feature_id) AS imputed_elev_in_m
  FROM holey
)
SELECT feature_id, elev_in_m, imputed_elev_in_m FROM fixed ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[14]:
feature_id elev_in_m imputed_elev_in_m
48638 46.0 46.0
49192 401.0 401.0
49214 194.0 194.0
49350 None 194.0
49578 None 194.0
49802 47.0 47.0
49925 111.0 111.0
50059 71.0 71.0
50309 None 71.0
50661 26.0 26.0
50948 222.0 222.0
51021 70.0 70.0

Generalized interpolation across rows¶

Forward pass

In [15]:
%%sql
-- 1. Forward assign run numbers to rows, propagate val, get nextval
CREATE OR REPLACE VIEW forward AS
SELECT *,
       SUM(CASE WHEN elev_in_m IS NULL THEN 0 ELSE 1 END) 
         OVER (ORDER BY feature_id) AS run,
       coalesce_agg(elev_in_m) OVER (ORDER BY feature_id) AS run_start,
       CASE WHEN elev_in_m IS NULL 
              THEN lead(elev_in_m, 1) OVER (ORDER BY feature_id)
            ELSE NULL
             END AS nextval
  FROM holey;
SELECT * FROM forward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[15]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m run run_start nextval
48638 Dry Creek Stream AR Union 33.3242919 -92.9121074 46.0 168 46.0 None
49192 Foster Cemetery Cemetery AR Benton 36.4612601 -94.1579926 401.0 169 401.0 None
49214 Fox Hollow Valley AR Sharp 36.4575632 -91.4387492 194.0 170 194.0 None
49350 Lewis Johnson Family Cemetery Cemetery AR Johnson 35.4122994 -93.3587851 None 170 194.0 None
49578 Grassy Lake Swamp AR Hempstead 33.6887443 -93.8693011 None 170 194.0 47.0
49802 Hamilton Creek Stream AR Ouachita 33.7592735 -92.9482208 47.0 171 47.0 None
49925 Hartsoe Cemetery Cemetery AR Greene 36.1833735 -90.4370362 111.0 172 111.0 None
50059 Hickory Grove Cemetery Cemetery AR Lonoke 34.6431418 -92.028191 71.0 173 71.0 None
50309 Hopewell Church Church AR Arkansas 34.3839893 -91.1540068 None 173 71.0 26.0
50661 John Mack Slough Gut AR Calhoun 33.3431778 -92.4573712 26.0 174 26.0 None
50948 Lake Norfork Church Church AR Baxter 36.3853437 -92.224323 222.0 175 222.0 None
51021 Leachville Cemetery Cemetery AR Jackson 35.5949353 -91.243365 70.0 176 70.0 None

Backward pass

In [16]:
%%sql
-- 2. Backward: assign run_end, run_size, run_rank
CREATE OR REPLACE VIEW backward AS
SELECT *,
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE coalesce_agg(nextval) OVER (PARTITION BY run ORDER BY feature_id DESC)
        END AS run_end,
       count(*) OVER (PARTITION BY run) AS run_size,
       -1 + (RANK() OVER (PARTITION BY run ORDER BY feature_id)) AS run_rank
  FROM forward;
SELECT * FROM backward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[16]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m run run_start nextval run_end run_size run_rank
48638 Dry Creek Stream AR Union 33.3242919 -92.9121074 46.0 168 46.0 None 46.0 1 0
49192 Foster Cemetery Cemetery AR Benton 36.4612601 -94.1579926 401.0 169 401.0 None 401.0 1 0
49214 Fox Hollow Valley AR Sharp 36.4575632 -91.4387492 194.0 170 194.0 None 194.0 3 0
49350 Lewis Johnson Family Cemetery Cemetery AR Johnson 35.4122994 -93.3587851 None 170 194.0 None 47.0 3 1
49578 Grassy Lake Swamp AR Hempstead 33.6887443 -93.8693011 None 170 194.0 47.0 47.0 3 2
49802 Hamilton Creek Stream AR Ouachita 33.7592735 -92.9482208 47.0 171 47.0 None 47.0 1 0
49925 Hartsoe Cemetery Cemetery AR Greene 36.1833735 -90.4370362 111.0 172 111.0 None 111.0 1 0
50059 Hickory Grove Cemetery Cemetery AR Lonoke 34.6431418 -92.028191 71.0 173 71.0 None 71.0 2 0
50309 Hopewell Church Church AR Arkansas 34.3839893 -91.1540068 None 173 71.0 26.0 26.0 2 1
50661 John Mack Slough Gut AR Calhoun 33.3431778 -92.4573712 26.0 174 26.0 None 26.0 1 0
50948 Lake Norfork Church Church AR Baxter 36.3853437 -92.224323 222.0 175 222.0 None 222.0 1 0
51021 Leachville Cemetery Cemetery AR Jackson 35.5949353 -91.243365 70.0 176 70.0 None 70.0 1 0

Scalar function pass

In [17]:
%%sql
-- 3. Simple scalar pass
CREATE OR REPLACE VIEW final AS
SELECT *, 
       run_start + (run_rank-1)*((run_end-run_start)/(run_size))
         AS interpolated
  FROM backward;
SELECT * FROM final  ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[17]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m run run_start nextval run_end run_size run_rank interpolated
48638 Dry Creek Stream AR Union 33.3242919 -92.9121074 46.0 168 46.0 None 46.0 1 0 46.0
49192 Foster Cemetery Cemetery AR Benton 36.4612601 -94.1579926 401.0 169 401.0 None 401.0 1 0 401.0
49214 Fox Hollow Valley AR Sharp 36.4575632 -91.4387492 194.0 170 194.0 None 194.0 3 0 194.0
49350 Lewis Johnson Family Cemetery Cemetery AR Johnson 35.4122994 -93.3587851 None 170 194.0 None 47.0 3 1 194.0
49578 Grassy Lake Swamp AR Hempstead 33.6887443 -93.8693011 None 170 194.0 47.0 47.0 3 2 145.0
49802 Hamilton Creek Stream AR Ouachita 33.7592735 -92.9482208 47.0 171 47.0 None 47.0 1 0 47.0
49925 Hartsoe Cemetery Cemetery AR Greene 36.1833735 -90.4370362 111.0 172 111.0 None 111.0 1 0 111.0
50059 Hickory Grove Cemetery Cemetery AR Lonoke 34.6431418 -92.028191 71.0 173 71.0 None 71.0 2 0 71.0
50309 Hopewell Church Church AR Arkansas 34.3839893 -91.1540068 None 173 71.0 26.0 26.0 2 1 71.0
50661 John Mack Slough Gut AR Calhoun 33.3431778 -92.4573712 26.0 174 26.0 None 26.0 1 0 26.0
50948 Lake Norfork Church Church AR Baxter 36.3853437 -92.224323 222.0 175 222.0 None 222.0 1 0 222.0
51021 Leachville Cemetery Cemetery AR Jackson 35.5949353 -91.243365 70.0 176 70.0 None 70.0 1 0 70.0
In [18]:
%sql EXPLAIN Analyze SELECT * from final LIMIT 500;
Running query in 'postgresql://localhost:5432/gnis'
20 rows affected.
Out[18]:
QUERY PLAN
Limit (cost=1206.79..1275.92 rows=500 width=216) (actual time=31.791..33.647 rows=500 loops=1)
  -> Subquery Scan on backward (cost=1206.79..2074.77 rows=6278 width=216) (actual time=31.789..33.614 rows=500 loops=1)
        -> WindowAgg (cost=1206.79..1902.13 rows=6278 width=208) (actual time=31.787..33.537 rows=500 loops=1)
              -> WindowAgg (cost=1206.79..1792.26 rows=6278 width=200) (actual time=31.782..33.164 rows=501 loops=1)
                    -> Incremental Sort (cost=1206.79..1682.40 rows=6278 width=192) (actual time=31.777..32.887 rows=502 loops=1)
                          Sort Key: forward.run, forward.feature_id
                          Presorted Key: forward.run
                          Full-sort Groups: 16 Sort Method: quicksort Average Memory: 29kB Peak Memory: 29kB
                          -> WindowAgg (cost=1204.85..1330.41 rows=6278 width=192) (actual time=31.694..32.700 rows=514 loops=1)
                                -> Sort (cost=1204.85..1220.55 rows=6278 width=184) (actual time=31.648..31.689 rows=515 loops=1)
                                      Sort Key: forward.run, forward.feature_id DESC
                                      Sort Method: quicksort Memory: 2102kB
                                      -> Subquery Scan on forward (cost=604.80..808.83 rows=6278 width=184) (actual time=3.266..26.821 rows=11533 loops=1)
                                            -> WindowAgg (cost=604.80..746.05 rows=6278 width=184) (actual time=3.266..25.651 rows=11533 loops=1)
                                                  -> Sort (cost=604.80..620.49 rows=6278 width=160) (actual time=3.195..3.816 rows=11533 loops=1)
                                                        Sort Key: holey.feature_id
                                                        Sort Method: quicksort Memory: 2011kB
                                                        -> Seq Scan on holey (cost=0.00..208.78 rows=6278 width=160) (actual time=0.008..0.957 rows=11533 loops=1)
Planning Time: 0.136 ms
Execution Time: 33.874 ms

String distance¶

In [19]:
%reload_ext sql
%sql postgresql://localhost:5432/postgres
Connecting and switching to connection postgresql://localhost:5432/postgres
In [20]:
%sql CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
Running query in 'postgresql://localhost:5432/postgres'
Out[20]:
In [21]:
%%sql
DROP TABLE IF EXISTS Strings;
CREATE TABLE Strings (str1 TEXT, str2 TEXT);
INSERT INTO Strings VALUES
 ('Lisa', 'List'),
 ('Lisa', 'License'),
 ('Joe', 'Noel');
Running query in 'postgresql://localhost:5432/postgres'
3 rows affected.
Out[21]:
In [22]:
%%sql
SELECT *,
       levenshtein(str1, str2),
       soundex(str1) as soundex1,
       soundex(str2) as soundex2,
       dmetaphone(str1) AS dmetaphone1,
       dmetaphone(str2) AS dmetaphone2,
       dmetaphone_alt(str1) AS dmetaphone_alt1,
       dmetaphone_alt(str2) AS dmetaphone_alt2
FROM Strings;
Running query in 'postgresql://localhost:5432/postgres'
3 rows affected.
Out[22]:
str1 str2 levenshtein soundex1 soundex2 dmetaphone1 dmetaphone2 dmetaphone_alt1 dmetaphone_alt2
Lisa List 1 L200 L230 LS LST LS LST
Lisa License 4 L200 L252 LS LSNS LS LSNS
Joe Noel 2 J000 N400 J NL A NL