Lecture 14: Imputation¶

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)

sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline

GNIS data¶

This is the GNIS dataset from a previous lecture. If you didn't load in the database, run the below cell before connecting.

In [2]:
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis' 
!psql -h localhost -d gnis -f ../lec11/data/gnis.sql
 pg_terminate_backend 
----------------------
(0 rows)

DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 3195
COPY 11533
CREATE INDEX
In [3]:
%reload_ext sql
%sql postgresql://localhost:5432/gnis
There's a new jupysql version available (0.10.14), you're running 0.10.0. To upgrade: pip install jupysql --upgrade
Deploy Panel apps for free on Ploomber Cloud! Learn more: https://ploomber.io/s/signup
In [4]:
%sql SELECT setseed(0.12345);
Running query in 'postgresql://localhost:5432/gnis'
1 rows affected.
Out[4]:
setseed
In [5]:
%%sql
DROP TABLE IF EXISTS holey CASCADE;
CREATE TABLE holey AS 
SELECT feature_id, feature_name, feature_class, state_alpha, county_name, 
       prim_lat_dec, prim_long_dec, 
       CASE WHEN random() > 0.9 THEN NULL
            ELSE elev_in_m
        END AS elev_in_m
  FROM national;
SELECT count(elev_in_m)::float / count(*) FROM holey;
Running query in 'postgresql://localhost:5432/gnis'
11533 rows affected.
1 rows affected.
Out[5]:
?column?
0.8755744385675887

Method 1. Default Value Imputation with SQL¶

In [6]:
%%sql
WITH elevavg AS (SELECT avg(elev_in_m) FROM holey)
SELECT h.*, 
       CASE WHEN h.elev_in_m IS NOT NULL THEN h.elev_in_m
            ELSE e.avg
        END AS imputed_elev_in_m
  FROM holey h, elevavg e
LIMIT 10;
Running query in 'postgresql://localhost:5432/gnis'
10 rows affected.
Out[6]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m imputed_elev_in_m
1230 Belmont Mountains Range AZ Maricopa 33.642258 -112.9010129 931.0 931.0
1839 Bootlegger Saddle Gap AZ Cochise 31.8931474 -109.2831176 2707.0 2707.0
2336 Cabeza Prieta Game Range Park AZ Yuma 32.250056 -113.45074 275.0 275.0
2750 Chandler Springs Spring AZ Navajo 35.3766788 -110.4754096 1685.0 1685.0
3032 Cline Well Well AZ Cochise 31.9000849 -110.3428525 None 483.962467815409
3039 Clover Tank Reservoir AZ Gila 33.8509816 -110.2577249 1563.0 1563.0
3060 Coat Spring Spring AZ Navajo 36.12678 -110.3330424 1926.0 1926.0
3143 Comar Spring Spring AZ Navajo 35.5308428 -110.4162424 1732.0 1732.0
3333 Cottonwood Creek Stream AZ Coconino 36.050817 -111.9865535 800.0 800.0
3342 Cottonwood Creek Stream AZ Mohave 36.901931 -112.5632547 1389.0 1389.0
Truncated to displaylimit of 10.

Method 2. Correlation across columns¶

In [7]:
# Training is an aggregate function
# Here we'll train the model in SQL just for fun
result = %sql SELECT regr_slope(elev_in_m, prim_long_dec), \
               regr_intercept(elev_in_m, prim_long_dec) FROM holey
slope, intercept = result[0]
slope, intercept
Running query in 'postgresql://localhost:5432/gnis'
1 rows affected.
Out[7]:
(-10.314179001097786, -477.9603219322606)
In [8]:
%%sql
SELECT *, 
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE {{slope}}*prim_long_dec + {{intercept}}
        END AS imputed_elev_in_m
  FROM holey
LIMIT 10;
Running query in 'postgresql://localhost:5432/gnis'
10 rows affected.
Out[8]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m imputed_elev_in_m
1230 Belmont Mountains Range AZ Maricopa 33.642258 -112.9010129 931.0 931.0
1839 Bootlegger Saddle Gap AZ Cochise 31.8931474 -109.2831176 2707.0 2707.0
2336 Cabeza Prieta Game Range Park AZ Yuma 32.250056 -113.45074 275.0 275.0
2750 Chandler Springs Spring AZ Navajo 35.3766788 -110.4754096 1685.0 1685.0
3032 Cline Well Well AZ Cochise 31.9000849 -110.3428525 None 660.1356102444698
3039 Clover Tank Reservoir AZ Gila 33.8509816 -110.2577249 1563.0 1563.0
3060 Coat Spring Spring AZ Navajo 36.12678 -110.3330424 1926.0 1926.0
3143 Comar Spring Spring AZ Navajo 35.5308428 -110.4162424 1732.0 1732.0
3333 Cottonwood Creek Stream AZ Coconino 36.050817 -111.9865535 800.0 800.0
3342 Cottonwood Creek Stream AZ Mohave 36.901931 -112.5632547 1389.0 1389.0
Truncated to displaylimit of 10.

Method 3. General model-based interpolation¶

We won't show the demo; check slides for the general idea.

In [9]:
%config SqlMagic.displaylimit = 100

Method 4. [simple] Correlation across ordered rows¶

In [10]:
%%sql
-- The following doesn't work in PostgreSQL!
WITH buggy AS (
SELECT *,
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE lag(elev_in_m, 1)
                 OVER (ORDER BY feature_id)
        END AS imputed_elev_in_m
  FROM holey
)
SELECT * FROM buggy ORDER BY elev_in_m DESC LIMIT 100;
Running query in 'postgresql://localhost:5432/gnis'
100 rows affected.
Out[10]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m imputed_elev_in_m
214923 Lewes Junior High School School DE Sussex 38.7687232 -75.1460168 None 89.0
209568 Ox Hill Populated Place CT Fairfield 41.2284297 -73.2190027 None 31.0
204786 San Acacio Populated Place CO Costilla 37.2139025 -105.5644584 None 1712.0
205451 Berkshire Populated Place CT Fairfield 41.4087069 -73.2595601 None 1945.0
205915 Calebs Peak Summit CT Litchfield 41.7559276 -73.4648456 None 116.0
194300 Bull Pasture Well Well CO Las Animas 37.6594568 -104.3541397 None 1692.0
198468 Gurley Dam Dam CO San Miguel 38.035547 -108.2459128 None None
189182 Lienhart Mine Mine CO Chaffee 38.9377726 -106.2547437 None 3173.0
197727 Pine Brook Hill Populated Place CO Boulder 40.0499857 -105.3147137 None 2601.0
198347 King Reservoir Number 1 Reservoir CO Mesa 38.8657673 -108.6665656 None 1568.0
168322 Batson Post Office (historical) Post Office AL Coosa 0.0 0.0 None None
175017 Camp Foster Creek Stream CO Pitkin 39.3308169 -107.2222735 None 2514.0
179621 Bartlett Mountain Summit CO Lake 39.3769097 -106.1591995 None 2231.0
184045 West Prong Red Creek Stream CO Archuleta 37.3825024 -107.4417201 None 2461.0
184321 Park Reservoir Reservoir CO Archuleta 37.3118044 -107.1509411 None 2782.0
168566 Teddy Post Office (historical) Post Office AL Escambia 0.0 0.0 None None
168977 Pondville Post Office (historical) Post Office AL Bibb 0.0 0.0 None 108.0
134138 Fourth Creek School (historical) School AL Sumter 32.4595827 -88.1983586 None 163.0
196693 Morapos Cow Camp Locale CO Rio Blanco 40.1841431 -107.5350619 None 2643.0
170234 Lady Moon Lake Lake CO Larimer 40.7695992 -105.5267598 None None
195705 Smith Reservoir Reservoir CO Otero 37.7677873 -103.9584627 None 1208.0
168550 Filmore Post Office (historical) Post Office AL Escambia 0.0 0.0 None None
167976 Walnut Hill Post Office (historical) Post Office AL Tallapoosa 0.0 0.0 None None
157305 Kell Cemetery Cemetery AL Morgan 0.0 0.0 None 195.0
174510 Thompson Reservoir Number 3 Reservoir CO Mesa 39.0454778 -108.7755725 None 2542.0
83954 Bend Post Office (historical) Post Office AR Marion 0.0 0.0 None 62.0
176528 McKinney Draw Valley CO Dolores 37.8663073 -108.5713788 None 3035.0
168503 Howles Store Post Office (historical) Post Office AL Elmore 0.0 0.0 None None
159665 Gilgal Baptist Church Church AL Tuscaloosa 33.1231761 -87.5002824 None 268.0
167869 Byron Post Office (historical) Post Office AL Randolph 0.0 0.0 None None
150422 New Home Populated Place AL DeKalb 34.6023063 -85.6991353 None 398.0
114664 Boggy Branch Stream FL Holmes 30.9731224 -85.7449601 None 78.0
127439 Stony Ridge Ridge AL Shelby 33.2201151 -86.7260973 None 95.0
148083 Eastern Gate Baptist Church Church AL Tuscaloosa 33.1528975 -87.4752819 None 166.0
141361 Burwells Classical Mathematical School (historical) School AL Sumter 0.0 0.0 None 200.0
166604 Tecumseh Post Office (historical) Post Office AL Cherokee 0.0 0.0 None None
83356 Thorn Cemetery Cemetery AR Faulkner 35.2539553 -92.3909621 None 57.0
69785 Township of Durham Civil AR Washington 35.944583 -93.9840421 None 574.0
129541 Pace Lake Reservoir AL Baldwin 30.5214126 -87.5095787 None 29.0
163989 Truck Mine Number 5 Mine AL Bibb 0.0 0.0 None None
56602 Yocum Cove Stream AR Marion 36.4700658 -92.6473881 None 432.0
163762 Williams Cemetery Cemetery AL Limestone 0.0 0.0 None 365.0
191511 Green Mountain Summit CO Fremont 38.5349305 -105.6604581 None 2645.0
114844 Brasher Branch Stream AL Shelby 33.4049059 -86.5942081 None 171.0
168098 Simsville Post Office (historical) Post Office AL Bullock 0.0 0.0 None None
84873 Forley Post Office (historical) Post Office AR Washington 0.0 0.0 None 46.0
164524 Conecuh National Forest Forest AL Covington 31.094322 -86.6341634 None 43.0
83633 Lost Mountain Wilderness Retreat Airport (historical) Airport AR Newton 35.9292434 -92.9626717 None None
55132 Springfield Church Church AR Pulaski 34.6084264 -92.0890328 None 62.0
39670 Old Dominion Library Building AZ Gila 33.406722 -110.7917765 None 1217.0
13547 West Well Well AZ Navajo 35.0355772 -110.4826281 None 1100.0
169917 Elkhorn Mine Mine CO Routt 40.9844118 -106.9519934 None 1998.0
8268 Moore Ranch Locale AZ Cochise 31.755093 -109.5122882 None 1214.0
76112 Greenbrier United Methodist Church Church AR Faulkner 35.2320267 -92.3884883 None 380.0
4590 Fiftysix Tank Reservoir AZ Gila 33.6899872 -110.3340976 None 2434.0
141577 Nelson Mine Mine AL Shelby 0.0 0.0 None None
167859 Tallapoosa Post Office (historical) Post Office AL Randolph 0.0 0.0 None 58.0
165584 Westlawn Elementary School School AL Morgan 34.6109253 -87.0069538 None 76.0
120344 Hooper City Populated Place AL Jefferson 33.5553829 -86.8411025 None 93.0
141780 Breman Coal Bed Mine Mine AL Cullman 0.0 0.0 None None
81705 Rauch Estate Open Pit Mine Mine AR Pulaski 34.645093 -92.3037624 None 196.0
82079 Ditch Number 14 Canal AR Mississippi 35.4623055 -90.1589823 None 233.0
139004 Jollywell Branch Stream AL Hale 32.5698557 -87.5577822 None 70.0
64652 Wright (historical) Populated Place AR Pike 34.1628804 -93.5376838 None 202.0
84960 Maddock Post Office (historical) Post Office AR White 0.0 0.0 None None
82103 Ditch Number 16 Canal AR Mississippi 35.7136884 -90.2509305 None None
84600 Meredith Post Office (historical) Post Office AR Woodruff 0.0 0.0 None 99.0
42036 Lockett Tank Reservoir AZ Coconino 35.1490729 -112.1510338 None 1402.0
124364 Oyster Bay Lake AL Baldwin 30.2726058 -87.7309074 None 152.0
129336 Zion Church Church AL Choctaw 32.2870865 -88.0708525 None 256.0
61851 Tyronza Cut-off Gut AR Mississippi 35.630635 -90.208151 None 74.0
113459 Bar Branch Stream AL Greene 32.8073977 -88.078892 None 125.0
50309 Hopewell Church Church AR Arkansas 34.3839893 -91.1540068 None 71.0
143509 Martha Thomas Elementary School School AL Mobile 30.7546326 -88.1147234 None 11.0
51173 Linn Creek Stream AR Van Buren 35.4467454 -92.3301548 None 99.0
35236 The Park Flat AZ Yavapai 34.1486467 -111.4984726 None 624.0
49578 Grassy Lake Swamp AR Hempstead 33.6887443 -93.8693011 None None
65723 Locke School (historical) School AR Crawford 0.0 0.0 None 392.0
3032 Cline Well Well AZ Cochise 31.9000849 -110.3428525 None 1685.0
42122 Panther Tank Reservoir AZ Greenlee 33.1650631 -109.1942369 None None
68497 Rock Creek Golf Course Locale AR Pulaski 34.7123145 -92.3570975 None 95.0
25467 Caterpillar Tank Wash Stream AZ Maricopa 33.725592 -112.2937718 None 2795.0
55868 Vaden Cemetery Cemetery AR Clark 33.8720415 -92.9457159 None 82.0
33574 Reno Canyon Valley AZ Yavapai 34.3618875 -111.9572469 None 2092.0
71212 Cook Cemetery Cemetery AR Izard 36.0686754 -92.0079225 None 200.0
26578 Boggy Creek Stream AZ Apache 33.741996 -109.4398075 None 2315.0
155060 Emelle Populated Place AL Sumter 32.7292998 -88.3144755 None 72.0
42070 Whipple Tank Reservoir AZ Navajo 34.1783484 -109.8782247 None None
49350 Lewis Johnson Family Cemetery Cemetery AR Johnson 35.4122994 -93.3587851 None 194.0
52399 Mount Pleasant Church Church AR Jefferson 34.393707 -92.1109776 None 61.0
166564 Hancock Post Office (historical) Post Office AL Cherokee 0.0 0.0 None 71.0
127065 Snake Creek Stream AL Lee 32.3843101 -85.2785524 None 134.0
84769 Manoah Post Office (historical) Post Office AR Drew 0.0 0.0 None None
28394 Deer Spring Mountain Summit AZ Navajo 34.0107604 -109.9752047 None 1692.0
12504 Tipperary Mine Mine AZ Mohave 34.9300043 -114.3727359 None 603.0
53531 Pleasant Grove Cemetery Cemetery AR Mississippi 35.6192561 -90.0791719 None 60.0
13040 Union High School School AZ Maricopa 33.4517105 -112.0670927 None None
59185 Ellis Ridge Ridge AR Pope 35.6202133 -93.0366957 None 78.0
46393 Bentley Ridge Cemetery Cemetery AR Baxter 36.4872804 -92.2826514 None 58.0
222119 Deadman Gulch Valley CA Sacramento 38.2805281 -121.3513615 None 149.0
Truncated to displaylimit of 100.
In [11]:
%%sql
-- Here's a UDA fix from
-- https://stackoverflow.com/questions/18987791/how-do-i-efficiently-select-the-previous-non-null-value
CREATE OR REPLACE FUNCTION coalesce_agg_sfunc(state anyelement, value anyelement) RETURNS anyelement AS
$$
    SELECT coalesce(value, state);
$$ LANGUAGE SQL;

CREATE AGGREGATE coalesce_agg(anyelement) (
    SFUNC = coalesce_agg_sfunc,
    STYPE  = anyelement);
Running query in 'postgresql://localhost:5432/gnis'
Out[11]:
In [12]:
%%sql
-- Fixed to handle repeated NULLs
WITH fixed AS (
SELECT *,
       coalesce_agg(elev_in_m) OVER (order by feature_id) AS imputed_elev_in_m
  FROM holey
)
SELECT * FROM fixed ORDER BY imputed_elev_in_m DESC LIMIT 100;
Running query in 'postgresql://localhost:5432/gnis'
100 rows affected.
Out[12]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m imputed_elev_in_m
1416580 Browne Tower Summit AK Denali 63.1000583 -150.9314441 4628.0 4628.0
188010 Grizzly Peak Summit CO La Plata 37.6076296 -107.5818648 4171.0 4171.0
191939 Spread Eagle Peak Summit CO Custer 38.1252369 -105.6435197 4090.0 4090.0
187393 Emery Peak Summit CO San Juan 37.8882556 -107.6210535 4054.0 4054.0
181832 Baker Mountain Summit CO Clear Creek 39.6607654 -105.8402538 3832.0 3832.0
1703800 Lucky Jim One Mine Mine CO Park 39.1519347 -105.742509 None 3728.0
1703775 Park County Claims Mine Mine CO Park 39.3510997 -105.9869614 3728.0 3728.0
393894 Donaldson Peak Summit ID Custer 44.0639981 -113.7003853 3661.0 3661.0
1599419 Dundee Mountain Summit WY Park 43.8771622 -109.2682693 3640.0 3640.0
939450 Master Man Mine Mine NM Colfax 36.6297542 -105.2088936 3633.0 3633.0
179902 Ceresco Number 4 Adit Mine CO Lake 39.3644327 -106.1714108 3604.0 3604.0
189898 Antoro Shaft Mine CO Saguache 38.3297213 -106.125297 3577.0 3577.0
1684419 Maumee Mine Mine CO Park 39.269434 -106.1461318 3555.0 3555.0
1598169 Big Sheep Mountain Summit WY Sublette 43.2751868 -109.8786551 3537.0 3537.0
1603836 Twins Glacier Glacier WY Sublette 43.149673 -109.6540386 3517.0 3517.0
1427800 Mount Ellen Range UT Garfield 38.1097069 -110.8137658 3501.0 3501.0
1734540 Gold Run Placer Mine CO San Miguel 37.831384 -107.8345093 3449.0 3449.0
1659681 Slide Mountain Summit CA Mono 38.0931634 -119.4458359 3379.0 3379.0
178928 Windy Gap Gap CO Montezuma 37.4749967 -108.0917424 3354.0 3354.0
197523 Payne Gulch Valley CO Gunnison 38.6838024 -106.4736984 3340.0 3340.0
780007 Black Mountain Summit MT Park 45.4713346 -110.4664499 3337.0 3337.0
203950 Trachyte Knob Summit CO Teller 38.7731142 -105.1065711 3314.0 3314.0
179985 Buckskin Joe Mine Mine CO Park 39.2899894 -106.0955752 3309.0 3309.0
189852 Sheep Mountain Trail Trail CO Saguache 38.3297215 -106.1889107 3298.0 3298.0
2572516 Windy Lake Lake WY Fremont 42.6200138 -108.9825615 3274.0 3274.0
1444260 Pearl Lake Lake UT Uintah 40.7891427 -109.9571503 3263.0 3263.0
181328 Floral Park Campground Locale CO Clear Creek 39.7894314 -105.7805628 3258.0 3258.0
1724633 Garfield Grouse Mine Mine CO Teller 38.7330442 -105.1363688 3251.0 3251.0
186674 Copley Lake Lake CO Gunnison 38.8722009 -107.0830674 3251.0 3251.0
1414051 Mount Spurr Summit AK Kenai Peninsula 61.2997222 -152.2513889 3248.0 3248.0
779022 Anaconda Range Range MT Deer Lodge 46.0500221 -113.1901018 3246.0 3246.0
1446855 Upper Chain Lake Reservoir UT Duchesne 40.7113249 -110.2569397 3240.0 3240.0
931481 Double Eagle V Ski Trail Trail NM Santa Fe 35.7911406 -105.7950188 3240.0 3240.0
262684 Little Papoose Creek Stream CA Trinity 40.8009791 -122.735855 None 3224.0
262186 Kings River Trail Trail CA Inyo 37.1679871 -118.48483 3224.0 3224.0
1441700 Henderson Creek Stream UT Garfield 37.6077619 -112.0557457 None 3223.0
1441572 Hancock Peak Summit UT Iron 37.6349097 -112.7619161 3223.0 3223.0
187877 Raber Lohr Ditch Canal CO Hinsdale 37.6786111 -107.3222767 3218.0 3218.0
181537 Utah Hill Summit CO Gilpin 39.861742 -105.5629767 3202.0 3202.0
1735447 Little Keystone Shaft Mine Mine CO Lake 39.2477668 -106.4028044 3196.0 3196.0
861027 Prospect Peak Summit NV Elko 41.7961632 -115.3514744 3176.0 3176.0
189182 Lienhart Mine Mine CO Chaffee 38.9377726 -106.2547437 None 3173.0
189124 Holywater Mine Mine CO Chaffee 38.8094397 -106.358913 3173.0 3173.0
1693838 Eva Mine Complex Mine CO Clear Creek 39.6794316 -105.7277832 3166.0 3166.0
1600421 Horse Pasture Draw Valley WY Sublette 42.913281 -110.3380592 None 3151.0
1599988 Goosewing Guard Station Locale WY Teton 43.5629975 -110.3032447 None 3151.0
1599888 Galena Basin Basin WY Park 43.905233 -109.2923677 3151.0 3151.0
177789 Green Mountain Summit CO Grand 40.2961549 -105.8221575 3138.0 3138.0
181517 Nugget Populated Place CO Gilpin 39.8635984 -105.5883336 3128.0 3128.0
1428713 Highline Trail Trail UT Duchesne 40.688557 -110.8173912 3120.0 3120.0
199320 Trio Dam Dam CO Delta 39.0466474 -107.8356185 3108.0 3108.0
862430 Great Basin National Park Park NV White Pine 38.9460953 -114.2579658 3102.0 3102.0
173389 Crater Lake Lake CO Rio Blanco 40.1400804 -107.1207371 3097.0 3097.0
1667345 Little Shot Lake Lake CA Fresno 37.1252053 -118.8260505 3096.0 3096.0
189116 East Willow Creek Stream CO Gunnison 38.7538837 -106.4819717 3089.0 3089.0
1684255 Pacific-Gold Dust Mine Mine CO Summit 39.4685983 -106.0244624 3089.0 3089.0
175365 Little Giant Reservoir Number 1 Reservoir CO Delta 39.0405469 -107.7873713 3084.0 3084.0
1724727 Lexington Mine Mine CO Teller 38.7405442 -105.1560918 3076.0 3076.0
176038 Dubach Meadows Flat CO Eagle 39.5402631 -106.6630902 3074.0 3074.0
917164 Hidden Treasure Mine Mine NM Colfax 36.6114208 -105.2411167 3068.0 3068.0
396870 Mount McGuire Summit ID Lemhi 45.174046 -114.602137 3059.0 3059.0
1684932 Allied Gold Mining Company Mine Mine CO Boulder 39.9930424 -105.576667 3049.0 3049.0
176195 Guller Creek Stream CO Summit 39.5035976 -106.1758565 3035.0 3035.0
176528 McKinney Draw Valley CO Dolores 37.8663073 -108.5713788 None 3035.0
1445191 Roundy Spring Spring UT Garfield 38.0396633 -111.6903788 3025.0 3025.0
1445316 Sand Wash Stream UT Garfield 37.8838658 -112.425208 None 3025.0
199786 Kendall Dam Dam CO Mesa 39.1149804 -107.8072845 3024.0 3024.0
1442905 Long Flat Flat UT Piute 38.2683083 -112.3552077 3011.0 3011.0
1444807 Red Lake Lake UT Daggett 40.7950226 -109.8341217 3004.0 3004.0
909123 North Baldy Summit NM Socorro 34.0503013 -107.1812222 3002.0 3002.0
190348 Rudifor Lake Lake CO Rio Grande 37.5242201 -106.6722501 2997.0 2997.0
176590 Kilpacker Creek Stream CO Dolores 37.8233271 -108.0511831 2995.0 2995.0
261678 Iceberg Lake Lake CA Madera 37.6704116 -119.1691178 2981.0 2981.0
1438313 Avon Populated Place UT Cache 41.5329911 -111.8132743 None 2978.0
1438255 Antimony Knoll Summit UT Garfield 38.0514094 -111.815261 2978.0 2978.0
1683679 Majestic Mine Mine CO Boulder 40.1247079 -105.5588892 2978.0 2978.0
806032 Imelda Lake Lake MT Park 45.1663594 -109.9120372 2977.0 2977.0
931531 Geronimo Ski Trail Trail NM Lincoln 33.3973012 -105.7885986 2976.0 2976.0
930659 Wild Onion Ski Trail Trail NM Lincoln 33.3973012 -105.7885986 2976.0 2976.0
1447716 Manning Meadows Reservoir Reservoir UT Piute 38.4904107 -112.0702195 2971.0 2971.0
173999 Deadhorse Spring Spring CO Garfield 39.6572051 -107.2211646 2962.0 2962.0
172337 Lower Rainbow Lake Lake CO Jackson 40.6481244 -106.6193546 2960.0 2960.0
268442 Trumbull Lake Lake CA Mono 38.0534252 -119.2578319 2957.0 2957.0
188507 Phelps Cabin Creek Stream CO Hinsdale 38.1488844 -107.1053286 2954.0 2954.0
1446325 Tall Four Reservoir Reservoir UT Garfield 37.9200023 -111.8341652 2950.0 2950.0
1685340 Legion Mine Mine CO Clear Creek 39.7794318 -105.6813922 2942.0 2942.0
1609296 Sepulcher Mountain Summit WY Park 44.9903881 -110.7665775 2939.0 2939.0
2664241 Tucker Ponds Lake CO Mineral 37.49492 -106.75954 2934.0 2934.0
1724854 Rose Maud Mine Mine CO Teller 38.7324889 -105.1574808 2932.0 2932.0
853139 Puritan Mine Mine NV Elko 41.8485179 -115.3800676 2910.0 2910.0
186986 McIntosh Mountain Summit CO Gunnison 38.5750862 -107.014652 2905.0 2905.0
1447736 Farnsworth Dam Dam UT Sevier 38.7733041 -111.6624129 2897.0 2897.0
269289 Woodchuck Country Area CA Fresno 37.0174429 -118.8887203 2884.0 2884.0
859080 Buller Mountain Summit NV Mineral 38.3808139 -118.6877054 2880.0 2880.0
1606084 Thompson Reservoir Reservoir WY Carbon 41.1983044 -106.3655775 2878.0 2878.0
183510 The Castle Pillar CO Jefferson 39.3105454 -105.3824978 2873.0 2873.0
39068 Lee Valley Dam Dam AZ Apache 33.9417151 -109.4989794 2870.0 2870.0
180170 Crystal Lakes Lake CO Lake 39.1950626 -106.3383877 2867.0 2867.0
2724004 Petrie Ditch Canal CO Park 39.3091177 -105.8102555 2862.0 2862.0
177295 Hourglass Reservoir Reservoir CO Larimer 40.5793186 -105.6319349 2859.0 2859.0
Truncated to displaylimit of 100.

Method 4: Generalized interpolation across rows, with Linear Interpolation as an example¶

Forward pass

In [14]:
%%sql
-- 1. Forward assign run numbers to rows, propagate val, get nextval
CREATE OR REPLACE VIEW forward AS
SELECT *,
       SUM(CASE WHEN elev_in_m IS NULL THEN 0 ELSE 1 END) 
         OVER (ORDER BY feature_id) AS run,
       coalesce_agg(elev_in_m) OVER (ORDER BY feature_id) AS run_start,
       CASE WHEN elev_in_m IS NULL 
              THEN lead(elev_in_m, 1) OVER (ORDER BY feature_id)
            ELSE NULL
             END AS nextval
  FROM holey;
SELECT * FROM forward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[14]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m run run_start nextval
48638 Dry Creek Stream AR Union 33.3242919 -92.9121074 46.0 168 46.0 None
49192 Foster Cemetery Cemetery AR Benton 36.4612601 -94.1579926 401.0 169 401.0 None
49214 Fox Hollow Valley AR Sharp 36.4575632 -91.4387492 194.0 170 194.0 None
49350 Lewis Johnson Family Cemetery Cemetery AR Johnson 35.4122994 -93.3587851 None 170 194.0 None
49578 Grassy Lake Swamp AR Hempstead 33.6887443 -93.8693011 None 170 194.0 47.0
49802 Hamilton Creek Stream AR Ouachita 33.7592735 -92.9482208 47.0 171 47.0 None
49925 Hartsoe Cemetery Cemetery AR Greene 36.1833735 -90.4370362 111.0 172 111.0 None
50059 Hickory Grove Cemetery Cemetery AR Lonoke 34.6431418 -92.028191 71.0 173 71.0 None
50309 Hopewell Church Church AR Arkansas 34.3839893 -91.1540068 None 173 71.0 26.0
50661 John Mack Slough Gut AR Calhoun 33.3431778 -92.4573712 26.0 174 26.0 None
50948 Lake Norfork Church Church AR Baxter 36.3853437 -92.224323 222.0 175 222.0 None
51021 Leachville Cemetery Cemetery AR Jackson 35.5949353 -91.243365 70.0 176 70.0 None

Backward pass

In [15]:
%%sql
-- 2. Backward: assign run_end, run_size, run_rank
CREATE OR REPLACE VIEW backward AS
SELECT *,
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE coalesce_agg(nextval) OVER (PARTITION BY run ORDER BY feature_id DESC)
        END AS run_end,
       count(*) OVER (PARTITION BY run) AS run_size,
       -1 + (RANK() OVER (PARTITION BY run ORDER BY feature_id)) AS run_rank
  FROM forward;
SELECT * FROM backward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[15]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m run run_start nextval run_end run_size run_rank
48638 Dry Creek Stream AR Union 33.3242919 -92.9121074 46.0 168 46.0 None 46.0 1 0
49192 Foster Cemetery Cemetery AR Benton 36.4612601 -94.1579926 401.0 169 401.0 None 401.0 1 0
49214 Fox Hollow Valley AR Sharp 36.4575632 -91.4387492 194.0 170 194.0 None 194.0 3 0
49350 Lewis Johnson Family Cemetery Cemetery AR Johnson 35.4122994 -93.3587851 None 170 194.0 None 47.0 3 1
49578 Grassy Lake Swamp AR Hempstead 33.6887443 -93.8693011 None 170 194.0 47.0 47.0 3 2
49802 Hamilton Creek Stream AR Ouachita 33.7592735 -92.9482208 47.0 171 47.0 None 47.0 1 0
49925 Hartsoe Cemetery Cemetery AR Greene 36.1833735 -90.4370362 111.0 172 111.0 None 111.0 1 0
50059 Hickory Grove Cemetery Cemetery AR Lonoke 34.6431418 -92.028191 71.0 173 71.0 None 71.0 2 0
50309 Hopewell Church Church AR Arkansas 34.3839893 -91.1540068 None 173 71.0 26.0 26.0 2 1
50661 John Mack Slough Gut AR Calhoun 33.3431778 -92.4573712 26.0 174 26.0 None 26.0 1 0
50948 Lake Norfork Church Church AR Baxter 36.3853437 -92.224323 222.0 175 222.0 None 222.0 1 0
51021 Leachville Cemetery Cemetery AR Jackson 35.5949353 -91.243365 70.0 176 70.0 None 70.0 1 0

Scalar function pass

In [16]:
%%sql
-- 3. Simple scalar pass
CREATE OR REPLACE VIEW final AS
SELECT *, 
       run_start + (run_rank-1)*((run_end-run_start)/(run_size))
         AS interpolated
  FROM backward;
SELECT * FROM final  ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[16]:
feature_id feature_name feature_class state_alpha county_name prim_lat_dec prim_long_dec elev_in_m run run_start nextval run_end run_size run_rank interpolated
48638 Dry Creek Stream AR Union 33.3242919 -92.9121074 46.0 168 46.0 None 46.0 1 0 46.0
49192 Foster Cemetery Cemetery AR Benton 36.4612601 -94.1579926 401.0 169 401.0 None 401.0 1 0 401.0
49214 Fox Hollow Valley AR Sharp 36.4575632 -91.4387492 194.0 170 194.0 None 194.0 3 0 194.0
49350 Lewis Johnson Family Cemetery Cemetery AR Johnson 35.4122994 -93.3587851 None 170 194.0 None 47.0 3 1 194.0
49578 Grassy Lake Swamp AR Hempstead 33.6887443 -93.8693011 None 170 194.0 47.0 47.0 3 2 145.0
49802 Hamilton Creek Stream AR Ouachita 33.7592735 -92.9482208 47.0 171 47.0 None 47.0 1 0 47.0
49925 Hartsoe Cemetery Cemetery AR Greene 36.1833735 -90.4370362 111.0 172 111.0 None 111.0 1 0 111.0
50059 Hickory Grove Cemetery Cemetery AR Lonoke 34.6431418 -92.028191 71.0 173 71.0 None 71.0 2 0 71.0
50309 Hopewell Church Church AR Arkansas 34.3839893 -91.1540068 None 173 71.0 26.0 26.0 2 1 71.0
50661 John Mack Slough Gut AR Calhoun 33.3431778 -92.4573712 26.0 174 26.0 None 26.0 1 0 26.0
50948 Lake Norfork Church Church AR Baxter 36.3853437 -92.224323 222.0 175 222.0 None 222.0 1 0 222.0
51021 Leachville Cemetery Cemetery AR Jackson 35.5949353 -91.243365 70.0 176 70.0 None 70.0 1 0 70.0
In [17]:
%sql EXPLAIN Analyze SELECT * from final LIMIT 500;
Running query in 'postgresql://localhost:5432/gnis'
20 rows affected.
Out[17]:
QUERY PLAN
Limit (cost=1206.79..1275.92 rows=500 width=216) (actual time=32.796..34.651 rows=500 loops=1)
  -> Subquery Scan on backward (cost=1206.79..2074.77 rows=6278 width=216) (actual time=32.795..34.621 rows=500 loops=1)
        -> WindowAgg (cost=1206.79..1902.13 rows=6278 width=208) (actual time=32.792..34.527 rows=500 loops=1)
              -> WindowAgg (cost=1206.79..1792.26 rows=6278 width=200) (actual time=32.785..34.151 rows=501 loops=1)
                    -> Incremental Sort (cost=1206.79..1682.40 rows=6278 width=192) (actual time=32.779..33.859 rows=502 loops=1)
                          Sort Key: forward.run, forward.feature_id
                          Presorted Key: forward.run
                          Full-sort Groups: 16 Sort Method: quicksort Average Memory: 29kB Peak Memory: 29kB
                          -> WindowAgg (cost=1204.85..1330.41 rows=6278 width=192) (actual time=32.693..33.669 rows=514 loops=1)
                                -> Sort (cost=1204.85..1220.55 rows=6278 width=184) (actual time=32.618..32.655 rows=515 loops=1)
                                      Sort Key: forward.run, forward.feature_id DESC
                                      Sort Method: quicksort Memory: 2102kB
                                      -> Subquery Scan on forward (cost=604.80..808.83 rows=6278 width=184) (actual time=3.450..27.244 rows=11533 loops=1)
                                            -> WindowAgg (cost=604.80..746.05 rows=6278 width=184) (actual time=3.449..26.057 rows=11533 loops=1)
                                                  -> Sort (cost=604.80..620.49 rows=6278 width=160) (actual time=3.367..4.023 rows=11533 loops=1)
                                                        Sort Key: holey.feature_id
                                                        Sort Method: quicksort Memory: 2011kB
                                                        -> Seq Scan on holey (cost=0.00..208.78 rows=6278 width=160) (actual time=0.014..1.050 rows=11533 loops=1)
Planning Time: 0.229 ms
Execution Time: 34.940 ms

String distance¶

In [18]:
%reload_ext sql
%sql postgresql://localhost:5432/postgres
Connecting and switching to connection postgresql://localhost:5432/postgres
In [19]:
%sql CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
Running query in 'postgresql://localhost:5432/postgres'
Out[19]:
In [20]:
%%sql
DROP TABLE IF EXISTS Strings;
CREATE TABLE Strings (str1 TEXT, str2 TEXT);
INSERT INTO Strings VALUES
 ('Lisa', 'List'),
 ('Lisa', 'License'),
 ('Joe', 'Noel');
Running query in 'postgresql://localhost:5432/postgres'
3 rows affected.
Out[20]:
In [21]:
%%sql
SELECT *,
       levenshtein(str1, str2),
       soundex(str1) as soundex1,
       soundex(str2) as soundex2,
       dmetaphone(str1) AS dmetaphone1,
       dmetaphone(str2) AS dmetaphone2,
       dmetaphone_alt(str1) AS dmetaphone_alt1,
       dmetaphone_alt(str2) AS dmetaphone_alt2
FROM Strings;
Running query in 'postgresql://localhost:5432/postgres'
3 rows affected.
Out[21]:
str1 str2 levenshtein soundex1 soundex2 dmetaphone1 dmetaphone2 dmetaphone_alt1 dmetaphone_alt2
Lisa List 1 L200 L230 LS LST LS LST
Lisa License 4 L200 L252 LS LSNS LS LSNS
Joe Noel 2 J000 N400 J NL A NL

[Extra] Pretty print of row runs, for lecture¶

(for lecture slides. Generated with pandas)

In [22]:
import pandas as pd
pd.DataFrame.from_dict({"feature_id": [493, 494, 495, 496],
                        "feature_name": ["Adobe Canyon", "Adobe Flats", "Adobe Lake", "Adobe Mountain"],
                        "elev_in_m": [1423, "None", "None", 517],
                        "run": [84, 84, 84, 85],
                        "run_start": [1423, 1423, 1423, 517],
                        "run_end": [517, 517, 517, 517],
                        "run_size": [3, 3, 3, 1],
                        "run_rank": [1, 2, 3, 1],
                        "interpol_elev_in_m": [1423, 1121, 819, 517]})
Out[22]:
feature_id feature_name elev_in_m run run_start run_end run_size run_rank interpol_elev_in_m
0 493 Adobe Canyon 1423 84 1423 517 3 1 1423
1 494 Adobe Flats None 84 1423 517 3 2 1121
2 495 Adobe Lake None 84 1423 517 3 3 819
3 496 Adobe Mountain 517 85 517 517 1 1 517