Lecture 14: Imputation¶
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)
sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline
GNIS data¶
This is the GNIS dataset from a previous lecture. If you didn't load in the database, run the below cell before connecting.
In [2]:
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis'
!psql -h localhost -d gnis -f ../lec11/data/gnis.sql
pg_terminate_backend ---------------------- (0 rows) DROP DATABASE CREATE DATABASE SET SET SET SET SET set_config ------------ (1 row) SET SET SET SET SET SET CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE COPY 3195 COPY 11533 CREATE INDEX
In [3]:
%reload_ext sql
%sql postgresql://localhost:5432/gnis
There's a new jupysql version available (0.10.14), you're running 0.10.0. To upgrade: pip install jupysql --upgrade Deploy Panel apps for free on Ploomber Cloud! Learn more: https://ploomber.io/s/signup
In [4]:
%sql SELECT setseed(0.12345);
Running query in 'postgresql://localhost:5432/gnis'
1 rows affected.
Out[4]:
setseed |
---|
In [5]:
%%sql
DROP TABLE IF EXISTS holey CASCADE;
CREATE TABLE holey AS
SELECT feature_id, feature_name, feature_class, state_alpha, county_name,
prim_lat_dec, prim_long_dec,
CASE WHEN random() > 0.9 THEN NULL
ELSE elev_in_m
END AS elev_in_m
FROM national;
SELECT count(elev_in_m)::float / count(*) FROM holey;
Running query in 'postgresql://localhost:5432/gnis'
11533 rows affected.
1 rows affected.
Out[5]:
?column? |
---|
0.8755744385675887 |
Method 1. Default Value Imputation with SQL¶
In [6]:
%%sql
WITH elevavg AS (SELECT avg(elev_in_m) FROM holey)
SELECT h.*,
CASE WHEN h.elev_in_m IS NOT NULL THEN h.elev_in_m
ELSE e.avg
END AS imputed_elev_in_m
FROM holey h, elevavg e
LIMIT 10;
Running query in 'postgresql://localhost:5432/gnis'
10 rows affected.
Out[6]:
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m |
---|---|---|---|---|---|---|---|---|
1230 | Belmont Mountains | Range | AZ | Maricopa | 33.642258 | -112.9010129 | 931.0 | 931.0 |
1839 | Bootlegger Saddle | Gap | AZ | Cochise | 31.8931474 | -109.2831176 | 2707.0 | 2707.0 |
2336 | Cabeza Prieta Game Range | Park | AZ | Yuma | 32.250056 | -113.45074 | 275.0 | 275.0 |
2750 | Chandler Springs | Spring | AZ | Navajo | 35.3766788 | -110.4754096 | 1685.0 | 1685.0 |
3032 | Cline Well | Well | AZ | Cochise | 31.9000849 | -110.3428525 | None | 483.962467815409 |
3039 | Clover Tank | Reservoir | AZ | Gila | 33.8509816 | -110.2577249 | 1563.0 | 1563.0 |
3060 | Coat Spring | Spring | AZ | Navajo | 36.12678 | -110.3330424 | 1926.0 | 1926.0 |
3143 | Comar Spring | Spring | AZ | Navajo | 35.5308428 | -110.4162424 | 1732.0 | 1732.0 |
3333 | Cottonwood Creek | Stream | AZ | Coconino | 36.050817 | -111.9865535 | 800.0 | 800.0 |
3342 | Cottonwood Creek | Stream | AZ | Mohave | 36.901931 | -112.5632547 | 1389.0 | 1389.0 |
Method 2. Correlation across columns¶
In [7]:
# Training is an aggregate function
# Here we'll train the model in SQL just for fun
result = %sql SELECT regr_slope(elev_in_m, prim_long_dec), \
regr_intercept(elev_in_m, prim_long_dec) FROM holey
slope, intercept = result[0]
slope, intercept
Running query in 'postgresql://localhost:5432/gnis'
1 rows affected.
Out[7]:
(-10.314179001097786, -477.9603219322606)
In [8]:
%%sql
SELECT *,
CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
ELSE {{slope}}*prim_long_dec + {{intercept}}
END AS imputed_elev_in_m
FROM holey
LIMIT 10;
Running query in 'postgresql://localhost:5432/gnis'
10 rows affected.
Out[8]:
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m |
---|---|---|---|---|---|---|---|---|
1230 | Belmont Mountains | Range | AZ | Maricopa | 33.642258 | -112.9010129 | 931.0 | 931.0 |
1839 | Bootlegger Saddle | Gap | AZ | Cochise | 31.8931474 | -109.2831176 | 2707.0 | 2707.0 |
2336 | Cabeza Prieta Game Range | Park | AZ | Yuma | 32.250056 | -113.45074 | 275.0 | 275.0 |
2750 | Chandler Springs | Spring | AZ | Navajo | 35.3766788 | -110.4754096 | 1685.0 | 1685.0 |
3032 | Cline Well | Well | AZ | Cochise | 31.9000849 | -110.3428525 | None | 660.1356102444698 |
3039 | Clover Tank | Reservoir | AZ | Gila | 33.8509816 | -110.2577249 | 1563.0 | 1563.0 |
3060 | Coat Spring | Spring | AZ | Navajo | 36.12678 | -110.3330424 | 1926.0 | 1926.0 |
3143 | Comar Spring | Spring | AZ | Navajo | 35.5308428 | -110.4162424 | 1732.0 | 1732.0 |
3333 | Cottonwood Creek | Stream | AZ | Coconino | 36.050817 | -111.9865535 | 800.0 | 800.0 |
3342 | Cottonwood Creek | Stream | AZ | Mohave | 36.901931 | -112.5632547 | 1389.0 | 1389.0 |
Method 3. General model-based interpolation¶
We won't show the demo; check slides for the general idea.
In [9]:
%config SqlMagic.displaylimit = 100
Method 4. [simple] Correlation across ordered rows¶
In [10]:
%%sql
-- The following doesn't work in PostgreSQL!
WITH buggy AS (
SELECT *,
CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
ELSE lag(elev_in_m, 1)
OVER (ORDER BY feature_id)
END AS imputed_elev_in_m
FROM holey
)
SELECT * FROM buggy ORDER BY elev_in_m DESC LIMIT 100;
Running query in 'postgresql://localhost:5432/gnis'
100 rows affected.
Out[10]:
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m |
---|---|---|---|---|---|---|---|---|
214923 | Lewes Junior High School | School | DE | Sussex | 38.7687232 | -75.1460168 | None | 89.0 |
209568 | Ox Hill | Populated Place | CT | Fairfield | 41.2284297 | -73.2190027 | None | 31.0 |
204786 | San Acacio | Populated Place | CO | Costilla | 37.2139025 | -105.5644584 | None | 1712.0 |
205451 | Berkshire | Populated Place | CT | Fairfield | 41.4087069 | -73.2595601 | None | 1945.0 |
205915 | Calebs Peak | Summit | CT | Litchfield | 41.7559276 | -73.4648456 | None | 116.0 |
194300 | Bull Pasture Well | Well | CO | Las Animas | 37.6594568 | -104.3541397 | None | 1692.0 |
198468 | Gurley Dam | Dam | CO | San Miguel | 38.035547 | -108.2459128 | None | None |
189182 | Lienhart Mine | Mine | CO | Chaffee | 38.9377726 | -106.2547437 | None | 3173.0 |
197727 | Pine Brook Hill | Populated Place | CO | Boulder | 40.0499857 | -105.3147137 | None | 2601.0 |
198347 | King Reservoir Number 1 | Reservoir | CO | Mesa | 38.8657673 | -108.6665656 | None | 1568.0 |
168322 | Batson Post Office (historical) | Post Office | AL | Coosa | 0.0 | 0.0 | None | None |
175017 | Camp Foster Creek | Stream | CO | Pitkin | 39.3308169 | -107.2222735 | None | 2514.0 |
179621 | Bartlett Mountain | Summit | CO | Lake | 39.3769097 | -106.1591995 | None | 2231.0 |
184045 | West Prong Red Creek | Stream | CO | Archuleta | 37.3825024 | -107.4417201 | None | 2461.0 |
184321 | Park Reservoir | Reservoir | CO | Archuleta | 37.3118044 | -107.1509411 | None | 2782.0 |
168566 | Teddy Post Office (historical) | Post Office | AL | Escambia | 0.0 | 0.0 | None | None |
168977 | Pondville Post Office (historical) | Post Office | AL | Bibb | 0.0 | 0.0 | None | 108.0 |
134138 | Fourth Creek School (historical) | School | AL | Sumter | 32.4595827 | -88.1983586 | None | 163.0 |
196693 | Morapos Cow Camp | Locale | CO | Rio Blanco | 40.1841431 | -107.5350619 | None | 2643.0 |
170234 | Lady Moon Lake | Lake | CO | Larimer | 40.7695992 | -105.5267598 | None | None |
195705 | Smith Reservoir | Reservoir | CO | Otero | 37.7677873 | -103.9584627 | None | 1208.0 |
168550 | Filmore Post Office (historical) | Post Office | AL | Escambia | 0.0 | 0.0 | None | None |
167976 | Walnut Hill Post Office (historical) | Post Office | AL | Tallapoosa | 0.0 | 0.0 | None | None |
157305 | Kell Cemetery | Cemetery | AL | Morgan | 0.0 | 0.0 | None | 195.0 |
174510 | Thompson Reservoir Number 3 | Reservoir | CO | Mesa | 39.0454778 | -108.7755725 | None | 2542.0 |
83954 | Bend Post Office (historical) | Post Office | AR | Marion | 0.0 | 0.0 | None | 62.0 |
176528 | McKinney Draw | Valley | CO | Dolores | 37.8663073 | -108.5713788 | None | 3035.0 |
168503 | Howles Store Post Office (historical) | Post Office | AL | Elmore | 0.0 | 0.0 | None | None |
159665 | Gilgal Baptist Church | Church | AL | Tuscaloosa | 33.1231761 | -87.5002824 | None | 268.0 |
167869 | Byron Post Office (historical) | Post Office | AL | Randolph | 0.0 | 0.0 | None | None |
150422 | New Home | Populated Place | AL | DeKalb | 34.6023063 | -85.6991353 | None | 398.0 |
114664 | Boggy Branch | Stream | FL | Holmes | 30.9731224 | -85.7449601 | None | 78.0 |
127439 | Stony Ridge | Ridge | AL | Shelby | 33.2201151 | -86.7260973 | None | 95.0 |
148083 | Eastern Gate Baptist Church | Church | AL | Tuscaloosa | 33.1528975 | -87.4752819 | None | 166.0 |
141361 | Burwells Classical Mathematical School (historical) | School | AL | Sumter | 0.0 | 0.0 | None | 200.0 |
166604 | Tecumseh Post Office (historical) | Post Office | AL | Cherokee | 0.0 | 0.0 | None | None |
83356 | Thorn Cemetery | Cemetery | AR | Faulkner | 35.2539553 | -92.3909621 | None | 57.0 |
69785 | Township of Durham | Civil | AR | Washington | 35.944583 | -93.9840421 | None | 574.0 |
129541 | Pace Lake | Reservoir | AL | Baldwin | 30.5214126 | -87.5095787 | None | 29.0 |
163989 | Truck Mine Number 5 | Mine | AL | Bibb | 0.0 | 0.0 | None | None |
56602 | Yocum Cove | Stream | AR | Marion | 36.4700658 | -92.6473881 | None | 432.0 |
163762 | Williams Cemetery | Cemetery | AL | Limestone | 0.0 | 0.0 | None | 365.0 |
191511 | Green Mountain | Summit | CO | Fremont | 38.5349305 | -105.6604581 | None | 2645.0 |
114844 | Brasher Branch | Stream | AL | Shelby | 33.4049059 | -86.5942081 | None | 171.0 |
168098 | Simsville Post Office (historical) | Post Office | AL | Bullock | 0.0 | 0.0 | None | None |
84873 | Forley Post Office (historical) | Post Office | AR | Washington | 0.0 | 0.0 | None | 46.0 |
164524 | Conecuh National Forest | Forest | AL | Covington | 31.094322 | -86.6341634 | None | 43.0 |
83633 | Lost Mountain Wilderness Retreat Airport (historical) | Airport | AR | Newton | 35.9292434 | -92.9626717 | None | None |
55132 | Springfield Church | Church | AR | Pulaski | 34.6084264 | -92.0890328 | None | 62.0 |
39670 | Old Dominion Library | Building | AZ | Gila | 33.406722 | -110.7917765 | None | 1217.0 |
13547 | West Well | Well | AZ | Navajo | 35.0355772 | -110.4826281 | None | 1100.0 |
169917 | Elkhorn Mine | Mine | CO | Routt | 40.9844118 | -106.9519934 | None | 1998.0 |
8268 | Moore Ranch | Locale | AZ | Cochise | 31.755093 | -109.5122882 | None | 1214.0 |
76112 | Greenbrier United Methodist Church | Church | AR | Faulkner | 35.2320267 | -92.3884883 | None | 380.0 |
4590 | Fiftysix Tank | Reservoir | AZ | Gila | 33.6899872 | -110.3340976 | None | 2434.0 |
141577 | Nelson Mine | Mine | AL | Shelby | 0.0 | 0.0 | None | None |
167859 | Tallapoosa Post Office (historical) | Post Office | AL | Randolph | 0.0 | 0.0 | None | 58.0 |
165584 | Westlawn Elementary School | School | AL | Morgan | 34.6109253 | -87.0069538 | None | 76.0 |
120344 | Hooper City | Populated Place | AL | Jefferson | 33.5553829 | -86.8411025 | None | 93.0 |
141780 | Breman Coal Bed Mine | Mine | AL | Cullman | 0.0 | 0.0 | None | None |
81705 | Rauch Estate Open Pit Mine | Mine | AR | Pulaski | 34.645093 | -92.3037624 | None | 196.0 |
82079 | Ditch Number 14 | Canal | AR | Mississippi | 35.4623055 | -90.1589823 | None | 233.0 |
139004 | Jollywell Branch | Stream | AL | Hale | 32.5698557 | -87.5577822 | None | 70.0 |
64652 | Wright (historical) | Populated Place | AR | Pike | 34.1628804 | -93.5376838 | None | 202.0 |
84960 | Maddock Post Office (historical) | Post Office | AR | White | 0.0 | 0.0 | None | None |
82103 | Ditch Number 16 | Canal | AR | Mississippi | 35.7136884 | -90.2509305 | None | None |
84600 | Meredith Post Office (historical) | Post Office | AR | Woodruff | 0.0 | 0.0 | None | 99.0 |
42036 | Lockett Tank | Reservoir | AZ | Coconino | 35.1490729 | -112.1510338 | None | 1402.0 |
124364 | Oyster Bay | Lake | AL | Baldwin | 30.2726058 | -87.7309074 | None | 152.0 |
129336 | Zion Church | Church | AL | Choctaw | 32.2870865 | -88.0708525 | None | 256.0 |
61851 | Tyronza Cut-off | Gut | AR | Mississippi | 35.630635 | -90.208151 | None | 74.0 |
113459 | Bar Branch | Stream | AL | Greene | 32.8073977 | -88.078892 | None | 125.0 |
50309 | Hopewell Church | Church | AR | Arkansas | 34.3839893 | -91.1540068 | None | 71.0 |
143509 | Martha Thomas Elementary School | School | AL | Mobile | 30.7546326 | -88.1147234 | None | 11.0 |
51173 | Linn Creek | Stream | AR | Van Buren | 35.4467454 | -92.3301548 | None | 99.0 |
35236 | The Park | Flat | AZ | Yavapai | 34.1486467 | -111.4984726 | None | 624.0 |
49578 | Grassy Lake | Swamp | AR | Hempstead | 33.6887443 | -93.8693011 | None | None |
65723 | Locke School (historical) | School | AR | Crawford | 0.0 | 0.0 | None | 392.0 |
3032 | Cline Well | Well | AZ | Cochise | 31.9000849 | -110.3428525 | None | 1685.0 |
42122 | Panther Tank | Reservoir | AZ | Greenlee | 33.1650631 | -109.1942369 | None | None |
68497 | Rock Creek Golf Course | Locale | AR | Pulaski | 34.7123145 | -92.3570975 | None | 95.0 |
25467 | Caterpillar Tank Wash | Stream | AZ | Maricopa | 33.725592 | -112.2937718 | None | 2795.0 |
55868 | Vaden Cemetery | Cemetery | AR | Clark | 33.8720415 | -92.9457159 | None | 82.0 |
33574 | Reno Canyon | Valley | AZ | Yavapai | 34.3618875 | -111.9572469 | None | 2092.0 |
71212 | Cook Cemetery | Cemetery | AR | Izard | 36.0686754 | -92.0079225 | None | 200.0 |
26578 | Boggy Creek | Stream | AZ | Apache | 33.741996 | -109.4398075 | None | 2315.0 |
155060 | Emelle | Populated Place | AL | Sumter | 32.7292998 | -88.3144755 | None | 72.0 |
42070 | Whipple Tank | Reservoir | AZ | Navajo | 34.1783484 | -109.8782247 | None | None |
49350 | Lewis Johnson Family Cemetery | Cemetery | AR | Johnson | 35.4122994 | -93.3587851 | None | 194.0 |
52399 | Mount Pleasant Church | Church | AR | Jefferson | 34.393707 | -92.1109776 | None | 61.0 |
166564 | Hancock Post Office (historical) | Post Office | AL | Cherokee | 0.0 | 0.0 | None | 71.0 |
127065 | Snake Creek | Stream | AL | Lee | 32.3843101 | -85.2785524 | None | 134.0 |
84769 | Manoah Post Office (historical) | Post Office | AR | Drew | 0.0 | 0.0 | None | None |
28394 | Deer Spring Mountain | Summit | AZ | Navajo | 34.0107604 | -109.9752047 | None | 1692.0 |
12504 | Tipperary Mine | Mine | AZ | Mohave | 34.9300043 | -114.3727359 | None | 603.0 |
53531 | Pleasant Grove Cemetery | Cemetery | AR | Mississippi | 35.6192561 | -90.0791719 | None | 60.0 |
13040 | Union High School | School | AZ | Maricopa | 33.4517105 | -112.0670927 | None | None |
59185 | Ellis Ridge | Ridge | AR | Pope | 35.6202133 | -93.0366957 | None | 78.0 |
46393 | Bentley Ridge Cemetery | Cemetery | AR | Baxter | 36.4872804 | -92.2826514 | None | 58.0 |
222119 | Deadman Gulch | Valley | CA | Sacramento | 38.2805281 | -121.3513615 | None | 149.0 |
In [11]:
%%sql
-- Here's a UDA fix from
-- https://stackoverflow.com/questions/18987791/how-do-i-efficiently-select-the-previous-non-null-value
CREATE OR REPLACE FUNCTION coalesce_agg_sfunc(state anyelement, value anyelement) RETURNS anyelement AS
$$
SELECT coalesce(value, state);
$$ LANGUAGE SQL;
CREATE AGGREGATE coalesce_agg(anyelement) (
SFUNC = coalesce_agg_sfunc,
STYPE = anyelement);
Running query in 'postgresql://localhost:5432/gnis'
Out[11]:
In [12]:
%%sql
-- Fixed to handle repeated NULLs
WITH fixed AS (
SELECT *,
coalesce_agg(elev_in_m) OVER (order by feature_id) AS imputed_elev_in_m
FROM holey
)
SELECT * FROM fixed ORDER BY imputed_elev_in_m DESC LIMIT 100;
Running query in 'postgresql://localhost:5432/gnis'
100 rows affected.
Out[12]:
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m |
---|---|---|---|---|---|---|---|---|
1416580 | Browne Tower | Summit | AK | Denali | 63.1000583 | -150.9314441 | 4628.0 | 4628.0 |
188010 | Grizzly Peak | Summit | CO | La Plata | 37.6076296 | -107.5818648 | 4171.0 | 4171.0 |
191939 | Spread Eagle Peak | Summit | CO | Custer | 38.1252369 | -105.6435197 | 4090.0 | 4090.0 |
187393 | Emery Peak | Summit | CO | San Juan | 37.8882556 | -107.6210535 | 4054.0 | 4054.0 |
181832 | Baker Mountain | Summit | CO | Clear Creek | 39.6607654 | -105.8402538 | 3832.0 | 3832.0 |
1703800 | Lucky Jim One Mine | Mine | CO | Park | 39.1519347 | -105.742509 | None | 3728.0 |
1703775 | Park County Claims Mine | Mine | CO | Park | 39.3510997 | -105.9869614 | 3728.0 | 3728.0 |
393894 | Donaldson Peak | Summit | ID | Custer | 44.0639981 | -113.7003853 | 3661.0 | 3661.0 |
1599419 | Dundee Mountain | Summit | WY | Park | 43.8771622 | -109.2682693 | 3640.0 | 3640.0 |
939450 | Master Man Mine | Mine | NM | Colfax | 36.6297542 | -105.2088936 | 3633.0 | 3633.0 |
179902 | Ceresco Number 4 Adit | Mine | CO | Lake | 39.3644327 | -106.1714108 | 3604.0 | 3604.0 |
189898 | Antoro Shaft | Mine | CO | Saguache | 38.3297213 | -106.125297 | 3577.0 | 3577.0 |
1684419 | Maumee Mine | Mine | CO | Park | 39.269434 | -106.1461318 | 3555.0 | 3555.0 |
1598169 | Big Sheep Mountain | Summit | WY | Sublette | 43.2751868 | -109.8786551 | 3537.0 | 3537.0 |
1603836 | Twins Glacier | Glacier | WY | Sublette | 43.149673 | -109.6540386 | 3517.0 | 3517.0 |
1427800 | Mount Ellen | Range | UT | Garfield | 38.1097069 | -110.8137658 | 3501.0 | 3501.0 |
1734540 | Gold Run Placer | Mine | CO | San Miguel | 37.831384 | -107.8345093 | 3449.0 | 3449.0 |
1659681 | Slide Mountain | Summit | CA | Mono | 38.0931634 | -119.4458359 | 3379.0 | 3379.0 |
178928 | Windy Gap | Gap | CO | Montezuma | 37.4749967 | -108.0917424 | 3354.0 | 3354.0 |
197523 | Payne Gulch | Valley | CO | Gunnison | 38.6838024 | -106.4736984 | 3340.0 | 3340.0 |
780007 | Black Mountain | Summit | MT | Park | 45.4713346 | -110.4664499 | 3337.0 | 3337.0 |
203950 | Trachyte Knob | Summit | CO | Teller | 38.7731142 | -105.1065711 | 3314.0 | 3314.0 |
179985 | Buckskin Joe Mine | Mine | CO | Park | 39.2899894 | -106.0955752 | 3309.0 | 3309.0 |
189852 | Sheep Mountain Trail | Trail | CO | Saguache | 38.3297215 | -106.1889107 | 3298.0 | 3298.0 |
2572516 | Windy Lake | Lake | WY | Fremont | 42.6200138 | -108.9825615 | 3274.0 | 3274.0 |
1444260 | Pearl Lake | Lake | UT | Uintah | 40.7891427 | -109.9571503 | 3263.0 | 3263.0 |
181328 | Floral Park Campground | Locale | CO | Clear Creek | 39.7894314 | -105.7805628 | 3258.0 | 3258.0 |
1724633 | Garfield Grouse Mine | Mine | CO | Teller | 38.7330442 | -105.1363688 | 3251.0 | 3251.0 |
186674 | Copley Lake | Lake | CO | Gunnison | 38.8722009 | -107.0830674 | 3251.0 | 3251.0 |
1414051 | Mount Spurr | Summit | AK | Kenai Peninsula | 61.2997222 | -152.2513889 | 3248.0 | 3248.0 |
779022 | Anaconda Range | Range | MT | Deer Lodge | 46.0500221 | -113.1901018 | 3246.0 | 3246.0 |
1446855 | Upper Chain Lake | Reservoir | UT | Duchesne | 40.7113249 | -110.2569397 | 3240.0 | 3240.0 |
931481 | Double Eagle V Ski Trail | Trail | NM | Santa Fe | 35.7911406 | -105.7950188 | 3240.0 | 3240.0 |
262684 | Little Papoose Creek | Stream | CA | Trinity | 40.8009791 | -122.735855 | None | 3224.0 |
262186 | Kings River Trail | Trail | CA | Inyo | 37.1679871 | -118.48483 | 3224.0 | 3224.0 |
1441700 | Henderson Creek | Stream | UT | Garfield | 37.6077619 | -112.0557457 | None | 3223.0 |
1441572 | Hancock Peak | Summit | UT | Iron | 37.6349097 | -112.7619161 | 3223.0 | 3223.0 |
187877 | Raber Lohr Ditch | Canal | CO | Hinsdale | 37.6786111 | -107.3222767 | 3218.0 | 3218.0 |
181537 | Utah Hill | Summit | CO | Gilpin | 39.861742 | -105.5629767 | 3202.0 | 3202.0 |
1735447 | Little Keystone Shaft Mine | Mine | CO | Lake | 39.2477668 | -106.4028044 | 3196.0 | 3196.0 |
861027 | Prospect Peak | Summit | NV | Elko | 41.7961632 | -115.3514744 | 3176.0 | 3176.0 |
189182 | Lienhart Mine | Mine | CO | Chaffee | 38.9377726 | -106.2547437 | None | 3173.0 |
189124 | Holywater Mine | Mine | CO | Chaffee | 38.8094397 | -106.358913 | 3173.0 | 3173.0 |
1693838 | Eva Mine Complex | Mine | CO | Clear Creek | 39.6794316 | -105.7277832 | 3166.0 | 3166.0 |
1600421 | Horse Pasture Draw | Valley | WY | Sublette | 42.913281 | -110.3380592 | None | 3151.0 |
1599988 | Goosewing Guard Station | Locale | WY | Teton | 43.5629975 | -110.3032447 | None | 3151.0 |
1599888 | Galena Basin | Basin | WY | Park | 43.905233 | -109.2923677 | 3151.0 | 3151.0 |
177789 | Green Mountain | Summit | CO | Grand | 40.2961549 | -105.8221575 | 3138.0 | 3138.0 |
181517 | Nugget | Populated Place | CO | Gilpin | 39.8635984 | -105.5883336 | 3128.0 | 3128.0 |
1428713 | Highline Trail | Trail | UT | Duchesne | 40.688557 | -110.8173912 | 3120.0 | 3120.0 |
199320 | Trio Dam | Dam | CO | Delta | 39.0466474 | -107.8356185 | 3108.0 | 3108.0 |
862430 | Great Basin National Park | Park | NV | White Pine | 38.9460953 | -114.2579658 | 3102.0 | 3102.0 |
173389 | Crater Lake | Lake | CO | Rio Blanco | 40.1400804 | -107.1207371 | 3097.0 | 3097.0 |
1667345 | Little Shot Lake | Lake | CA | Fresno | 37.1252053 | -118.8260505 | 3096.0 | 3096.0 |
189116 | East Willow Creek | Stream | CO | Gunnison | 38.7538837 | -106.4819717 | 3089.0 | 3089.0 |
1684255 | Pacific-Gold Dust Mine | Mine | CO | Summit | 39.4685983 | -106.0244624 | 3089.0 | 3089.0 |
175365 | Little Giant Reservoir Number 1 | Reservoir | CO | Delta | 39.0405469 | -107.7873713 | 3084.0 | 3084.0 |
1724727 | Lexington Mine | Mine | CO | Teller | 38.7405442 | -105.1560918 | 3076.0 | 3076.0 |
176038 | Dubach Meadows | Flat | CO | Eagle | 39.5402631 | -106.6630902 | 3074.0 | 3074.0 |
917164 | Hidden Treasure Mine | Mine | NM | Colfax | 36.6114208 | -105.2411167 | 3068.0 | 3068.0 |
396870 | Mount McGuire | Summit | ID | Lemhi | 45.174046 | -114.602137 | 3059.0 | 3059.0 |
1684932 | Allied Gold Mining Company Mine | Mine | CO | Boulder | 39.9930424 | -105.576667 | 3049.0 | 3049.0 |
176195 | Guller Creek | Stream | CO | Summit | 39.5035976 | -106.1758565 | 3035.0 | 3035.0 |
176528 | McKinney Draw | Valley | CO | Dolores | 37.8663073 | -108.5713788 | None | 3035.0 |
1445191 | Roundy Spring | Spring | UT | Garfield | 38.0396633 | -111.6903788 | 3025.0 | 3025.0 |
1445316 | Sand Wash | Stream | UT | Garfield | 37.8838658 | -112.425208 | None | 3025.0 |
199786 | Kendall Dam | Dam | CO | Mesa | 39.1149804 | -107.8072845 | 3024.0 | 3024.0 |
1442905 | Long Flat | Flat | UT | Piute | 38.2683083 | -112.3552077 | 3011.0 | 3011.0 |
1444807 | Red Lake | Lake | UT | Daggett | 40.7950226 | -109.8341217 | 3004.0 | 3004.0 |
909123 | North Baldy | Summit | NM | Socorro | 34.0503013 | -107.1812222 | 3002.0 | 3002.0 |
190348 | Rudifor Lake | Lake | CO | Rio Grande | 37.5242201 | -106.6722501 | 2997.0 | 2997.0 |
176590 | Kilpacker Creek | Stream | CO | Dolores | 37.8233271 | -108.0511831 | 2995.0 | 2995.0 |
261678 | Iceberg Lake | Lake | CA | Madera | 37.6704116 | -119.1691178 | 2981.0 | 2981.0 |
1438313 | Avon | Populated Place | UT | Cache | 41.5329911 | -111.8132743 | None | 2978.0 |
1438255 | Antimony Knoll | Summit | UT | Garfield | 38.0514094 | -111.815261 | 2978.0 | 2978.0 |
1683679 | Majestic Mine | Mine | CO | Boulder | 40.1247079 | -105.5588892 | 2978.0 | 2978.0 |
806032 | Imelda Lake | Lake | MT | Park | 45.1663594 | -109.9120372 | 2977.0 | 2977.0 |
931531 | Geronimo Ski Trail | Trail | NM | Lincoln | 33.3973012 | -105.7885986 | 2976.0 | 2976.0 |
930659 | Wild Onion Ski Trail | Trail | NM | Lincoln | 33.3973012 | -105.7885986 | 2976.0 | 2976.0 |
1447716 | Manning Meadows Reservoir | Reservoir | UT | Piute | 38.4904107 | -112.0702195 | 2971.0 | 2971.0 |
173999 | Deadhorse Spring | Spring | CO | Garfield | 39.6572051 | -107.2211646 | 2962.0 | 2962.0 |
172337 | Lower Rainbow Lake | Lake | CO | Jackson | 40.6481244 | -106.6193546 | 2960.0 | 2960.0 |
268442 | Trumbull Lake | Lake | CA | Mono | 38.0534252 | -119.2578319 | 2957.0 | 2957.0 |
188507 | Phelps Cabin Creek | Stream | CO | Hinsdale | 38.1488844 | -107.1053286 | 2954.0 | 2954.0 |
1446325 | Tall Four Reservoir | Reservoir | UT | Garfield | 37.9200023 | -111.8341652 | 2950.0 | 2950.0 |
1685340 | Legion Mine | Mine | CO | Clear Creek | 39.7794318 | -105.6813922 | 2942.0 | 2942.0 |
1609296 | Sepulcher Mountain | Summit | WY | Park | 44.9903881 | -110.7665775 | 2939.0 | 2939.0 |
2664241 | Tucker Ponds | Lake | CO | Mineral | 37.49492 | -106.75954 | 2934.0 | 2934.0 |
1724854 | Rose Maud Mine | Mine | CO | Teller | 38.7324889 | -105.1574808 | 2932.0 | 2932.0 |
853139 | Puritan Mine | Mine | NV | Elko | 41.8485179 | -115.3800676 | 2910.0 | 2910.0 |
186986 | McIntosh Mountain | Summit | CO | Gunnison | 38.5750862 | -107.014652 | 2905.0 | 2905.0 |
1447736 | Farnsworth Dam | Dam | UT | Sevier | 38.7733041 | -111.6624129 | 2897.0 | 2897.0 |
269289 | Woodchuck Country | Area | CA | Fresno | 37.0174429 | -118.8887203 | 2884.0 | 2884.0 |
859080 | Buller Mountain | Summit | NV | Mineral | 38.3808139 | -118.6877054 | 2880.0 | 2880.0 |
1606084 | Thompson Reservoir | Reservoir | WY | Carbon | 41.1983044 | -106.3655775 | 2878.0 | 2878.0 |
183510 | The Castle | Pillar | CO | Jefferson | 39.3105454 | -105.3824978 | 2873.0 | 2873.0 |
39068 | Lee Valley Dam | Dam | AZ | Apache | 33.9417151 | -109.4989794 | 2870.0 | 2870.0 |
180170 | Crystal Lakes | Lake | CO | Lake | 39.1950626 | -106.3383877 | 2867.0 | 2867.0 |
2724004 | Petrie Ditch | Canal | CO | Park | 39.3091177 | -105.8102555 | 2862.0 | 2862.0 |
177295 | Hourglass Reservoir | Reservoir | CO | Larimer | 40.5793186 | -105.6319349 | 2859.0 | 2859.0 |
Method 4: Generalized interpolation across rows, with Linear Interpolation as an example¶
Forward pass
In [14]:
%%sql
-- 1. Forward assign run numbers to rows, propagate val, get nextval
CREATE OR REPLACE VIEW forward AS
SELECT *,
SUM(CASE WHEN elev_in_m IS NULL THEN 0 ELSE 1 END)
OVER (ORDER BY feature_id) AS run,
coalesce_agg(elev_in_m) OVER (ORDER BY feature_id) AS run_start,
CASE WHEN elev_in_m IS NULL
THEN lead(elev_in_m, 1) OVER (ORDER BY feature_id)
ELSE NULL
END AS nextval
FROM holey;
SELECT * FROM forward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[14]:
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | run | run_start | nextval |
---|---|---|---|---|---|---|---|---|---|---|
48638 | Dry Creek | Stream | AR | Union | 33.3242919 | -92.9121074 | 46.0 | 168 | 46.0 | None |
49192 | Foster Cemetery | Cemetery | AR | Benton | 36.4612601 | -94.1579926 | 401.0 | 169 | 401.0 | None |
49214 | Fox Hollow | Valley | AR | Sharp | 36.4575632 | -91.4387492 | 194.0 | 170 | 194.0 | None |
49350 | Lewis Johnson Family Cemetery | Cemetery | AR | Johnson | 35.4122994 | -93.3587851 | None | 170 | 194.0 | None |
49578 | Grassy Lake | Swamp | AR | Hempstead | 33.6887443 | -93.8693011 | None | 170 | 194.0 | 47.0 |
49802 | Hamilton Creek | Stream | AR | Ouachita | 33.7592735 | -92.9482208 | 47.0 | 171 | 47.0 | None |
49925 | Hartsoe Cemetery | Cemetery | AR | Greene | 36.1833735 | -90.4370362 | 111.0 | 172 | 111.0 | None |
50059 | Hickory Grove Cemetery | Cemetery | AR | Lonoke | 34.6431418 | -92.028191 | 71.0 | 173 | 71.0 | None |
50309 | Hopewell Church | Church | AR | Arkansas | 34.3839893 | -91.1540068 | None | 173 | 71.0 | 26.0 |
50661 | John Mack Slough | Gut | AR | Calhoun | 33.3431778 | -92.4573712 | 26.0 | 174 | 26.0 | None |
50948 | Lake Norfork Church | Church | AR | Baxter | 36.3853437 | -92.224323 | 222.0 | 175 | 222.0 | None |
51021 | Leachville Cemetery | Cemetery | AR | Jackson | 35.5949353 | -91.243365 | 70.0 | 176 | 70.0 | None |
Backward pass
In [15]:
%%sql
-- 2. Backward: assign run_end, run_size, run_rank
CREATE OR REPLACE VIEW backward AS
SELECT *,
CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
ELSE coalesce_agg(nextval) OVER (PARTITION BY run ORDER BY feature_id DESC)
END AS run_end,
count(*) OVER (PARTITION BY run) AS run_size,
-1 + (RANK() OVER (PARTITION BY run ORDER BY feature_id)) AS run_rank
FROM forward;
SELECT * FROM backward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[15]:
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | run | run_start | nextval | run_end | run_size | run_rank |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48638 | Dry Creek | Stream | AR | Union | 33.3242919 | -92.9121074 | 46.0 | 168 | 46.0 | None | 46.0 | 1 | 0 |
49192 | Foster Cemetery | Cemetery | AR | Benton | 36.4612601 | -94.1579926 | 401.0 | 169 | 401.0 | None | 401.0 | 1 | 0 |
49214 | Fox Hollow | Valley | AR | Sharp | 36.4575632 | -91.4387492 | 194.0 | 170 | 194.0 | None | 194.0 | 3 | 0 |
49350 | Lewis Johnson Family Cemetery | Cemetery | AR | Johnson | 35.4122994 | -93.3587851 | None | 170 | 194.0 | None | 47.0 | 3 | 1 |
49578 | Grassy Lake | Swamp | AR | Hempstead | 33.6887443 | -93.8693011 | None | 170 | 194.0 | 47.0 | 47.0 | 3 | 2 |
49802 | Hamilton Creek | Stream | AR | Ouachita | 33.7592735 | -92.9482208 | 47.0 | 171 | 47.0 | None | 47.0 | 1 | 0 |
49925 | Hartsoe Cemetery | Cemetery | AR | Greene | 36.1833735 | -90.4370362 | 111.0 | 172 | 111.0 | None | 111.0 | 1 | 0 |
50059 | Hickory Grove Cemetery | Cemetery | AR | Lonoke | 34.6431418 | -92.028191 | 71.0 | 173 | 71.0 | None | 71.0 | 2 | 0 |
50309 | Hopewell Church | Church | AR | Arkansas | 34.3839893 | -91.1540068 | None | 173 | 71.0 | 26.0 | 26.0 | 2 | 1 |
50661 | John Mack Slough | Gut | AR | Calhoun | 33.3431778 | -92.4573712 | 26.0 | 174 | 26.0 | None | 26.0 | 1 | 0 |
50948 | Lake Norfork Church | Church | AR | Baxter | 36.3853437 | -92.224323 | 222.0 | 175 | 222.0 | None | 222.0 | 1 | 0 |
51021 | Leachville Cemetery | Cemetery | AR | Jackson | 35.5949353 | -91.243365 | 70.0 | 176 | 70.0 | None | 70.0 | 1 | 0 |
Scalar function pass
In [16]:
%%sql
-- 3. Simple scalar pass
CREATE OR REPLACE VIEW final AS
SELECT *,
run_start + (run_rank-1)*((run_end-run_start)/(run_size))
AS interpolated
FROM backward;
SELECT * FROM final ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
Running query in 'postgresql://localhost:5432/gnis'
12 rows affected.
Out[16]:
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | run | run_start | nextval | run_end | run_size | run_rank | interpolated |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48638 | Dry Creek | Stream | AR | Union | 33.3242919 | -92.9121074 | 46.0 | 168 | 46.0 | None | 46.0 | 1 | 0 | 46.0 |
49192 | Foster Cemetery | Cemetery | AR | Benton | 36.4612601 | -94.1579926 | 401.0 | 169 | 401.0 | None | 401.0 | 1 | 0 | 401.0 |
49214 | Fox Hollow | Valley | AR | Sharp | 36.4575632 | -91.4387492 | 194.0 | 170 | 194.0 | None | 194.0 | 3 | 0 | 194.0 |
49350 | Lewis Johnson Family Cemetery | Cemetery | AR | Johnson | 35.4122994 | -93.3587851 | None | 170 | 194.0 | None | 47.0 | 3 | 1 | 194.0 |
49578 | Grassy Lake | Swamp | AR | Hempstead | 33.6887443 | -93.8693011 | None | 170 | 194.0 | 47.0 | 47.0 | 3 | 2 | 145.0 |
49802 | Hamilton Creek | Stream | AR | Ouachita | 33.7592735 | -92.9482208 | 47.0 | 171 | 47.0 | None | 47.0 | 1 | 0 | 47.0 |
49925 | Hartsoe Cemetery | Cemetery | AR | Greene | 36.1833735 | -90.4370362 | 111.0 | 172 | 111.0 | None | 111.0 | 1 | 0 | 111.0 |
50059 | Hickory Grove Cemetery | Cemetery | AR | Lonoke | 34.6431418 | -92.028191 | 71.0 | 173 | 71.0 | None | 71.0 | 2 | 0 | 71.0 |
50309 | Hopewell Church | Church | AR | Arkansas | 34.3839893 | -91.1540068 | None | 173 | 71.0 | 26.0 | 26.0 | 2 | 1 | 71.0 |
50661 | John Mack Slough | Gut | AR | Calhoun | 33.3431778 | -92.4573712 | 26.0 | 174 | 26.0 | None | 26.0 | 1 | 0 | 26.0 |
50948 | Lake Norfork Church | Church | AR | Baxter | 36.3853437 | -92.224323 | 222.0 | 175 | 222.0 | None | 222.0 | 1 | 0 | 222.0 |
51021 | Leachville Cemetery | Cemetery | AR | Jackson | 35.5949353 | -91.243365 | 70.0 | 176 | 70.0 | None | 70.0 | 1 | 0 | 70.0 |
In [17]:
%sql EXPLAIN Analyze SELECT * from final LIMIT 500;
Running query in 'postgresql://localhost:5432/gnis'
20 rows affected.
Out[17]:
QUERY PLAN |
---|
Limit (cost=1206.79..1275.92 rows=500 width=216) (actual time=32.796..34.651 rows=500 loops=1) |
-> Subquery Scan on backward (cost=1206.79..2074.77 rows=6278 width=216) (actual time=32.795..34.621 rows=500 loops=1) |
-> WindowAgg (cost=1206.79..1902.13 rows=6278 width=208) (actual time=32.792..34.527 rows=500 loops=1) |
-> WindowAgg (cost=1206.79..1792.26 rows=6278 width=200) (actual time=32.785..34.151 rows=501 loops=1) |
-> Incremental Sort (cost=1206.79..1682.40 rows=6278 width=192) (actual time=32.779..33.859 rows=502 loops=1) |
Sort Key: forward.run, forward.feature_id |
Presorted Key: forward.run |
Full-sort Groups: 16 Sort Method: quicksort Average Memory: 29kB Peak Memory: 29kB |
-> WindowAgg (cost=1204.85..1330.41 rows=6278 width=192) (actual time=32.693..33.669 rows=514 loops=1) |
-> Sort (cost=1204.85..1220.55 rows=6278 width=184) (actual time=32.618..32.655 rows=515 loops=1) |
Sort Key: forward.run, forward.feature_id DESC |
Sort Method: quicksort Memory: 2102kB |
-> Subquery Scan on forward (cost=604.80..808.83 rows=6278 width=184) (actual time=3.450..27.244 rows=11533 loops=1) |
-> WindowAgg (cost=604.80..746.05 rows=6278 width=184) (actual time=3.449..26.057 rows=11533 loops=1) |
-> Sort (cost=604.80..620.49 rows=6278 width=160) (actual time=3.367..4.023 rows=11533 loops=1) |
Sort Key: holey.feature_id |
Sort Method: quicksort Memory: 2011kB |
-> Seq Scan on holey (cost=0.00..208.78 rows=6278 width=160) (actual time=0.014..1.050 rows=11533 loops=1) |
Planning Time: 0.229 ms |
Execution Time: 34.940 ms |
String distance¶
In [18]:
%reload_ext sql
%sql postgresql://localhost:5432/postgres
Connecting and switching to connection postgresql://localhost:5432/postgres
In [19]:
%sql CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
Running query in 'postgresql://localhost:5432/postgres'
Out[19]:
In [20]:
%%sql
DROP TABLE IF EXISTS Strings;
CREATE TABLE Strings (str1 TEXT, str2 TEXT);
INSERT INTO Strings VALUES
('Lisa', 'List'),
('Lisa', 'License'),
('Joe', 'Noel');
Running query in 'postgresql://localhost:5432/postgres'
3 rows affected.
Out[20]:
In [21]:
%%sql
SELECT *,
levenshtein(str1, str2),
soundex(str1) as soundex1,
soundex(str2) as soundex2,
dmetaphone(str1) AS dmetaphone1,
dmetaphone(str2) AS dmetaphone2,
dmetaphone_alt(str1) AS dmetaphone_alt1,
dmetaphone_alt(str2) AS dmetaphone_alt2
FROM Strings;
Running query in 'postgresql://localhost:5432/postgres'
3 rows affected.
Out[21]:
str1 | str2 | levenshtein | soundex1 | soundex2 | dmetaphone1 | dmetaphone2 | dmetaphone_alt1 | dmetaphone_alt2 |
---|---|---|---|---|---|---|---|---|
Lisa | List | 1 | L200 | L230 | LS | LST | LS | LST |
Lisa | License | 4 | L200 | L252 | LS | LSNS | LS | LSNS |
Joe | Noel | 2 | J000 | N400 | J | NL | A | NL |
[Extra] Pretty print of row runs, for lecture¶
(for lecture slides. Generated with pandas)
In [22]:
import pandas as pd
pd.DataFrame.from_dict({"feature_id": [493, 494, 495, 496],
"feature_name": ["Adobe Canyon", "Adobe Flats", "Adobe Lake", "Adobe Mountain"],
"elev_in_m": [1423, "None", "None", 517],
"run": [84, 84, 84, 85],
"run_start": [1423, 1423, 1423, 517],
"run_end": [517, 517, 517, 517],
"run_size": [3, 3, 3, 1],
"run_rank": [1, 2, 3, 1],
"interpol_elev_in_m": [1423, 1121, 819, 517]})
Out[22]:
feature_id | feature_name | elev_in_m | run | run_start | run_end | run_size | run_rank | interpol_elev_in_m | |
---|---|---|---|---|---|---|---|---|---|
0 | 493 | Adobe Canyon | 1423 | 84 | 1423 | 517 | 3 | 1 | 1423 |
1 | 494 | Adobe Flats | None | 84 | 1423 | 517 | 3 | 2 | 1121 |
2 | 495 | Adobe Lake | None | 84 | 1423 | 517 | 3 | 3 | 819 |
3 | 496 | Adobe Mountain | 517 | 85 | 517 | 517 | 1 | 1 | 517 |