import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)
sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline
This is the GNIS dataset from the previous lecture. If you didn't load in the database, run the below cell before connecting.
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis'
!psql -h localhost -d gnis -f ../lec13/data/gnis.sql
pg_terminate_backend ---------------------- (0 rows) DROP DATABASE CREATE DATABASE SET SET SET SET SET set_config ------------ (1 row) SET SET SET SET SET SET CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE COPY 3195 COPY 11533 CREATE INDEX
%reload_ext sql
%sql postgresql://localhost:5432/gnis
%sql SELECT setseed(0.12345);
setseed |
---|
%%sql
DROP TABLE IF EXISTS holey CASCADE;
CREATE TABLE holey AS
SELECT feature_id, feature_name, feature_class, state_alpha, county_name,
prim_lat_dec, prim_long_dec,
CASE WHEN random() > 0.9 THEN NULL
ELSE elev_in_m
END AS elev_in_m
FROM national;
SELECT count(elev_in_m)::float / count(*) FROM holey;
?column? |
---|
0.8755744385675887 |
%%sql
SELECT * FROM holey TABLESAMPLE BERNOULLI(.1);
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m |
---|---|---|---|---|---|---|---|
129624 | Jernigan Lake | Reservoir | AL | Escambia | 31.0448271 | -87.1856044 | 35.0 |
542541 | Poley | Populated Place | LA | Livingston | 30.3382495 | -90.7964848 | 3.0 |
729853 | Burkhart Prairie | Area | MO | Newton | 36.9617304 | -94.483 | 348.0 |
901807 | First Baptist Church | Church | NM | Otero | 32.8995325 | -105.960265 | 1323.0 |
1156802 | Multnomah Falls Bar | Bar | OR | Multnomah | 45.5831737 | -122.1234199 | 4.0 |
1347909 | Stevens Park School | School | TX | Dallas | 32.7601302 | -96.8650032 | 176.0 |
1477093 | Barley | Populated Place | VA | Greensville | 36.5640382 | -77.7241508 | 86.0 |
1541653 | Left Fork Elk Creek | Stream | WV | Mingo | 37.7301032 | -82.1334652 | 269.0 |
1726318 | Teton Mine | Mine | WY | Natrona | 42.8896821 | -107.4520133 | 2111.0 |
2080975 | Clay County Courthouse | Building | MN | Clay | 46.8841667 | -96.7633333 | 275.0 |
%%sql
WITH elevavg AS (SELECT avg(elev_in_m) FROM holey)
SELECT h.*,
CASE WHEN h.elev_in_m IS NOT NULL THEN h.elev_in_m
ELSE e.avg
END AS imputed_elev_in_m
FROM holey h, elevavg e
LIMIT 10;
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m |
---|---|---|---|---|---|---|---|---|
1230 | Belmont Mountains | Range | AZ | Maricopa | 33.642258 | -112.9010129 | 931.0 | 931.0 |
1839 | Bootlegger Saddle | Gap | AZ | Cochise | 31.8931474 | -109.2831176 | 2707.0 | 2707.0 |
2336 | Cabeza Prieta Game Range | Park | AZ | Yuma | 32.250056 | -113.45074 | 275.0 | 275.0 |
2750 | Chandler Springs | Spring | AZ | Navajo | 35.3766788 | -110.4754096 | 1685.0 | 1685.0 |
3032 | Cline Well | Well | AZ | Cochise | 31.9000849 | -110.3428525 | None | 483.962467815409 |
3039 | Clover Tank | Reservoir | AZ | Gila | 33.8509816 | -110.2577249 | 1563.0 | 1563.0 |
3060 | Coat Spring | Spring | AZ | Navajo | 36.12678 | -110.3330424 | 1926.0 | 1926.0 |
3143 | Comar Spring | Spring | AZ | Navajo | 35.5308428 | -110.4162424 | 1732.0 | 1732.0 |
3333 | Cottonwood Creek | Stream | AZ | Coconino | 36.050817 | -111.9865535 | 800.0 | 800.0 |
3342 | Cottonwood Creek | Stream | AZ | Mohave | 36.901931 | -112.5632547 | 1389.0 | 1389.0 |
# Training is an aggregate function
# Here we'll train the model in SQL just for fun
result = %sql SELECT regr_slope(elev_in_m, prim_long_dec), \
regr_intercept(elev_in_m, prim_long_dec) FROM holey
slope, intercept = result[0]
slope, intercept
(-10.314179001097786, -477.9603219322606)
%%sql
SELECT prim_long_dec, elev_in_m
FROM holey
WHERE prim_long_dec >= -122
ORDER BY prim_long_dec
OFFSET 40;
prim_long_dec | elev_in_m |
---|---|
-121.8142546 | 469.0 |
-121.8099602 | None |
-121.8083604 | 1454.0 |
-121.8056805 | None |
-121.803556 | 1614.0 |
-121.7974945 | 1264.0 |
-121.7960906 | 219.0 |
-121.784236 | 1684.0 |
-121.7832846 | 2.0 |
-121.7785678 | 46.0 |
%%sql
SELECT *,
CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
ELSE {{slope}}*prim_long_dec + {{intercept}}
END AS imputed_elev_in_m
FROM holey
LIMIT 10;
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m |
---|---|---|---|---|---|---|---|---|
1230 | Belmont Mountains | Range | AZ | Maricopa | 33.642258 | -112.9010129 | 931.0 | 931.0 |
1839 | Bootlegger Saddle | Gap | AZ | Cochise | 31.8931474 | -109.2831176 | 2707.0 | 2707.0 |
2336 | Cabeza Prieta Game Range | Park | AZ | Yuma | 32.250056 | -113.45074 | 275.0 | 275.0 |
2750 | Chandler Springs | Spring | AZ | Navajo | 35.3766788 | -110.4754096 | 1685.0 | 1685.0 |
3032 | Cline Well | Well | AZ | Cochise | 31.9000849 | -110.3428525 | None | 660.1356102444698 |
3039 | Clover Tank | Reservoir | AZ | Gila | 33.8509816 | -110.2577249 | 1563.0 | 1563.0 |
3060 | Coat Spring | Spring | AZ | Navajo | 36.12678 | -110.3330424 | 1926.0 | 1926.0 |
3143 | Comar Spring | Spring | AZ | Navajo | 35.5308428 | -110.4162424 | 1732.0 | 1732.0 |
3333 | Cottonwood Creek | Stream | AZ | Coconino | 36.050817 | -111.9865535 | 800.0 | 800.0 |
3342 | Cottonwood Creek | Stream | AZ | Mohave | 36.901931 | -112.5632547 | 1389.0 | 1389.0 |
We won't show the demo; check slides for the general idea.
%reload_ext sql
%sql postgresql://localhost:5432/gnis
%config SqlMagic.displaylimit = 100
%%sql
-- The following does not work in PostgreSQL!
WITH buggy AS (
SELECT *,
CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
ELSE lag(elev_in_m, 1)
OVER (ORDER BY feature_id)
END AS imputed_elev_in_m
FROM holey
)
SELECT feature_id, elev_in_m, imputed_elev_in_m FROM buggy ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
feature_id | elev_in_m | imputed_elev_in_m |
---|---|---|
48638 | 46.0 | 46.0 |
49192 | 401.0 | 401.0 |
49214 | 194.0 | 194.0 |
49350 | None | 194.0 |
49578 | None | None |
49802 | 47.0 | 47.0 |
49925 | 111.0 | 111.0 |
50059 | 71.0 | 71.0 |
50309 | None | 71.0 |
50661 | 26.0 | 26.0 |
50948 | 222.0 | 222.0 |
51021 | 70.0 | 70.0 |
%%sql
-- Here's a UDA fix from
-- https://stackoverflow.com/questions/18987791/how-do-i-efficiently-select-the-previous-non-null-value
CREATE OR REPLACE FUNCTION coalesce_agg_sfunc(state anyelement, value anyelement) RETURNS anyelement AS
$$
SELECT coalesce(value, state);
$$ LANGUAGE SQL;
CREATE OR REPLACE AGGREGATE coalesce_agg(anyelement) (
SFUNC = coalesce_agg_sfunc,
STYPE = anyelement);
%%sql
-- Fixed to handle repeated NULLs
WITH fixed AS (
SELECT *,
coalesce_agg(elev_in_m) OVER (order by feature_id) AS imputed_elev_in_m
FROM holey
)
SELECT feature_id, elev_in_m, imputed_elev_in_m FROM fixed ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
feature_id | elev_in_m | imputed_elev_in_m |
---|---|---|
48638 | 46.0 | 46.0 |
49192 | 401.0 | 401.0 |
49214 | 194.0 | 194.0 |
49350 | None | 194.0 |
49578 | None | 194.0 |
49802 | 47.0 | 47.0 |
49925 | 111.0 | 111.0 |
50059 | 71.0 | 71.0 |
50309 | None | 71.0 |
50661 | 26.0 | 26.0 |
50948 | 222.0 | 222.0 |
51021 | 70.0 | 70.0 |
Forward pass
%%sql
-- 1. Forward assign run numbers to rows, propagate val, get nextval
CREATE OR REPLACE VIEW forward AS
SELECT *,
SUM(CASE WHEN elev_in_m IS NULL THEN 0 ELSE 1 END)
OVER (ORDER BY feature_id) AS run,
coalesce_agg(elev_in_m) OVER (ORDER BY feature_id) AS run_start,
CASE WHEN elev_in_m IS NULL
THEN lead(elev_in_m, 1) OVER (ORDER BY feature_id)
ELSE NULL
END AS nextval
FROM holey;
SELECT * FROM forward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | run | run_start | nextval |
---|---|---|---|---|---|---|---|---|---|---|
48638 | Dry Creek | Stream | AR | Union | 33.3242919 | -92.9121074 | 46.0 | 168 | 46.0 | None |
49192 | Foster Cemetery | Cemetery | AR | Benton | 36.4612601 | -94.1579926 | 401.0 | 169 | 401.0 | None |
49214 | Fox Hollow | Valley | AR | Sharp | 36.4575632 | -91.4387492 | 194.0 | 170 | 194.0 | None |
49350 | Lewis Johnson Family Cemetery | Cemetery | AR | Johnson | 35.4122994 | -93.3587851 | None | 170 | 194.0 | None |
49578 | Grassy Lake | Swamp | AR | Hempstead | 33.6887443 | -93.8693011 | None | 170 | 194.0 | 47.0 |
49802 | Hamilton Creek | Stream | AR | Ouachita | 33.7592735 | -92.9482208 | 47.0 | 171 | 47.0 | None |
49925 | Hartsoe Cemetery | Cemetery | AR | Greene | 36.1833735 | -90.4370362 | 111.0 | 172 | 111.0 | None |
50059 | Hickory Grove Cemetery | Cemetery | AR | Lonoke | 34.6431418 | -92.028191 | 71.0 | 173 | 71.0 | None |
50309 | Hopewell Church | Church | AR | Arkansas | 34.3839893 | -91.1540068 | None | 173 | 71.0 | 26.0 |
50661 | John Mack Slough | Gut | AR | Calhoun | 33.3431778 | -92.4573712 | 26.0 | 174 | 26.0 | None |
50948 | Lake Norfork Church | Church | AR | Baxter | 36.3853437 | -92.224323 | 222.0 | 175 | 222.0 | None |
51021 | Leachville Cemetery | Cemetery | AR | Jackson | 35.5949353 | -91.243365 | 70.0 | 176 | 70.0 | None |
Backward pass
%%sql
-- 2. Backward: assign run_end, run_size, run_rank
CREATE OR REPLACE VIEW backward AS
SELECT *,
CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
ELSE coalesce_agg(nextval) OVER (PARTITION BY run ORDER BY feature_id DESC)
END AS run_end,
count(*) OVER (PARTITION BY run) AS run_size,
-1 + (RANK() OVER (PARTITION BY run ORDER BY feature_id)) AS run_rank
FROM forward;
SELECT * FROM backward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | run | run_start | nextval | run_end | run_size | run_rank |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48638 | Dry Creek | Stream | AR | Union | 33.3242919 | -92.9121074 | 46.0 | 168 | 46.0 | None | 46.0 | 1 | 0 |
49192 | Foster Cemetery | Cemetery | AR | Benton | 36.4612601 | -94.1579926 | 401.0 | 169 | 401.0 | None | 401.0 | 1 | 0 |
49214 | Fox Hollow | Valley | AR | Sharp | 36.4575632 | -91.4387492 | 194.0 | 170 | 194.0 | None | 194.0 | 3 | 0 |
49350 | Lewis Johnson Family Cemetery | Cemetery | AR | Johnson | 35.4122994 | -93.3587851 | None | 170 | 194.0 | None | 47.0 | 3 | 1 |
49578 | Grassy Lake | Swamp | AR | Hempstead | 33.6887443 | -93.8693011 | None | 170 | 194.0 | 47.0 | 47.0 | 3 | 2 |
49802 | Hamilton Creek | Stream | AR | Ouachita | 33.7592735 | -92.9482208 | 47.0 | 171 | 47.0 | None | 47.0 | 1 | 0 |
49925 | Hartsoe Cemetery | Cemetery | AR | Greene | 36.1833735 | -90.4370362 | 111.0 | 172 | 111.0 | None | 111.0 | 1 | 0 |
50059 | Hickory Grove Cemetery | Cemetery | AR | Lonoke | 34.6431418 | -92.028191 | 71.0 | 173 | 71.0 | None | 71.0 | 2 | 0 |
50309 | Hopewell Church | Church | AR | Arkansas | 34.3839893 | -91.1540068 | None | 173 | 71.0 | 26.0 | 26.0 | 2 | 1 |
50661 | John Mack Slough | Gut | AR | Calhoun | 33.3431778 | -92.4573712 | 26.0 | 174 | 26.0 | None | 26.0 | 1 | 0 |
50948 | Lake Norfork Church | Church | AR | Baxter | 36.3853437 | -92.224323 | 222.0 | 175 | 222.0 | None | 222.0 | 1 | 0 |
51021 | Leachville Cemetery | Cemetery | AR | Jackson | 35.5949353 | -91.243365 | 70.0 | 176 | 70.0 | None | 70.0 | 1 | 0 |
Scalar function pass
%%sql
-- 3. Simple scalar pass
CREATE OR REPLACE VIEW final AS
SELECT *,
run_start + (run_rank-1)*((run_end-run_start)/(run_size))
AS interpolated
FROM backward;
SELECT * FROM final ORDER BY feature_id ASC LIMIT 12 OFFSET 183;
feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | run | run_start | nextval | run_end | run_size | run_rank | interpolated |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48638 | Dry Creek | Stream | AR | Union | 33.3242919 | -92.9121074 | 46.0 | 168 | 46.0 | None | 46.0 | 1 | 0 | 46.0 |
49192 | Foster Cemetery | Cemetery | AR | Benton | 36.4612601 | -94.1579926 | 401.0 | 169 | 401.0 | None | 401.0 | 1 | 0 | 401.0 |
49214 | Fox Hollow | Valley | AR | Sharp | 36.4575632 | -91.4387492 | 194.0 | 170 | 194.0 | None | 194.0 | 3 | 0 | 194.0 |
49350 | Lewis Johnson Family Cemetery | Cemetery | AR | Johnson | 35.4122994 | -93.3587851 | None | 170 | 194.0 | None | 47.0 | 3 | 1 | 194.0 |
49578 | Grassy Lake | Swamp | AR | Hempstead | 33.6887443 | -93.8693011 | None | 170 | 194.0 | 47.0 | 47.0 | 3 | 2 | 145.0 |
49802 | Hamilton Creek | Stream | AR | Ouachita | 33.7592735 | -92.9482208 | 47.0 | 171 | 47.0 | None | 47.0 | 1 | 0 | 47.0 |
49925 | Hartsoe Cemetery | Cemetery | AR | Greene | 36.1833735 | -90.4370362 | 111.0 | 172 | 111.0 | None | 111.0 | 1 | 0 | 111.0 |
50059 | Hickory Grove Cemetery | Cemetery | AR | Lonoke | 34.6431418 | -92.028191 | 71.0 | 173 | 71.0 | None | 71.0 | 2 | 0 | 71.0 |
50309 | Hopewell Church | Church | AR | Arkansas | 34.3839893 | -91.1540068 | None | 173 | 71.0 | 26.0 | 26.0 | 2 | 1 | 71.0 |
50661 | John Mack Slough | Gut | AR | Calhoun | 33.3431778 | -92.4573712 | 26.0 | 174 | 26.0 | None | 26.0 | 1 | 0 | 26.0 |
50948 | Lake Norfork Church | Church | AR | Baxter | 36.3853437 | -92.224323 | 222.0 | 175 | 222.0 | None | 222.0 | 1 | 0 | 222.0 |
51021 | Leachville Cemetery | Cemetery | AR | Jackson | 35.5949353 | -91.243365 | 70.0 | 176 | 70.0 | None | 70.0 | 1 | 0 | 70.0 |
%sql EXPLAIN Analyze SELECT * from final LIMIT 500;
QUERY PLAN |
---|
Limit (cost=1206.79..1275.92 rows=500 width=216) (actual time=31.791..33.647 rows=500 loops=1) |
-> Subquery Scan on backward (cost=1206.79..2074.77 rows=6278 width=216) (actual time=31.789..33.614 rows=500 loops=1) |
-> WindowAgg (cost=1206.79..1902.13 rows=6278 width=208) (actual time=31.787..33.537 rows=500 loops=1) |
-> WindowAgg (cost=1206.79..1792.26 rows=6278 width=200) (actual time=31.782..33.164 rows=501 loops=1) |
-> Incremental Sort (cost=1206.79..1682.40 rows=6278 width=192) (actual time=31.777..32.887 rows=502 loops=1) |
Sort Key: forward.run, forward.feature_id |
Presorted Key: forward.run |
Full-sort Groups: 16 Sort Method: quicksort Average Memory: 29kB Peak Memory: 29kB |
-> WindowAgg (cost=1204.85..1330.41 rows=6278 width=192) (actual time=31.694..32.700 rows=514 loops=1) |
-> Sort (cost=1204.85..1220.55 rows=6278 width=184) (actual time=31.648..31.689 rows=515 loops=1) |
Sort Key: forward.run, forward.feature_id DESC |
Sort Method: quicksort Memory: 2102kB |
-> Subquery Scan on forward (cost=604.80..808.83 rows=6278 width=184) (actual time=3.266..26.821 rows=11533 loops=1) |
-> WindowAgg (cost=604.80..746.05 rows=6278 width=184) (actual time=3.266..25.651 rows=11533 loops=1) |
-> Sort (cost=604.80..620.49 rows=6278 width=160) (actual time=3.195..3.816 rows=11533 loops=1) |
Sort Key: holey.feature_id |
Sort Method: quicksort Memory: 2011kB |
-> Seq Scan on holey (cost=0.00..208.78 rows=6278 width=160) (actual time=0.008..0.957 rows=11533 loops=1) |
Planning Time: 0.136 ms |
Execution Time: 33.874 ms |
%reload_ext sql
%sql postgresql://localhost:5432/postgres
%sql CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
%%sql
DROP TABLE IF EXISTS Strings;
CREATE TABLE Strings (str1 TEXT, str2 TEXT);
INSERT INTO Strings VALUES
('Lisa', 'List'),
('Lisa', 'License'),
('Joe', 'Noel');
%%sql
SELECT *,
levenshtein(str1, str2),
soundex(str1) as soundex1,
soundex(str2) as soundex2,
dmetaphone(str1) AS dmetaphone1,
dmetaphone(str2) AS dmetaphone2,
dmetaphone_alt(str1) AS dmetaphone_alt1,
dmetaphone_alt(str2) AS dmetaphone_alt2
FROM Strings;
str1 | str2 | levenshtein | soundex1 | soundex2 | dmetaphone1 | dmetaphone2 | dmetaphone_alt1 | dmetaphone_alt2 |
---|---|---|---|---|---|---|---|---|
Lisa | List | 1 | L200 | L230 | LS | LST | LS | LST |
Lisa | License | 4 | L200 | L252 | LS | LSNS | LS | LSNS |
Joe | Noel | 2 | J000 | N400 | J | NL | A | NL |