import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)
sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline
%reload_ext sql
%sql postgresql://localhost:5432/postgres
%sql SELECT setseed(0.42) -- to fix randomness
| setseed | 
|---|
Just like a Python package, we need to import tablefunc in order to use the normal_rand function.
%sql CREATE EXTENSION IF NOT EXISTS tablefunc;
Assume our datapoints are truly Normal. Simulate them in observations.
%sql DROP TABLE IF EXISTS observations CASCADE;
%sql CREATE TABLE observations AS \
     SELECT normal_rand AS x \
     FROM normal_rand(1000, 50, 5);
results = %sql SELECT x FROM observations;
sns.displot(results.dict(), fill=True, kde=True, bins=20,
            height=4, aspect=2)
<seaborn.axisgrid.FacetGrid at 0x7f2e62b58d50>
Construct a view called normal_outliers using the 2 sigma metric.
# construct the view
%%sql
CREATE OR REPLACE VIEW normal_outliers AS
WITH bounds AS (
   SELECT avg(x) - 2*stddev(x) AS lo,
   avg(x) + 2*stddev(x) AS hi
   FROM observations
)
SELECT x AS outlier
FROM observations o, bounds b 
WHERE x NOT BETWEEN b.lo AND b.hi;
# query the view
%sql SELECT * FROM normal_outliers;
| outlier | 
|---|
| 59.82230258659907 | 
| 39.688576522693424 | 
| 61.3054707730887 | 
| 61.29016170171167 | 
| 37.638986178841535 | 
| 38.80156045940347 | 
| 36.444988608291084 | 
| 34.799624865301666 | 
| 39.71593476243275 | 
| 61.272855781811394 | 
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
            height=4, aspect=1.5)
plt.xlim(30, 70)
(30.0, 70.0)
A.
## plots outliers and original
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)
(30.0, 70.0)
B.
# correct
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)
(30.0, 70.0)
C.
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers) \
               UNION ALL \
               SELECT x, 'cleaned' AS label \
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)
(30.0, 70.0)
D.
results = %sql SELECT x, 'cleaned' AS label \
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)
(30.0, 70.0)
%sql UPDATE observations SET x = x*10 \
      WHERE x = (SELECT MAX(x) FROM observations);
%sql SELECT min(x), max(x) FROM observations
| min | max | 
|---|---|
| 33.81466007123295 | 681.9343532189914 | 
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 700)
(30.0, 700.0)
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
(30.0, 70.0)
## reset before moving on
%sql UPDATE observations SET x = x/10 \
      WHERE x = (SELECT MAX(x) FROM OBSERVATIONS);
import numpy as np
arr = np.array([92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, -40, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr
array([  92,   19,  101,   58, 1053,   91,   26,   78,   10,   13,  -40,
        101,   86,   85,   15,   89,   89,   28,   -5,   41])
np.mean(arr), np.median(arr)
(101.5, 68.0)
np.percentile(arr, 95, method='nearest')
101
np.percentile(arr, 5, method='nearest')
-5
arr_winsorized = np.array([92, 19, 101, 58, 101, 91, 26, 78, 10, 13, -5, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr_winsorized
array([ 92,  19, 101,  58, 101,  91,  26,  78,  10,  13,  -5, 101,  86,
        85,  15,  89,  89,  28,  -5,  41])
np.mean(arr_winsorized), np.median(arr_winsorized)
(55.65, 68.0)
%%sql
CREATE OR REPLACE VIEW p1p99 AS
SELECT percentile_cont(.01) WITHIN GROUP (ORDER BY x) AS p1,
           percentile_cont(.99) WITHIN GROUP (ORDER BY x) AS p99
      FROM observations;
SELECT * FROM p1p99;
| p1 | p99 | 
|---|---|
| 39.11510510063385 | 62.38853671085507 | 
%%sql
CREATE OR REPLACE VIEW trimmed_observations AS
SELECT o.x, 'trimmed' AS label
  FROM observations o, p1p99 p
 WHERE o.x BETWEEN p.p1 AND p.p99
UNION ALL
SELECT o.x, 'original' AS label
  FROM observations o;
CREATE OR REPLACE VIEW trimmed_outliers AS
SELECT o.*
  FROM observations o, p1p99 p
 WHERE o.x NOT BETWEEN p.p1 AND p.p99;
results = %sql SELECT * from trimmed_observations
sns.displot(results.dict(), x="x", kind='hist', hue='label', kde=True, bins=20,
           height=4, aspect=1.5)
<seaborn.axisgrid.FacetGrid at 0x7f2e50e25390>
# corrupt one value
%sql UPDATE observations SET x = x*10 \
      WHERE x = (SELECT MAX(x) FROM observations);
Turns out, we still have as many outliers, which we trim.
Because we are trimming based on the order of the data and not on the distribution statistics (mean/SD)
results = %sql SELECT * from trimmed_observations
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
(30.0, 70.0)
## reset before moving on
# %sql UPDATE observations SET x = x*10 \
#       WHERE x = (SELECT MAX(x) FROM OBSERVATIONS);
%%sql
CREATE OR REPLACE VIEW winsorized_observations AS
SELECT CASE WHEN o.x BETWEEN p.p1 AND p.p99 THEN o.x
            WHEN o.x < p.p1 THEN p.p1
            WHEN o.x > p.p99 THEN p.p99
        END AS x,
      'winsorized' AS label
  FROM observations o, p1p99 p
UNION ALL
SELECT o.x, 'original' AS label
  FROM observations o;
results = %sql SELECT * from winsorized_observations
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
(30.0, 70.0)
%%sql 
SELECT 'orig' AS distribution, min(x),
       percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
       percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
       percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
       max(x), avg(x), stddev(x), count(x) 
       FROM observations
UNION ALL
SELECT 'winsorized', min(x),
       percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
       percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
       percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
       max(x), avg(x), stddev(x), count(x) 
       FROM winsorized_observations WHERE label = 'winsorized'
UNION ALL 
SELECT 'trimmed', min(x),
       percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
       percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
       percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
       max(x), avg(x), stddev(x), count(x) 
       FROM trimmed_observations WHERE label = 'trimmed';
| distribution | min | p25 | median | p75 | max | avg | stddev | count | 
|---|---|---|---|---|---|---|---|---|
| orig | 33.81466007123295 | 46.590796172482186 | 49.89707347076867 | 53.06512874818042 | 681.9343532189914 | 50.4731336114083 | 20.56765062228802 | 1000 | 
| winsorized | 39.11510510063385 | 46.590796172482186 | 49.89707347076867 | 53.06512874818042 | 62.38853671085507 | 49.85009437230814 | 4.7890523372867415 | 1000 | 
| trimmed | 39.11624475909565 | 46.6622689719359 | 49.89707347076867 | 52.92677414266813 | 62.38553193932125 | 49.83169178999314 | 4.540953142225145 | 980 | 
%%sql
-- percentile_disc returns an actual data value near the percentile
-- percentile_cont returns an interpolated value at the percentile
CREATE OR REPLACE VIEW median AS
(SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY x) as median
  FROM observations);
%%sql
CREATE OR REPLACE VIEW mad AS
WITH
absdevs AS
(SELECT abs(x - median) as d
   FROM observations, median)
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY d) as mad
  FROM absdevs;
    
SELECT median, mad
  FROM median, mad;
| median | mad | 
|---|---|
| 49.89707347076867 | 3.228948745848946 | 
%%sql
CREATE OR REPLACE VIEW hampelx84x2_observations AS
SELECT o.x,
      'hampelx84x2' AS label
  FROM observations o, median, mad
 WHERE o.x BETWEEN (median-2*1.4826*mad) AND (median+2*1.4826*mad)
UNION ALL
SELECT o.x, 'orig' AS label
  FROM observations o;
CREATE OR REPLACE VIEW Hampel84x2_outliers AS
SELECT x
  FROM observations o, median, mad
 WHERE x NOT BETWEEN (median - 2*1.4826*mad) AND (median + 2*1.4826*mad);
results = %sql SELECT * FROM hampelx84x2_observations
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
(30.0, 70.0)
This is the GNIS dataset from the previous lecture. If you didn't load in the database, run the below cell before connecting.
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis' 
!psql -h localhost -d gnis -f ../lec13/data/gnis.sql
%reload_ext sql
%sql postgresql://localhost:5432/gnis
%sql SELECT setseed(0.12345);
| setseed | 
|---|
%%sql
DROP TABLE IF EXISTS holey CASCADE;
CREATE TABLE holey AS 
SELECT feature_id, feature_name, feature_class, state_alpha, county_name, 
       prim_lat_dec, prim_long_dec, 
       CASE WHEN random() > 0.9 THEN NULL
            ELSE elev_in_m
        END AS elev_in_m
  FROM national;
SELECT count(elev_in_m)::float / count(*) FROM holey;
| ?column? | 
|---|
| 0.8755744385675887 | 
%%sql
WITH elevavg AS (SELECT avg(elev_in_m) FROM holey)
SELECT h.*, 
       CASE WHEN h.elev_in_m IS NOT NULL THEN h.elev_in_m
            ELSE e.avg
        END AS imputed_elev_in_m
  FROM holey h, elevavg e
LIMIT 10;
| feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m | 
|---|---|---|---|---|---|---|---|---|
| 1230 | Belmont Mountains | Range | AZ | Maricopa | 33.642258 | -112.9010129 | 931.0 | 931.0 | 
| 1839 | Bootlegger Saddle | Gap | AZ | Cochise | 31.8931474 | -109.2831176 | 2707.0 | 2707.0 | 
| 2336 | Cabeza Prieta Game Range | Park | AZ | Yuma | 32.250056 | -113.45074 | 275.0 | 275.0 | 
| 2750 | Chandler Springs | Spring | AZ | Navajo | 35.3766788 | -110.4754096 | 1685.0 | 1685.0 | 
| 3032 | Cline Well | Well | AZ | Cochise | 31.9000849 | -110.3428525 | None | 483.962467815409 | 
| 3039 | Clover Tank | Reservoir | AZ | Gila | 33.8509816 | -110.2577249 | 1563.0 | 1563.0 | 
| 3060 | Coat Spring | Spring | AZ | Navajo | 36.12678 | -110.3330424 | 1926.0 | 1926.0 | 
| 3143 | Comar Spring | Spring | AZ | Navajo | 35.5308428 | -110.4162424 | 1732.0 | 1732.0 | 
| 3333 | Cottonwood Creek | Stream | AZ | Coconino | 36.050817 | -111.9865535 | 800.0 | 800.0 | 
| 3342 | Cottonwood Creek | Stream | AZ | Mohave | 36.901931 | -112.5632547 | 1389.0 | 1389.0 | 
# Training is an aggregate function
# Here we'll train the model in SQL just for fun
result = %sql SELECT regr_slope(elev_in_m, prim_long_dec), \
               regr_intercept(elev_in_m, prim_long_dec) FROM holey
slope, intercept = result[0]
slope, intercept
(-10.314179001097786, -477.9603219322606)
%%sql
SELECT *, 
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE {{slope}}*prim_long_dec + {{intercept}}
        END AS imputed_elev_in_m
  FROM holey
LIMIT 10;
| feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m | 
|---|---|---|---|---|---|---|---|---|
| 1230 | Belmont Mountains | Range | AZ | Maricopa | 33.642258 | -112.9010129 | 931.0 | 931.0 | 
| 1839 | Bootlegger Saddle | Gap | AZ | Cochise | 31.8931474 | -109.2831176 | 2707.0 | 2707.0 | 
| 2336 | Cabeza Prieta Game Range | Park | AZ | Yuma | 32.250056 | -113.45074 | 275.0 | 275.0 | 
| 2750 | Chandler Springs | Spring | AZ | Navajo | 35.3766788 | -110.4754096 | 1685.0 | 1685.0 | 
| 3032 | Cline Well | Well | AZ | Cochise | 31.9000849 | -110.3428525 | None | 660.1356102444698 | 
| 3039 | Clover Tank | Reservoir | AZ | Gila | 33.8509816 | -110.2577249 | 1563.0 | 1563.0 | 
| 3060 | Coat Spring | Spring | AZ | Navajo | 36.12678 | -110.3330424 | 1926.0 | 1926.0 | 
| 3143 | Comar Spring | Spring | AZ | Navajo | 35.5308428 | -110.4162424 | 1732.0 | 1732.0 | 
| 3333 | Cottonwood Creek | Stream | AZ | Coconino | 36.050817 | -111.9865535 | 800.0 | 800.0 | 
| 3342 | Cottonwood Creek | Stream | AZ | Mohave | 36.901931 | -112.5632547 | 1389.0 | 1389.0 | 
We won't show the demo; check slides for the general idea.
%config SqlMagic.displaylimit = 100
%%sql
-- The following doesn't work in PostgreSQL!
WITH buggy AS (
SELECT *,
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE lag(elev_in_m, 1)
                 OVER (ORDER BY feature_id)
        END AS imputed_elev_in_m
  FROM holey
)
SELECT * FROM buggy ORDER BY elev_in_m DESC LIMIT 100;
| feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m | 
|---|---|---|---|---|---|---|---|---|
| 902614 | Blue Spring | Spring | NM | San Miguel | 0.0 | 0.0 | None | 2025.0 | 
| 901753 | Emmanuel Baptist Church | Church | NM | San Miguel | 0.0 | 0.0 | None | None | 
| 901695 | Buffalo Head | Summit | NM | Colfax | 0.0 | 0.0 | None | None | 
| 901689 | Cerro de los Burros | Summit | NM | Rio Arriba | 0.0 | 0.0 | None | 1089.0 | 
| 894278 | San Jose Cemetery | Cemetery | NM | Doña Ana | 32.1148663 | -106.7050186 | None | 1421.0 | 
| 891907 | Mesquite Lake | Reservoir | NM | Luna | 31.9873677 | -107.4663164 | None | 1876.0 | 
| 888921 | Estancia Cemetery | Cemetery | NM | Torrance | 34.7412091 | -106.0803887 | None | 2039.0 | 
| 900845 | Bessie Case Spring | Spring | NM | Lincoln | 33.4111999 | -105.9607228 | None | None | 
| 896734 | Wolf Creek | Stream | NM | Mora | 35.8089296 | -104.9183328 | None | 1263.0 | 
| 886608 | Boone Well | Locale | NM | Doña Ana | 32.4270352 | -106.5116614 | None | 1396.0 | 
| 883621 | Windsor Park | Populated Place | NJ | Ocean | 39.9534514 | -74.1495839 | None | None | 
| 887946 | Cottonwood Wash | Stream | NM | San Juan | 36.7780573 | -108.2509094 | None | None | 
| 883958 | Louise Lake Dam | Dam | NJ | Monmouth | 40.1283729 | -74.1978092 | None | 276.0 | 
| 877984 | Ludlam Creek | Stream | NJ | Cape May | 39.2839001 | -74.6830684 | None | 32.0 | 
| 888206 | Dave Smith Windmill | Locale | NM | Grant | 32.4139724 | -108.2322663 | None | 1786.0 | 
| 862959 | Ichabod Range | Range | NV | Elko | 41.7235174 | -115.6278542 | None | 2030.0 | 
| 900669 | Gutierrez Spring | Spring | NM | Otero | 33.058974 | -105.9724903 | None | 2214.0 | 
| 874693 | Berkley Church | Church | NJ | Gloucester | 39.8090023 | -75.2140704 | None | 284.0 | 
| 766085 | Saint Nicholas Church (historical) | Church | MO | St. Louis (city) | 0.0 | 0.0 | None | 175.0 | 
| 890667 | Kenna Draw | Valley | NM | Chaves | 33.8031449 | -103.8819067 | None | 1194.0 | 
| 855500 | Maggie Mine | Mine | NV | Lander | 39.5379808 | -117.0528675 | None | 1605.0 | 
| 805412 | McCrea Mine | Mine | MT | Lewis and Clark | 0.0 | 0.0 | None | 1882.0 | 
| 758604 | Stephens and Akeman Airport (historical) | Airport | MO | Chariton | 39.4653017 | -93.1946478 | None | 141.0 | 
| 883489 | Locust Manor | Populated Place | NJ | Ocean | 40.0501163 | -74.2323659 | None | None | 
| 887875 | Corral | Locale | NM | Eddy | 32.0112363 | -104.0027044 | None | 1321.0 | 
| 857783 | Mountain Spring (historical) | Locale | NV | Pershing | 0.0 | 0.0 | None | None | 
| 846631 | Dean Spring Number Three | Spring | NV | Eureka | 40.2069375 | -116.5423132 | None | 1310.0 | 
| 831935 | Overton Lake | Lake | NE | Holt | 42.285096 | -99.2251813 | None | 483.0 | 
| 857977 | Edgemont School (historical) | School | NV | Elko | 41.7107339 | -116.2909322 | None | None | 
| 847043 | Niagara Cañon | Valley | NV | Nye | 38.7693237 | -117.2054653 | None | 1962.0 | 
| 864086 | Sunnyside-Kirch Wildlife Management Area Airport (historical) | Airport | NV | Nye | 38.4146731 | -115.03668 | None | 1289.0 | 
| 861013 | Potts Ranch | Locale | NV | Nye | 39.0865971 | -116.6411881 | None | 1709.0 | 
| 838724 | Big Canyon Ranch | Locale | NV | Washoe | 40.0849059 | -119.7410274 | None | 2004.0 | 
| 856083 | Millers | Populated Place | NV | Esmeralda | 38.1365977 | -117.4575928 | None | 1969.0 | 
| 884506 | Albert G Waters Stadium | Park | NJ | Middlesex | 40.5242719 | -74.2723682 | None | 95.0 | 
| 886172 | Beeson Station | Locale | NM | Eddy | 32.7828953 | -103.9596741 | None | 1878.0 | 
| 857694 | Centennial (historical) | Locale | NV | Nye | 0.0 | 0.0 | None | 555.0 | 
| 764862 | Trautman Lake Dam | Dam | MO | Ste. Genevieve | 37.8550524 | -90.2984542 | None | 288.0 | 
| 755525 | Valley Knob | Summit | MO | Dent | 0.0 | 0.0 | None | 348.0 | 
| 757560 | Bete Creek | Stream | MO | St. Louis | 0.0 | 0.0 | None | None | 
| 880021 | Saint John's Roman Catholic Church | Church | NJ | Essex | 40.738157 | -74.1659776 | None | 26.0 | 
| 869862 | Shaker Branch | Stream | NH | Merrimack | 43.3184132 | -71.4689601 | None | 594.0 | 
| 833535 | Solso School | School | NE | Madison | 41.7644533 | -97.6972752 | None | None | 
| 847442 | One Thousand One Ranch | Locale | NV | Lincoln | 37.741076 | -114.4449911 | None | 1811.0 | 
| 791922 | Taylor Mine | Mine | MT | Beaverhead | 45.233256 | -112.9894813 | None | 1774.0 | 
| 798110 | Nomland Dam | Dam | MT | McCone | 47.7366747 | -106.1472436 | None | 763.0 | 
| 738277 | Sugarcamp Bluff | Cliff | MO | Camden | 37.9044801 | -92.8907419 | None | 329.0 | 
| 745559 | Bacon (historical) | Locale | MO | Cass | 0.0 | 0.0 | None | 276.0 | 
| 854196 | Old Humboldt Mine | Mine | NV | Lander | 40.6135218 | -117.0892717 | None | 1907.0 | 
| 830975 | Lynchburg Cemetery | Cemetery | NE | Richardson | 40.1306579 | -95.9071931 | None | 693.0 | 
| 882713 | West Fork Minagamahone Brook | Stream | NJ | Monmouth | 40.2383834 | -74.1615129 | None | 0.0 | 
| 883462 | Hightstown High School | School | NJ | Mercer | 40.2612126 | -74.5301502 | None | 21.0 | 
| 837604 | Humboldt Hospital Heliport | Airport | NE | Richardson | 40.1666672 | -95.9336091 | None | 366.0 | 
| 814008 | 04N02E18ACAC01 Well | Well | MT | Broadwater | 46.1021509 | -111.5238582 | None | 812.0 | 
| 752036 | Sargent School | School | MO | Texas | 37.088386 | -92.0209876 | None | 269.0 | 
| 761826 | Deimeke Lake | Reservoir | MO | Audrain | 39.1954006 | -91.9053704 | None | 236.0 | 
| 804344 | SW SW Section 16 Mine | Mine | MT | Madison | 45.6604819 | -111.986931 | None | 1882.0 | 
| 799092 | Widgeon Slough | Lake | MT | Sheridan | 48.9647464 | -104.245728 | None | 1371.0 | 
| 824195 | 22N02W30B___01 Well | Well | MT | Teton | 47.6363413 | -111.9180529 | None | 771.0 | 
| 828506 | Cub Creek | Stream | NE | Jefferson | 40.2447236 | -97.1136453 | None | 578.0 | 
| 810480 | 03S44E03ACA_01 Well | Well | MT | Rosebud | 45.6083332 | -106.2822366 | None | 1596.0 | 
| 712928 | Post Post Office (historical) | Post Office | MS | Lauderdale | 0.0 | 0.0 | None | None | 
| 789082 | Pony Mine | Mine | MT | Madison | 45.6679821 | -111.9544299 | None | 1362.0 | 
| 847886 | Foreys Toll House (historical) | Locale | NV | Mineral | 37.9665971 | -118.3381755 | None | 1877.0 | 
| 810266 | 03S25E07C___01 Well | Well | MT | Yellowstone | 45.5843947 | -108.6884727 | None | 1045.0 | 
| 845041 | Woods Ditch | Canal | NV | Lyon | 38.9513059 | -119.1682088 | None | 2835.0 | 
| 676587 | Reform Chapel | Church | MS | Choctaw | 33.4304009 | -89.1458978 | None | 58.0 | 
| 775680 | Redstone | Populated Place | MT | Sheridan | 48.8216926 | -104.9441394 | None | 1940.0 | 
| 833397 | Shickley | Populated Place | NE | Fillmore | 40.4166737 | -97.7225446 | None | 809.0 | 
| 804835 | D J Silver/Saint Louis Mine | Mine | MT | Missoula | 47.2413172 | -114.7120716 | None | 1126.0 | 
| 824829 | 23N50E14CABB01 Well | Well | MT | Dawson | 47.7539035 | -105.2705401 | None | 586.0 | 
| 757694 | Our Lady of Lourdes Church (historical) | Church | MO | St. Louis | 0.0 | 0.0 | None | None | 
| 826167 | 27N26E17CBCB01 Well | Well | MT | Phillips | 48.0936077 | -108.4101416 | None | 1008.0 | 
| 670011 | Flag Chapel Baptist Church | Church | MS | Hinds | 32.3509782 | -90.2650901 | None | 113.0 | 
| 516216 | Walnut Grove Church | Church | KY | Owsley | 37.3914773 | -83.7496406 | None | 390.0 | 
| 754345 | Ruth | Populated Place | MO | Stone | 0.0 | 0.0 | None | 426.0 | 
| 739137 | Huff Branch | Stream | MO | Johnson | 0.0 | 0.0 | None | None | 
| 757643 | Hillside (historical) | Building | MO | St. Louis | 0.0 | 0.0 | None | None | 
| 579778 | Troy Center | Populated Place | ME | Waldo | 44.682568 | -69.2567136 | None | 51.0 | 
| 706386 | Mount Carmel Cemetery | Cemetery | MS | Lauderdale | 32.5373557 | -88.7183739 | None | 125.0 | 
| 753331 | Cureall School | Locale | MO | Howell | 36.678672 | -92.0707096 | None | 252.0 | 
| 743734 | Grand Prairie Presbyterian Church (historical) | Church | MO | Randolph | 0.0 | 0.0 | None | None | 
| 730185 | Raccoon Creek | Stream | MO | Grundy | 40.1286217 | -93.702999 | None | 255.0 | 
| 857174 | Strode Ranch | Locale | NV | Elko | 0.0 | 0.0 | None | 1385.0 | 
| 807886 | 08S49E32CB__01 Well | Well | MT | Powder River | 45.0941536 | -105.7530619 | None | 1187.0 | 
| 817293 | 10N19W07DDDC01 Well | Well | MT | Ravalli | 46.6315896 | -114.0273283 | None | 1169.0 | 
| 737285 | Cicero | Building | MO | Washington | 0.0 | 0.0 | None | None | 
| 757755 | Walnut Plains Camp Grounds (historical) | Locale | MO | St. Louis | 0.0 | 0.0 | None | None | 
| 757982 | Boyers Prairie | Area | MO | St. Francois | 0.0 | 0.0 | None | None | 
| 802880 | Daly Gulch Placer Mine | Mine | MT | Deer Lodge | 46.1938141 | -113.2378348 | None | 2258.0 | 
| 712334 | Fitzhugh Post Office (historical) | Post Office | MS | Sunflower | 0.0 | 0.0 | None | None | 
| 543368 | La Rosen | Populated Place | LA | Caddo | 32.4034884 | -93.7893467 | None | 27.0 | 
| 744733 | Wrights Mill | Locale | MO | Miller | 0.0 | 0.0 | None | 330.0 | 
| 754721 | Three Springs Camp Ground | Locale | MO | Butler | 0.0 | 0.0 | None | 455.0 | 
| 737097 | Wesco School | School | MO | Crawford | 0.0 | 0.0 | None | None | 
| 736860 | Berrys Mill | Locale | MO | Madison | 0.0 | 0.0 | None | 307.0 | 
| 743277 | Beck Cemetery | Cemetery | MO | Atchison | 40.3220754 | -95.4544788 | None | 289.0 | 
| 736877 | Marvin Collegiate Institute | School | MO | Madison | 0.0 | 0.0 | None | None | 
| 730991 | Claypool School | School | MO | Scott | 0.0 | 0.0 | None | None | 
| 816507 | 09N01E35CB__02 Well | Well | MT | Broadwater | 46.492156 | -111.5752493 | None | 1675.0 | 
%%sql
-- Here's a UDA fix from
-- https://stackoverflow.com/questions/18987791/how-do-i-efficiently-select-the-previous-non-null-value
CREATE OR REPLACE FUNCTION coalesce_agg_sfunc(state anyelement, value anyelement) RETURNS anyelement AS
$$
    SELECT coalesce(value, state);
$$ LANGUAGE SQL;
CREATE AGGREGATE coalesce_agg(anyelement) (
    SFUNC = coalesce_agg_sfunc,
    STYPE  = anyelement);
%%sql
-- Fixed to handle repeated NULLs
WITH fixed AS (
SELECT *,
       coalesce_agg(elev_in_m) OVER (order by feature_id) AS imputed_elev_in_m
  FROM holey
)
SELECT * FROM fixed ORDER BY imputed_elev_in_m DESC LIMIT 100;
| feature_id | feature_name | feature_class | state_alpha | county_name | prim_lat_dec | prim_long_dec | elev_in_m | imputed_elev_in_m | 
|---|---|---|---|---|---|---|---|---|
| 1416580 | Browne Tower | Summit | AK | Denali | 63.1000583 | -150.9314441 | 4628.0 | 4628.0 | 
| 188010 | Grizzly Peak | Summit | CO | La Plata | 37.6076296 | -107.5818648 | 4171.0 | 4171.0 | 
| 191939 | Spread Eagle Peak | Summit | CO | Custer | 38.1252369 | -105.6435197 | 4090.0 | 4090.0 | 
| 187393 | Emery Peak | Summit | CO | San Juan | 37.8882556 | -107.6210535 | 4054.0 | 4054.0 | 
| 181832 | Baker Mountain | Summit | CO | Clear Creek | 39.6607654 | -105.8402538 | 3832.0 | 3832.0 | 
| 1703800 | Lucky Jim One Mine | Mine | CO | Park | 39.1519347 | -105.742509 | None | 3728.0 | 
| 1703775 | Park County Claims Mine | Mine | CO | Park | 39.3510997 | -105.9869614 | 3728.0 | 3728.0 | 
| 393894 | Donaldson Peak | Summit | ID | Custer | 44.0639981 | -113.7003853 | 3661.0 | 3661.0 | 
| 1599419 | Dundee Mountain | Summit | WY | Park | 43.8771622 | -109.2682693 | 3640.0 | 3640.0 | 
| 939450 | Master Man Mine | Mine | NM | Colfax | 36.6297542 | -105.2088936 | 3633.0 | 3633.0 | 
| 179902 | Ceresco Number 4 Adit | Mine | CO | Lake | 39.3644327 | -106.1714108 | 3604.0 | 3604.0 | 
| 189898 | Antoro Shaft | Mine | CO | Saguache | 38.3297213 | -106.125297 | 3577.0 | 3577.0 | 
| 1684419 | Maumee Mine | Mine | CO | Park | 39.269434 | -106.1461318 | 3555.0 | 3555.0 | 
| 1598169 | Big Sheep Mountain | Summit | WY | Sublette | 43.2751868 | -109.8786551 | 3537.0 | 3537.0 | 
| 1603836 | Twins Glacier | Glacier | WY | Sublette | 43.149673 | -109.6540386 | 3517.0 | 3517.0 | 
| 1427800 | Mount Ellen | Range | UT | Garfield | 38.1097069 | -110.8137658 | 3501.0 | 3501.0 | 
| 1734540 | Gold Run Placer | Mine | CO | San Miguel | 37.831384 | -107.8345093 | 3449.0 | 3449.0 | 
| 1659681 | Slide Mountain | Summit | CA | Mono | 38.0931634 | -119.4458359 | 3379.0 | 3379.0 | 
| 178928 | Windy Gap | Gap | CO | Montezuma | 37.4749967 | -108.0917424 | 3354.0 | 3354.0 | 
| 197523 | Payne Gulch | Valley | CO | Gunnison | 38.6838024 | -106.4736984 | 3340.0 | 3340.0 | 
| 780007 | Black Mountain | Summit | MT | Park | 45.4713346 | -110.4664499 | 3337.0 | 3337.0 | 
| 203950 | Trachyte Knob | Summit | CO | Teller | 38.7731142 | -105.1065711 | 3314.0 | 3314.0 | 
| 179985 | Buckskin Joe Mine | Mine | CO | Park | 39.2899894 | -106.0955752 | 3309.0 | 3309.0 | 
| 189852 | Sheep Mountain Trail | Trail | CO | Saguache | 38.3297215 | -106.1889107 | 3298.0 | 3298.0 | 
| 2572516 | Windy Lake | Lake | WY | Fremont | 42.6200138 | -108.9825615 | 3274.0 | 3274.0 | 
| 1444260 | Pearl Lake | Lake | UT | Uintah | 40.7891427 | -109.9571503 | 3263.0 | 3263.0 | 
| 181328 | Floral Park Campground | Locale | CO | Clear Creek | 39.7894314 | -105.7805628 | 3258.0 | 3258.0 | 
| 1724633 | Garfield Grouse Mine | Mine | CO | Teller | 38.7330442 | -105.1363688 | 3251.0 | 3251.0 | 
| 186674 | Copley Lake | Lake | CO | Gunnison | 38.8722009 | -107.0830674 | 3251.0 | 3251.0 | 
| 1414051 | Mount Spurr | Summit | AK | Kenai Peninsula | 61.2997222 | -152.2513889 | 3248.0 | 3248.0 | 
| 779022 | Anaconda Range | Range | MT | Deer Lodge | 46.0500221 | -113.1901018 | 3246.0 | 3246.0 | 
| 1446855 | Upper Chain Lake | Reservoir | UT | Duchesne | 40.7113249 | -110.2569397 | 3240.0 | 3240.0 | 
| 931481 | Double Eagle V Ski Trail | Trail | NM | Santa Fe | 35.7911406 | -105.7950188 | 3240.0 | 3240.0 | 
| 262684 | Little Papoose Creek | Stream | CA | Trinity | 40.8009791 | -122.735855 | None | 3224.0 | 
| 262186 | Kings River Trail | Trail | CA | Inyo | 37.1679871 | -118.48483 | 3224.0 | 3224.0 | 
| 1441700 | Henderson Creek | Stream | UT | Garfield | 37.6077619 | -112.0557457 | None | 3223.0 | 
| 1441572 | Hancock Peak | Summit | UT | Iron | 37.6349097 | -112.7619161 | 3223.0 | 3223.0 | 
| 187877 | Raber Lohr Ditch | Canal | CO | Hinsdale | 37.6786111 | -107.3222767 | 3218.0 | 3218.0 | 
| 181537 | Utah Hill | Summit | CO | Gilpin | 39.861742 | -105.5629767 | 3202.0 | 3202.0 | 
| 1735447 | Little Keystone Shaft Mine | Mine | CO | Lake | 39.2477668 | -106.4028044 | 3196.0 | 3196.0 | 
| 861027 | Prospect Peak | Summit | NV | Elko | 41.7961632 | -115.3514744 | 3176.0 | 3176.0 | 
| 189182 | Lienhart Mine | Mine | CO | Chaffee | 38.9377726 | -106.2547437 | None | 3173.0 | 
| 189124 | Holywater Mine | Mine | CO | Chaffee | 38.8094397 | -106.358913 | 3173.0 | 3173.0 | 
| 1693838 | Eva Mine Complex | Mine | CO | Clear Creek | 39.6794316 | -105.7277832 | 3166.0 | 3166.0 | 
| 1600421 | Horse Pasture Draw | Valley | WY | Sublette | 42.913281 | -110.3380592 | None | 3151.0 | 
| 1599988 | Goosewing Guard Station | Locale | WY | Teton | 43.5629975 | -110.3032447 | None | 3151.0 | 
| 1599888 | Galena Basin | Basin | WY | Park | 43.905233 | -109.2923677 | 3151.0 | 3151.0 | 
| 177789 | Green Mountain | Summit | CO | Grand | 40.2961549 | -105.8221575 | 3138.0 | 3138.0 | 
| 181517 | Nugget | Populated Place | CO | Gilpin | 39.8635984 | -105.5883336 | 3128.0 | 3128.0 | 
| 1428713 | Highline Trail | Trail | UT | Duchesne | 40.688557 | -110.8173912 | 3120.0 | 3120.0 | 
| 199320 | Trio Dam | Dam | CO | Delta | 39.0466474 | -107.8356185 | 3108.0 | 3108.0 | 
| 862430 | Great Basin National Park | Park | NV | White Pine | 38.9460953 | -114.2579658 | 3102.0 | 3102.0 | 
| 173389 | Crater Lake | Lake | CO | Rio Blanco | 40.1400804 | -107.1207371 | 3097.0 | 3097.0 | 
| 1667345 | Little Shot Lake | Lake | CA | Fresno | 37.1252053 | -118.8260505 | 3096.0 | 3096.0 | 
| 189116 | East Willow Creek | Stream | CO | Gunnison | 38.7538837 | -106.4819717 | 3089.0 | 3089.0 | 
| 1684255 | Pacific-Gold Dust Mine | Mine | CO | Summit | 39.4685983 | -106.0244624 | 3089.0 | 3089.0 | 
| 175365 | Little Giant Reservoir Number 1 | Reservoir | CO | Delta | 39.0405469 | -107.7873713 | 3084.0 | 3084.0 | 
| 1724727 | Lexington Mine | Mine | CO | Teller | 38.7405442 | -105.1560918 | 3076.0 | 3076.0 | 
| 176038 | Dubach Meadows | Flat | CO | Eagle | 39.5402631 | -106.6630902 | 3074.0 | 3074.0 | 
| 917164 | Hidden Treasure Mine | Mine | NM | Colfax | 36.6114208 | -105.2411167 | 3068.0 | 3068.0 | 
| 396870 | Mount McGuire | Summit | ID | Lemhi | 45.174046 | -114.602137 | 3059.0 | 3059.0 | 
| 1684932 | Allied Gold Mining Company Mine | Mine | CO | Boulder | 39.9930424 | -105.576667 | 3049.0 | 3049.0 | 
| 176195 | Guller Creek | Stream | CO | Summit | 39.5035976 | -106.1758565 | 3035.0 | 3035.0 | 
| 176528 | McKinney Draw | Valley | CO | Dolores | 37.8663073 | -108.5713788 | None | 3035.0 | 
| 1445191 | Roundy Spring | Spring | UT | Garfield | 38.0396633 | -111.6903788 | 3025.0 | 3025.0 | 
| 1445316 | Sand Wash | Stream | UT | Garfield | 37.8838658 | -112.425208 | None | 3025.0 | 
| 199786 | Kendall Dam | Dam | CO | Mesa | 39.1149804 | -107.8072845 | 3024.0 | 3024.0 | 
| 1442905 | Long Flat | Flat | UT | Piute | 38.2683083 | -112.3552077 | 3011.0 | 3011.0 | 
| 1444807 | Red Lake | Lake | UT | Daggett | 40.7950226 | -109.8341217 | 3004.0 | 3004.0 | 
| 909123 | North Baldy | Summit | NM | Socorro | 34.0503013 | -107.1812222 | 3002.0 | 3002.0 | 
| 190348 | Rudifor Lake | Lake | CO | Rio Grande | 37.5242201 | -106.6722501 | 2997.0 | 2997.0 | 
| 176590 | Kilpacker Creek | Stream | CO | Dolores | 37.8233271 | -108.0511831 | 2995.0 | 2995.0 | 
| 261678 | Iceberg Lake | Lake | CA | Madera | 37.6704116 | -119.1691178 | 2981.0 | 2981.0 | 
| 1438313 | Avon | Populated Place | UT | Cache | 41.5329911 | -111.8132743 | None | 2978.0 | 
| 1438255 | Antimony Knoll | Summit | UT | Garfield | 38.0514094 | -111.815261 | 2978.0 | 2978.0 | 
| 1683679 | Majestic Mine | Mine | CO | Boulder | 40.1247079 | -105.5588892 | 2978.0 | 2978.0 | 
| 806032 | Imelda Lake | Lake | MT | Park | 45.1663594 | -109.9120372 | 2977.0 | 2977.0 | 
| 931531 | Geronimo Ski Trail | Trail | NM | Lincoln | 33.3973012 | -105.7885986 | 2976.0 | 2976.0 | 
| 930659 | Wild Onion Ski Trail | Trail | NM | Lincoln | 33.3973012 | -105.7885986 | 2976.0 | 2976.0 | 
| 1447716 | Manning Meadows Reservoir | Reservoir | UT | Piute | 38.4904107 | -112.0702195 | 2971.0 | 2971.0 | 
| 173999 | Deadhorse Spring | Spring | CO | Garfield | 39.6572051 | -107.2211646 | 2962.0 | 2962.0 | 
| 172337 | Lower Rainbow Lake | Lake | CO | Jackson | 40.6481244 | -106.6193546 | 2960.0 | 2960.0 | 
| 268442 | Trumbull Lake | Lake | CA | Mono | 38.0534252 | -119.2578319 | 2957.0 | 2957.0 | 
| 188507 | Phelps Cabin Creek | Stream | CO | Hinsdale | 38.1488844 | -107.1053286 | 2954.0 | 2954.0 | 
| 1446325 | Tall Four Reservoir | Reservoir | UT | Garfield | 37.9200023 | -111.8341652 | 2950.0 | 2950.0 | 
| 1685340 | Legion Mine | Mine | CO | Clear Creek | 39.7794318 | -105.6813922 | 2942.0 | 2942.0 | 
| 1609296 | Sepulcher Mountain | Summit | WY | Park | 44.9903881 | -110.7665775 | 2939.0 | 2939.0 | 
| 2664241 | Tucker Ponds | Lake | CO | Mineral | 37.49492 | -106.75954 | 2934.0 | 2934.0 | 
| 1724854 | Rose Maud Mine | Mine | CO | Teller | 38.7324889 | -105.1574808 | 2932.0 | 2932.0 | 
| 853139 | Puritan Mine | Mine | NV | Elko | 41.8485179 | -115.3800676 | 2910.0 | 2910.0 | 
| 186986 | McIntosh Mountain | Summit | CO | Gunnison | 38.5750862 | -107.014652 | 2905.0 | 2905.0 | 
| 1447736 | Farnsworth Dam | Dam | UT | Sevier | 38.7733041 | -111.6624129 | 2897.0 | 2897.0 | 
| 269289 | Woodchuck Country | Area | CA | Fresno | 37.0174429 | -118.8887203 | 2884.0 | 2884.0 | 
| 859080 | Buller Mountain | Summit | NV | Mineral | 38.3808139 | -118.6877054 | 2880.0 | 2880.0 | 
| 1606084 | Thompson Reservoir | Reservoir | WY | Carbon | 41.1983044 | -106.3655775 | 2878.0 | 2878.0 | 
| 183510 | The Castle | Pillar | CO | Jefferson | 39.3105454 | -105.3824978 | 2873.0 | 2873.0 | 
| 39068 | Lee Valley Dam | Dam | AZ | Apache | 33.9417151 | -109.4989794 | 2870.0 | 2870.0 | 
| 180170 | Crystal Lakes | Lake | CO | Lake | 39.1950626 | -106.3383877 | 2867.0 | 2867.0 | 
| 2724004 | Petrie Ditch | Canal | CO | Park | 39.3091177 | -105.8102555 | 2862.0 | 2862.0 | 
| 177295 | Hourglass Reservoir | Reservoir | CO | Larimer | 40.5793186 | -105.6319349 | 2859.0 | 2859.0 |