Lecture 14: GNIS¶

In [1]:
import numpy as np
import pandas as pd

Load the baseball database if you need to

In [2]:
!unzip -u ../../proj/proj2/data/baseball.zip -d data/
!psql postgresql://jovyan@127.0.0.1:5432/baseball -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database()  AND pid <> pg_backend_pid();'
!psql -h localhost -c 'DROP DATABASE IF EXISTS baseball'
!psql -h localhost -c 'CREATE DATABASE baseball'
!psql -h localhost -d baseball -f data/baseball.sql
!psql -h localhost -c 'SET max_parallel_workers_per_gather = 0;'
Archive:  ../../proj/proj2/data/baseball.zip
 pg_terminate_backend 
----------------------
(0 rows)

DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
... (CREATE TABLE / ALTER TABLE repeated for each of the 27 tables)
COPY 5219
COPY 104256
COPY 179
COPY 6236
COPY 425
COPY 6879
COPY 104324
COPY 13943
COPY 17350
COPY 138838
COPY 12028
COPY 31955
COPY 13110
COPY 4191
COPY 3040
COPY 3469
COPY 93
COPY 252
COPY 19370
COPY 45806
COPY 5445
COPY 26428
COPY 1207
COPY 325
COPY 2865
COPY 120
COPY 52
ALTER TABLE
... (ALTER TABLE repeated for each constraint)
SET

Scalar Functions and Query Plans¶

In [3]:
## we'll use the Lahman baseball database in our examples today.
## replace the database connection with a database of your own!
%reload_ext sql
%sql postgresql://localhost:5432/baseball
%config SqlMagic.displaylimit = 20
In [4]:
%%sql
WITH yearnum AS
  (SELECT yearid, (yearid % 100) as year
     FROM batting
  )
SELECT yearid, CONCAT('''', LPAD(year::text, 2, '0')) as year
  FROM yearnum
 LIMIT 5;
Running query in 'postgresql://localhost:5432/baseball'
5 rows affected.
Out[4]:
yearid year
1871 '71
1871 '71
1871 '71
1871 '71
1871 '71

Let's analyze the query below (we've flattened the CTE into a single query for convenience):

In [5]:
%%sql
EXPLAIN (VERBOSE true)
SELECT yearid,
       CONCAT('''', LPAD((yearid % 100)::text, 2, '0')) AS year
FROM batting;
Running query in 'postgresql://localhost:5432/baseball'
2 rows affected.
Out[5]:
QUERY PLAN
Seq Scan on public.batting (cost=0.00..3927.29 rows=104324 width=36)
  Output: yearid, concat('''', lpad(((yearid % 100))::text, 2, '0'::text))

What if scalar functions mention multiple tables?

The query below computes an arbitrary statistic for pitchers:

  • +1 point for every strikeout they throw as a pitcher
  • –1 point for every time they themselves strike out as a batter

If the notebook-like output is hard to read, try out the query in psql. Note that notebooks don't preserve whitespace when displaying dataframes.

In [6]:
%%sql
EXPLAIN (VERBOSE true)
SELECT p.playerid, p.so - b.so
  FROM pitching p
  INNER JOIN batting b
  ON p.playerid=b.playerid;
Running query in 'postgresql://localhost:5432/baseball'
12 rows affected.
Out[6]:
QUERY PLAN
Merge Join (cost=0.83..433476.50 rows=23893326 width=42)
  Output: p.playerid, (p.so - b.so)
  Merge Cond: ((p.playerid)::text = (b.playerid)::text)
  -> Index Scan using pitching_pkey on public.pitching p (cost=0.41..5271.51 rows=45806 width=42)
        Output: p.playerid, p.yearid, p.stint, p.teamid, p.lgid, p.w, p.l, p.g, p.gs, p.cg, p.sho, p.sv, p.ipouts, p.h, p.er, p.hr, p.bb, p.so, p.baopp, p.era, p.ibb, p.wp, p.hbp, p.bk, p.bfp, p.gf, p.r, p.sh, p.sf, p.gidp
  -> Materialize (cost=0.42..10218.09 rows=104324 width=42)
        Output: b.playerid, b.yearid, b.stint, b.teamid, b.lgid, b.g, b.ab, b.r, b.h, b.h2b, b.h3b, b.hr, b.rbi, b.sb, b.cs, b.bb, b.so, b.ibb, b.hbp, b.sh, b.sf, b.gidp
        -> Index Scan using batting_pkey on public.batting b (cost=0.42..9957.28 rows=104324 width=42)
              Output: b.playerid, b.yearid, b.stint, b.teamid, b.lgid, b.g, b.ab, b.r, b.h, b.h2b, b.h3b, b.hr, b.rbi, b.sb, b.cs, b.bb, b.so, b.ibb, b.hbp, b.sh, b.sf, b.gidp
JIT:
  Functions: 7
  Options: Inlining false, Optimization false, Expressions true, Deforming true
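
The plan above shows only the planner's estimates. To also execute the query and report actual row counts and timings next to those estimates, add ANALYZE. A sketch (note that this really runs the roughly 24-million-row join, so it can take a while):

In [ ]:
%%sql
EXPLAIN (ANALYZE, VERBOSE true)
SELECT p.playerid, p.so - b.so
  FROM pitching p
  INNER JOIN batting b
  ON p.playerid=b.playerid;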

Window Functions¶

In [7]:
%%sql
SELECT namefirst, namelast, yearid, HR,
       rank() OVER (ORDER BY HR DESC),
       avg(HR)    OVER (PARTITION BY b.playerid ORDER BY yearid ROWS 3 PRECEDING) as avg_3yr,
       lag(HR, 7) OVER (PARTITION BY b.playerid ORDER BY yearid) as previous,
       lag(HR, 2) OVER (PARTITION BY b.playerid ORDER BY yearid) as lag2
FROM batting b, people m
WHERE m.playerid = b.playerid
   AND (namelast = 'Bonds' or namelast = 'Ruth')
ORDER BY HR DESC
LIMIT 10;
Running query in 'postgresql://localhost:5432/baseball'
10 rows affected.
Out[7]:
namefirst namelast yearid hr rank avg_3yr previous lag2
Barry Bonds 2001 73 1 48.2500000000000000 37 34
Babe Ruth 1927 60 2 44.5000000000000000 54 25
Babe Ruth 1921 59 3 38.2500000000000000 0 29
Babe Ruth 1920 54 4 24.0000000000000000 None 11
Babe Ruth 1928 54 4 46.5000000000000000 59 47
Barry Bonds 2000 49 6 40.0000000000000000 46 37
Babe Ruth 1930 49 6 52.2500000000000000 41 54
Babe Ruth 1926 47 8 39.7500000000000000 29 46
Barry Bonds 1993 46 9 34.5000000000000000 16 25
Barry Bonds 2002 46 9 50.5000000000000000 33 49
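
A note on the frame clause: ROWS 3 PRECEDING is shorthand for ROWS BETWEEN 3 PRECEDING AND CURRENT ROW, so avg_3yr averages up to four rows, the current year plus the three before it (check: Bonds 2001 gives (34 + 37 + 49 + 73) / 4 = 48.25). A minimal sketch that spells the frame out:

In [ ]:
%%sql
SELECT playerid, yearid, HR,
       avg(HR) OVER (PARTITION BY playerid ORDER BY yearid
                     ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS avg_3yr_explicit
FROM batting
LIMIT 5;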

Inverse Distribution Window Functions¶

In [8]:
%%sql
SELECT MIN(HR),
       percentile_cont(0.25) WITHIN GROUP (ORDER BY HR) AS p25,
       percentile_cont(0.50) WITHIN GROUP (ORDER BY HR) AS median,
       percentile_cont(0.75) WITHIN GROUP (ORDER BY HR) AS p75,
       percentile_cont(0.99) WITHIN GROUP (ORDER BY HR) AS p99,
       MAX(HR),
       AVG(HR)
FROM batting;
Running query in 'postgresql://localhost:5432/baseball'
1 rows affected.
Out[8]:
min p25 median p75 p99 max avg
0 0.0 0.0 2.0 31.0 73 2.8315823779763046
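
percentile_cont interpolates between adjacent values, so it can return a value that never appears in the column. Its discrete cousin percentile_disc instead returns an actual value from the data; a quick comparison sketch:

In [ ]:
%%sql
SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY HR) AS cont_p75,
       percentile_disc(0.75) WITHIN GROUP (ORDER BY HR) AS disc_p75
FROM batting;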
In [9]:
%%sql
SELECT HR, COUNT(*) FROM batting GROUP BY HR ORDER BY HR;
Running query in 'postgresql://localhost:5432/baseball'
67 rows affected.
Out[9]:
hr count
0 63825
1 9953
2 5531
3 3806
4 2790
5 2212
6 1825
7 1533
8 1330
9 1147
10 949
11 861
12 868
13 685
14 654
15 578
16 549
17 474
18 460
19 346
Truncated to displaylimit of 20.

Hypothetical-Set Window Functions¶

In [10]:
hrs = 4 # hypothetically, four home runs
In [11]:
%%sql
SELECT {{hrs}} as hypothetical,
       rank({{hrs}}) WITHIN GROUP (ORDER BY HR DESC),
       dense_rank({{hrs}}) WITHIN GROUP (ORDER BY HR DESC),
       percent_rank({{hrs}}) WITHIN GROUP (ORDER BY HR DESC) * 100 AS pct_rank,
       cume_dist({{hrs}}) WITHIN GROUP (ORDER BY HR)
FROM batting
LIMIT 10;
Running query in 'postgresql://localhost:5432/baseball'
1 rows affected.
Out[11]:
hypothetical rank dense_rank pct_rank cume_dist
4 18420 63 17.655573022506807 0.823445962137551

Without jupysql variable substitution:

In [12]:
%%sql
SELECT 4 as hypothetical,
       rank(4) WITHIN GROUP (ORDER BY HR DESC),
       dense_rank(4) WITHIN GROUP (ORDER BY HR DESC),
       percent_rank(4) WITHIN GROUP (ORDER BY HR DESC) * 100 AS pct_rank,
       cume_dist(4) WITHIN GROUP (ORDER BY HR)
FROM batting
LIMIT 10;
Running query in 'postgresql://localhost:5432/baseball'
1 rows affected.
Out[12]:
hypothetical rank dense_rank pct_rank cume_dist
4 18420 63 17.655573022506807 0.823445962137551
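
Under the hood, the hypothetical rank is just a count: with ORDER BY HR DESC, rank(4) is one plus the number of rows whose HR is strictly greater than 4 (the rows that sort ahead of the hypothetical row). A minimal sketch of that equivalence:

In [ ]:
%%sql
-- one plus the number of existing rows with HR strictly greater than 4
SELECT 1 + COUNT(*) FILTER (WHERE HR > 4) AS manual_rank
FROM batting;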

GNIS¶

This notebook transforms the existing Geographic Names Information System (GNIS) national zip file.

We have provided a subset of the SQL database for you in data/gnis.sql.

If you'd like to make your own version of this database, see the end of this notebook. Note: Because of its size, we don't recommend building the GNIS SQL database from scratch on DataHub.

In [13]:
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis' 
!psql -h localhost -d gnis -f data/gnis.sql
 pg_terminate_backend 
----------------------
(0 rows)

DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 3195
COPY 11533
CREATE INDEX
In [14]:
%reload_ext sql
%sql postgresql://localhost:5432/gnis
%config SqlMagic.displaylimit = 15
%config SqlMagic.named_parameters=True
Connecting and switching to connection postgresql://localhost:5432/gnis
  • View schema in psql
  • View some rows below
In [15]:
%sql SELECT COUNT(*) FROM national;
Running query in 'postgresql://localhost:5432/gnis'
1 rows affected.
Out[15]:
count
11533
In [16]:
%sql SELECT * FROM national WHERE county_name = 'Alameda';
Running query in 'postgresql://localhost:5432/gnis'
21 rows affected.
Out[16]:
feature_id feature_name feature_class state_alpha state_numeric county_name county_numeric primary_lat_dms prim_long_dms prim_lat_dec prim_long_dec source_lat_dms source_long_dms source_lat_dec source_long_dec elev_in_m elev_in_ft map_name date_created date_edited
218316 Apperson Creek Stream CA 6 Alameda 1.0 373349N 1215000W 37.5635453 -121.8332887 373232N 1214804W 37.5422222 -121.8011111 148.0 486.0 La Costa Valley 01/19/1981 None
225998 Irvington High School School CA 6 Alameda 1.0 373126N 1215801W 37.523814 -121.9670659 None None None None 13.0 43.0 Niles 01/19/1981 03/31/2021
226951 Laurel Elementary School School CA 6 Alameda 1.0 374734N 1221147W 37.792899 -122.1964288 None None None None 68.0 223.0 Oakland East 06/14/2000 03/14/2021
229367 Murray Elementary School School CA 6 Alameda 1.0 374318N 1215557W 37.721801 -121.9326269 None None None None 112.0 367.0 Dublin 01/19/1981 03/14/2021
235581 Strawberry Creek Stream CA 6 Alameda 1.0 375221N 1221443W 37.8724258 -122.2452464 375251N 1221354W 37.8807588 -122.2316349 154.0 505.0 Oakland East 01/19/1981 08/31/2016
1654274 Hayward Golf Course Locale CA 6 Alameda 1.0 373726N 1220250W 37.6238222 -122.0471843 None None None None 5.0 16.0 Newark 01/19/1981 None
1664964 KOFY-AM (San Mateo) Tower CA 6 Alameda 1.0 374934N 1221842W 37.8260385 -122.3116366 None None None None 2.0 7.0 Oakland West 07/01/1994 None
1670278 Lake Elizabeth Lake CA 6 Alameda 1.0 373255N 1215742W 37.5487056 -121.9617554 None None None None 16.0 52.0 Niles 11/09/1995 03/07/2019
1692819 California School for the Deaf - Fremont School CA 6 Alameda 1.0 373334N 1215747W 37.5593966 -121.9631843 None None None None 20.0 66.0 Niles 05/08/1996 09/16/2016
1692863 J A Freitas Library Building CA 6 Alameda 1.0 374335N 1220925W 37.7263185 -122.1569101 None None None None 19.0 62.0 San Leandro 05/08/1996 None
1693010 Alice Arts Center Building CA 6 Alameda 1.0 374812N 1221557W 37.8032611 -122.2658025 None None None None 11.0 36.0 Oakland West 05/08/1996 None
1961816 New Pilgrim Congregational Church Church CA 6 Alameda 1.0 374622N 1221131W 37.7727061 -122.1919115 None None None None 21.0 69.0 Oakland East 09/11/2002 None
1961871 First Southern Baptist Church Church CA 6 Alameda 1.0 374657N 1221401W 37.7824282 -122.2335796 None None None None 11.0 36.0 Oakland East 09/11/2002 None
1962002 First Church of Christ Scientist Church CA 6 Alameda 1.0 374353N 1220931W 37.7313183 -122.1585769 None None None None 19.0 62.0 San Leandro 09/11/2002 None
2087391 First Presbyterian Church Church CA 6 Alameda 1.0 373254N 1220238W 37.5483333 -122.0438889 None None None None 7.0 23.0 Newark 05/22/2006 None
Truncated to displaylimit of 15.
In [17]:
%%sql
SELECT *
FROM national TABLESAMPLE BERNOULLI(10);
Running query in 'postgresql://localhost:5432/gnis'
1172 rows affected.
Out[17]:
feature_id feature_name feature_class state_alpha state_numeric county_name county_numeric primary_lat_dms prim_long_dms prim_lat_dec prim_long_dec source_lat_dms source_long_dms source_lat_dec source_long_dec elev_in_m elev_in_ft map_name date_created date_edited
4590 Fiftysix Tank Reservoir AZ 4 Gila 7.0 334124N 1102003W 33.6899872 -110.3340976 None None None None 1728.0 5669.0 Popcorn Canyon 02/08/1980 03/19/2019
15418 Kinnison Wash Arroyo AZ 4 Pima 19.0 321112N 1104908W 32.1867438 -110.818975 320757N 1104908W 32.1325 -110.8188889 822.0 2697.0 Tucson East 06/27/1984 None
22982 Hells Canyon Valley AZ 4 Mohave 15.0 351542N 1132600W 35.26167 -113.433266 351842N 1132756W 35.3116667 -113.4655556 1540.0 5052.0 Tuckayou Spring 06/27/1984 None
24797 Contempo Tempe Populated Place AZ 4 Maricopa 13.0 332325N 1115823W 33.3903237 -111.9729222 None None None None 358.0 1175.0 Tempe 06/27/1984 None
28519 Dogie Tank Reservoir AZ 4 Yavapai 25.0 345544N 1120004W 34.929 -112.0009754 None None None None 1417.0 4649.0 Sycamore Basin 02/08/1980 03/20/2019
30088 Honey Bee Tank Reservoir AZ 4 Yavapai 25.0 344839N 1121359W 34.8109492 -112.2331807 None None None None 1485.0 4872.0 Munds Draw 02/08/1980 03/20/2019
32752 Peach Orchard Spring Spring AZ 4 Gila 7.0 341245N 1112137W 34.2124602 -111.360398 None None None None 1415.0 4642.0 Payson South 02/08/1980 04/18/2011
42907 Stowe Gulch Valley AZ 4 Graham 9.0 325211N 1102156W 32.8697793 -110.3656905 325705N 1101804W 32.9514099 -110.3011057 1015.0 3330.0 Klondyke 02/08/1980 08/19/2021
46375 Belmont Church Church AR 5 Jefferson 69.0 340724N 0914852W 34.1234323 -91.8145769 None None None None 58.0 190.0 Tarry 04/30/1980 None
48317 Cypress Pocket Swamp AR 5 Pulaski 119.0 343036N 0920405W 34.5100652 -92.0681112 None None None None 69.0 226.0 Keo 04/30/1980 07/23/2019
49578 Grassy Lake Swamp AR 5 Hempstead 57.0 334119N 0935209W 33.6887443 -93.8693011 None None None None 78.0 256.0 McNab 04/30/1980 07/23/2019
53536 Pleasant Grove Church Church AR 5 Calhoun 13.0 333216N 0922147W 33.5378906 -92.3629271 None None None None 57.0 187.0 Banks 04/30/1980 None
54087 Rock Church Church AR 5 Madison 87.0 361629N 0935219W 36.2747962 -93.8718629 None None None None 455.0 1493.0 Sandstone Mountain 04/30/1980 None
55079 Spring Branch Stream AR 5 Miller 91.0 331657N 0935803W 33.2826281 -93.9674054 331833N 0935702W 33.3091667 -93.9505556 62.0 203.0 Fouke 04/30/1980 None
55132 Springfield Church Church AR 5 Pulaski 119.0 343630N 0920521W 34.6084264 -92.0890328 None None None None 72.0 236.0 Keo 04/30/1980 None
Truncated to displaylimit of 15.
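
TABLESAMPLE BERNOULLI(10) keeps each row independently with probability 10%, so the number of rows returned varies from run to run. For a reproducible sample, Postgres accepts a REPEATABLE seed; a minimal sketch:

In [ ]:
%%sql
SELECT COUNT(*)
FROM national TABLESAMPLE BERNOULLI(10) REPEATABLE (42);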

Numerical Granularity¶

In [18]:
%sql SELECT elev_in_m FROM National LIMIT 2;
Running query in 'postgresql://localhost:5432/gnis'
2 rows affected.
Out[18]:
elev_in_m
931.0
2707.0

You can't round by simply keeping the top few digits, as we see in this example with decimal digits.

The substring-and-concat version doesn't do what one wants (look at what happens to 275.0), while dividing and then multiplying back does. This is the effect we want to achieve in binary as well (bit shift right, then left).

In [19]:
%%sql
SELECT elev_in_m, 
    (elev_in_m / 100) :: INTEGER AS quantized,
    ((elev_in_m / 100) :: INTEGER) * 100 AS round_to_100,
    SUBSTRING(elev_in_m :: TEXT, 1, 2),
    CONCAT(SUBSTRING(elev_in_m :: TEXT, 1, 2), '00') AS substring2
FROM National
LIMIT 5;
Running query in 'postgresql://localhost:5432/gnis'
5 rows affected.
Out[19]:
elev_in_m quantized round_to_100 substring substring2
931.0 9 900 93 9300
2707.0 27 2700 27 2700
275.0 3 300 27 2700
1685.0 17 1700 16 1600
1354.0 14 1400 13 1300

Shift right and then left to shave off a few low-order bits, rounding values down to a coarser granularity.

In [ ]:
%%sql
/* does not run in jupysql because of the << operator; copy into psql */
SELECT elev_in_m,
    (elev_in_m::INTEGER::BIT(12)) AS bit12, 
    (elev_in_m::INTEGER::BIT(12)) >> 8 AS shiftright, 
    ((elev_in_m::INTEGER::BIT(12)) >> 8) << 8 AS shiftrightthenleft
FROM national
LIMIT 5;
In [ ]:
%%sql
/* does not run in jupysql because of the << operator; copy into psql */
WITH shifts AS (
SELECT elev_in_m::integer, 
       (elev_in_m::integer::bit(12))      AS bit12, 
       (elev_in_m::integer::bit(12) >> 8) AS rightshifted, 
       ((elev_in_m::integer::bit(12) >> 8) << 8)::integer AS round_to_256
  FROM national
  WHERE elev_in_m >= 0
)
SELECT COUNT(DISTINCT elev_in_m::integer)    AS elevation_meters_count,
       COUNT(DISTINCT bit12)        AS bit12_count,
       COUNT(DISTINCT rightshifted) AS rightshift_count,
       COUNT(DISTINCT round_to_256) AS rounded_count
  FROM shifts;
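
If you want to stay inside jupysql, arithmetic gives the same result for nonnegative integers: shifting right and then left by 8 bits is integer division by 2^8 = 256 followed by multiplying back. A sketch:

In [ ]:
%%sql
-- integer division truncates, matching >> 8 then << 8 for nonnegative values
SELECT elev_in_m,
       (elev_in_m::integer / 256) * 256 AS round_to_256
FROM national
WHERE elev_in_m >= 0
LIMIT 5;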

Demo 1: Roll-up / Drill-down Practice¶

Let's start with county-level data on elevations:

In [20]:
%%sql
SELECT state_numeric, county_numeric,
       avg(elev_in_m),
       stddev(elev_in_m), count(*)
FROM national TABLESAMPLE BERNOULLI(10)
GROUP BY state_numeric, county_numeric;
Running query in 'postgresql://localhost:5432/gnis'
858 rows affected.
Out[20]:
state_numeric county_numeric avg stddev count
5 45.0 109.0 None 1
41 43.0 79.0 None 1
12 1.0 43.0 None 1
41 51.0 63.0 46.64046883698033 4
29 135.0 266.0 None 1
46 67.0 381.0 None 1
27 137.0 453.0 None 1
19 189.0 360.0 None 1
54 7.0 271.0 None 1
45 51.0 8.0 None 1
20 59.0 269.0 None 1
24 25.0 None None 1
64 2.0 75.66666666666667 131.05851110604505 3
35 43.0 2424.0 26.870057685088806 2
53 17.0 722.0 None 1
Truncated to displaylimit of 15.

Roll up to state level.

  • We save the view as state_elevations for later...
In [21]:
%%sql
DROP VIEW IF EXISTS state_elevations;

CREATE VIEW state_elevations AS
SELECT state_numeric,
       avg(elev_in_m),
       stddev(elev_in_m), count(*)
FROM national 
GROUP BY state_numeric
;
Running query in 'postgresql://localhost:5432/gnis'
Out[21]:
In [22]:
%sql SELECT * FROM state_elevations;
Running query in 'postgresql://localhost:5432/gnis'
59 rows affected.
Out[22]:
state_numeric avg stddev count
54 363.6190476190476 199.26650831834746 204
29 246.09152542372883 80.2483078596168 343
68 6.666666666666667 7.99166232186187 14
4 1315.3798076923076 672.6305522946129 208
34 40.08943089430894 59.88896941733248 123
51 254.55197132616487 260.54513270095333 283
70 18.333333333333332 31.75426480542942 3
10 22.11111111111111 28.015563440198648 27
35 1756.8467432950192 471.8002505531821 273
45 122.83240223463687 123.96059930539184 181
6 516.7286432160804 684.895132373804 599
39 262.80258899676375 57.78124212343727 328
69 90.66666666666667 82.82189726555814 6
36 217.14150943396226 197.0201865875975 424
31 628.2797202797203 293.8114851587473 148
Truncated to displaylimit of 15.

Drill down to include feature class.

In [23]:
%%sql
SELECT state_numeric, feature_class,
       avg(elev_in_m),
       stddev(elev_in_m), count(*)
FROM national TABLESAMPLE Bernoulli(10)
GROUP BY state_numeric, feature_class
ORDER BY count(*) DESC;
Running query in 'postgresql://localhost:5432/gnis'
552 rows affected.
Out[23]:
state_numeric feature_class avg stddev count
17 School 188.63636363636363 16.144799331504416 11
6 School 184.9 231.5025557814283 10
6 Valley 405.8888888888889 276.0278266970762 9
48 Stream 282.0 275.331164236815 9
48 Church 176.125 121.4912548293086 8
42 Populated Place 224.125 131.23200556919903 8
36 Building 166.25 159.79696939733066 8
30 Well 1298.875 275.6905291393646 8
13 Church 176.71428571428572 145.6476504968003 7
51 Church 141.28571428571428 97.23804859846837 7
21 Stream 257.0 62.27358990776106 7
41 Locale 507.14285714285717 696.2888357981499 7
48 School 339.8 432.5369348390956 7
48 Building 282.7142857142857 401.2770091784952 7
6 Locale 862.5714285714286 839.1779424219985 7
Truncated to displaylimit of 15.
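
As an aside, Postgres can compute several levels of a roll-up in a single query with GROUP BY ROLLUP, where NULLs mark the rolled-up (subtotal) rows; a minimal sketch over the same data:

In [ ]:
%%sql
SELECT state_numeric, county_numeric,
       avg(elev_in_m), stddev(elev_in_m), count(*)
FROM national
GROUP BY ROLLUP (state_numeric, county_numeric)
LIMIT 10;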

Demo 2: Connections to Statistics¶

Roll up with marginal distributions¶

In [24]:
%%sql
SELECT state_numeric,
       AVG(elev_in_m),
       STDDEV(elev_in_m), COUNT(*),
       SUM(COUNT(*)) OVER () AS total,
       COUNT(*)/SUM(COUNT(*)) OVER () AS marginal
FROM national TABLESAMPLE Bernoulli(.07)
GROUP BY state_numeric;
Running query in 'postgresql://localhost:5432/gnis'
10 rows affected.
Out[24]:
state_numeric avg stddev count total marginal
4 367.0 None 1 12 0.08333333333333333333
6 885.0 None 1 12 0.08333333333333333333
12 13.0 None 1 12 0.08333333333333333333
13 114.0 None 1 12 0.08333333333333333333
18 238.0 96.16652224137046 2 12 0.16666666666666666667
21 279.0 None 1 12 0.08333333333333333333
28 144.0 None 1 12 0.08333333333333333333
41 182.5 136.47160876900367 2 12 0.16666666666666666667
42 391.0 None 1 12 0.08333333333333333333
51 0.0 None 1 12 0.08333333333333333333
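
The same window trick gives conditional rather than marginal distributions if we partition the total; a sketch of the distribution of feature_class within each state:

In [ ]:
%%sql
SELECT state_numeric, feature_class, COUNT(*),
       COUNT(*) / SUM(COUNT(*)) OVER (PARTITION BY state_numeric) AS conditional
FROM national
GROUP BY state_numeric, feature_class
LIMIT 10;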
In [25]:
%%sql
SELECT COUNT(DISTINCT county_numeric) FROM national;
Running query in 'postgresql://localhost:5432/gnis'
1 rows affected.
Out[25]:
count
291

Drill down with normally-distributed elevations:¶

Start with the state_elevations view from earlier:

In [28]:
%sql SELECT * FROM state_elevations;
Running query in 'postgresql://localhost:5432/gnis'
59 rows affected.
Out[28]:
state_numeric avg stddev count
54 363.6190476190476 199.26650831834746 204
29 246.09152542372883 80.2483078596168 343
68 6.666666666666667 7.99166232186187 14
4 1315.3798076923076 672.6305522946129 208
34 40.08943089430894 59.88896941733248 123
51 254.55197132616487 260.54513270095333 283
70 18.333333333333332 31.75426480542942 3
10 22.11111111111111 28.015563440198648 27
35 1756.8467432950192 471.8002505531821 273
45 122.83240223463687 123.96059930539184 181
6 516.7286432160804 684.895132373804 599
39 262.80258899676375 57.78124212343727 328
69 90.66666666666667 82.82189726555814 6
36 217.14150943396226 197.0201865875975 424
31 628.2797202797203 293.8114851587473 148
Truncated to displaylimit of 15.

The fips_counties relation has all counties, including those not in national:

In [26]:
%sql SELECT * FROM fips_counties LIMIT 10;
Running query in 'postgresql://localhost:5432/gnis'
10 rows affected.
Out[26]:
fips county state_numeric
1000 Alabama 1
1001 Autauga County 1
1003 Baldwin County 1
1005 Barbour County 1
1007 Bibb County 1
1009 Blount County 1
1011 Bullock County 1
1013 Butler County 1
1015 Calhoun County 1
1017 Chambers County 1

If we wanted to drill down to the FIPS counties, we'd need to simulate an elevation for those counties that don't exist in national.

Here's the first step in that process, which generates simulated elevation values for every county in fips_counties.

  • The value is simulated from a normal distribution using that state's elevation statistics (average, standard deviation).
  • Much like importing a Python package, we need to load the tablefunc extension in order to use its normal_rand function.
In [27]:
%sql CREATE EXTENSION IF NOT EXISTS tablefunc;
Running query in 'postgresql://localhost:5432/gnis'
Out[27]:
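
Before using it in the big query below, a minimal sketch of the function itself: normal_rand(n, mean, stddev) returns n draws from the given normal distribution (your values will differ, since they're random):

In [ ]:
%%sql
SELECT n FROM normal_rand(5, 100.0, 15.0) AS n;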
In [28]:
%%sql
WITH state_cty AS
(SELECT s.state_numeric, f.fips as county_numeric, s.avg, s.stddev, s.count
  FROM state_elevations s, fips_counties f
  WHERE s.state_numeric = f.state_numeric
)
SELECT s.*,
       n.n AS elev_in_m,
       true as elev_in_m_sim -- user-facing flag
  FROM state_cty s,
       LATERAL normal_rand(CAST(s.count AS INTEGER), s.avg, s.stddev) AS n
LIMIT 10;
Running query in 'postgresql://localhost:5432/gnis'
10 rows affected.
Out[28]:
state_numeric county_numeric avg stddev count elev_in_m elev_in_m_sim
1 1000 146.37888198757764 102.92185851771194 339 86.38931448068088 True
1 1000 146.37888198757764 102.92185851771194 339 241.10131080357328 True
1 1000 146.37888198757764 102.92185851771194 339 232.5231081945127 True
1 1000 146.37888198757764 102.92185851771194 339 46.359022968780025 True
1 1000 146.37888198757764 102.92185851771194 339 135.80239287281742 True
1 1000 146.37888198757764 102.92185851771194 339 97.00680510236256 True
1 1000 146.37888198757764 102.92185851771194 339 107.08312694186878 True
1 1000 146.37888198757764 102.92185851771194 339 204.88908230500292 True
1 1000 146.37888198757764 102.92185851771194 339 202.72662605310643 True
1 1000 146.37888198757764 102.92185851771194 339 119.32389233384822 True

Assembling an Explicit Hierarchy¶

In [29]:
## we'll use the Lahman baseball database in our initial examples today.
## replace the database connection with a database of your own!
%reload_ext sql
%sql postgresql://localhost:5432/baseball
Switching to connection postgresql://localhost:5432/baseball

Two relations have the pieces of the hierarchy we want:

In [30]:
%sql SELECT * FROM Appearances WHERE yearid > 1970 LIMIT 2;
Running query in 'postgresql://localhost:5432/baseball'
2 rows affected.
Out[30]:
yearid teamid lgid playerid g_all gs g_batting g_defense g_p g_c g_1b g_2b g_3b g_ss g_lf g_cf g_rf g_of g_dh g_ph g_pr
1971 ATL NL aaronha01 139 129 139 129 0 0 71 0 0 0 0 0 60 60 0 10 0
1971 ATL NL aaronto01 25 10 25 18 0 0 11 0 7 0 0 0 0 0 0 8 0
In [31]:
%sql SELECT * FROM Teams LIMIT 1;
Running query in 'postgresql://localhost:5432/baseball'
1 rows affected.
Out[31]:
yearid lgid teamid franchid divid rank g ghome w l divwin wcwin lgwin wswin r ab h h2b h3b hr bb so sb cs hbp sf ra er era cg sho sv ipouts ha hra bba soa e dp fp name park attendance bpf ppf teamidbr teamidlahman45 teamidretro
1871 NA BS1 BNA None 3 31 None 20 10 None None N None 401 1372 426 70 37 3 60 19 73 16 None None 303 109 3.55 22 1 3 828 367 2 42 23 243 24 0.834 Boston Red Stockings South End Grounds I None 103 98 BOS BS1 BS1

Let's join these two to build our hierarchy! Which way should we construct it?

In [32]:
%%sql
SELECT a.playerid, a.teamid, t.divid, a.*
  FROM Appearances a
  NATURAL JOIN Teams t
WHERE a.yearid = 2015
LIMIT 100;
Running query in 'postgresql://localhost:5432/baseball'
100 rows affected.
Out[32]:
playerid teamid divid yearid teamid_1 lgid playerid_1 g_all gs g_batting g_defense g_p g_c g_1b g_2b g_3b g_ss g_lf g_cf g_rf g_of g_dh g_ph g_pr
alvarda02 BAL E 2015 BAL AL alvarda02 12 10 12 12 0 0 0 0 0 0 0 1 12 12 0 0 0
brachbr01 BAL E 2015 BAL AL brachbr01 62 0 5 62 62 0 0 0 0 0 0 0 0 0 0 0 0
brittza01 BAL E 2015 BAL AL brittza01 64 0 2 64 64 0 0 0 0 0 0 0 0 0 0 0 0
cabrace01 BAL E 2015 BAL AL cabrace01 2 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0
cabreev01 BAL E 2015 BAL AL cabreev01 29 28 29 28 0 0 0 2 0 27 0 0 0 0 0 0 1
chenwe02 BAL E 2015 BAL AL chenwe02 31 31 0 31 31 0 0 0 0 0 0 0 0 0 0 0 0
clevest01 BAL E 2015 BAL AL clevest01 30 24 30 10 0 9 1 0 0 0 0 0 0 0 18 4 0
davisch02 BAL E 2015 BAL AL davisch02 160 159 160 138 0 0 111 0 0 0 0 0 30 30 22 0 0
deazaal01 BAL E 2015 BAL AL deazaal01 30 27 30 27 0 0 0 0 0 0 19 0 13 27 0 3 0
drakeol01 BAL E 2015 BAL AL drakeol01 13 0 1 13 13 0 0 0 0 0 0 0 0 0 0 0 0
flahery01 BAL E 2015 BAL AL flahery01 91 81 91 90 0 0 11 56 8 15 1 0 5 6 0 2 3
garcija03 BAL E 2015 BAL AL garcija03 21 0 3 21 21 0 0 0 0 0 0 0 0 0 0 0 0
gausmke01 BAL E 2015 BAL AL gausmke01 25 17 1 25 25 0 0 0 0 0 0 0 0 0 0 0 0
givenmy01 BAL E 2015 BAL AL givenmy01 22 0 1 22 22 0 0 0 0 0 0 0 0 0 0 0 0
gonzami03 BAL E 2015 BAL AL gonzami03 26 26 1 26 26 0 0 0 0 0 0 0 0 0 0 0 0
Truncated to displaylimit of 15.
In [33]:
%%sql
CREATE OR REPLACE VIEW bball_tree AS
SELECT DISTINCT a.playerid, a.teamid, t.divid, a.lgid, a.yearid
  FROM Appearances a NATURAL JOIN Teams t;
Running query in 'postgresql://localhost:5432/baseball'
Out[33]:
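
A quick spot-check of the new view (a player traded mid-season can legitimately appear under more than one team in the same year):

In [ ]:
%%sql
SELECT * FROM bball_tree
WHERE yearid = 2015
LIMIT 5;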

Revisiting the Home Run Query¶

Recall our old home run query:

In [34]:
%%sql
SELECT namefirst, namelast, yearid,
       MIN(hr), MAX(hr), AVG(hr), STDDEV(hr), SUM(hr)
FROM batting b, people p
WHERE b.playerid = p.playerid
GROUP BY namelast, namefirst, yearid
ORDER BY max DESC
LIMIT 10;
Running query in 'postgresql://localhost:5432/baseball'
10 rows affected.
Out[34]:
namefirst namelast yearid min max avg stddev sum
Barry Bonds 2001 73 73 73.0000000000000000 None 73
Mark McGwire 1998 70 70 70.0000000000000000 None 70
Sammy Sosa 1998 66 66 66.0000000000000000 None 66
Mark McGwire 1999 65 65 65.0000000000000000 None 65
Sammy Sosa 2001 64 64 64.0000000000000000 None 64
Sammy Sosa 1999 63 63 63.0000000000000000 None 63
Roger Maris 1961 61 61 61.0000000000000000 None 61
Babe Ruth 1927 60 60 60.0000000000000000 None 60
Babe Ruth 1921 59 59 59.0000000000000000 None 59
Giancarlo Stanton 2017 59 59 59.0000000000000000 None 59

Set up for roll-up/drill-down on the bball_tree hierarchy:

  • Join each (raw) batting record with the associated bball_tree entry on (playerid, yearid) in a CTE.
  • Use this result for roll-up and drill-down (a roll-up sketch follows the query below).

In [35]:
%%sql
WITH batting_tree AS
(
    SELECT b.*, t.divid
    FROM batting b, bball_tree t
    WHERE b.playerid = t.playerid
      AND b.yearid = t.yearid
)
SELECT namefirst, namelast,
       bt.teamid, bt.lgid, bt.divid, bt.yearid,
       MIN(hr), MAX(hr), AVG(hr), STDDEV(hr), SUM(hr)
FROM batting_tree bt, people p
WHERE bt.playerid = p.playerid
GROUP BY bt.playerid, bt.teamid, bt.lgid, bt.divid, bt.yearid, namelast, namefirst
ORDER BY max DESC
LIMIT 10;
Running query in 'postgresql://localhost:5432/baseball'
10 rows affected.
Out[35]:
namefirst namelast teamid lgid divid yearid min max avg stddev sum
Barry Bonds SFN NL W 2001 73 73 73.0000000000000000 None 73
Mark McGwire SLN NL C 1998 70 70 70.0000000000000000 None 70
Sammy Sosa CHN NL C 1998 66 66 66.0000000000000000 None 66
Mark McGwire SLN NL C 1999 65 65 65.0000000000000000 None 65
Sammy Sosa CHN NL C 2001 64 64 64.0000000000000000 None 64
Sammy Sosa CHN NL C 1999 63 63 63.0000000000000000 None 63
Roger Maris NYA AL None 1961 61 61 61.0000000000000000 None 61
Babe Ruth NYA AL None 1927 60 60 60.0000000000000000 None 60
Babe Ruth NYA AL None 1921 59 59 59.0000000000000000 None 59
Giancarlo Stanton MIA NL E 2017 59 59 59.0000000000000000 None 59
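
To roll up, reuse the same CTE but group higher in the hierarchy, for example at the (lgid, divid, yearid) level; a sketch:

In [ ]:
%%sql
WITH batting_tree AS
(
    SELECT b.*, t.divid
    FROM batting b, bball_tree t
    WHERE b.playerid = t.playerid
      AND b.yearid = t.yearid
)
SELECT bt.lgid, bt.divid, bt.yearid,
       MIN(hr), MAX(hr), AVG(hr), STDDEV(hr), SUM(hr)
FROM batting_tree bt
GROUP BY bt.lgid, bt.divid, bt.yearid
ORDER BY max DESC
LIMIT 10;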

[Extra] Load in the database from scratch¶

We download the database, unzip it, load it into pandas, then export to a new database via jupysql cell magic.

CAUTION: This may crash your DataHub instance; the file is pretty big.

The direct zip download of this file is the NationalFile.zip URL used in the wget command below.

In [ ]:
# first download and unzip the data
!mkdir -p data
!wget https://geonames.usgs.gov/docs/stategaz/NationalFile.zip -P data/
!unzip -u data/NationalFile.zip -d data/
In [ ]:
import os
fname = os.path.join("data", "NationalFile_20210825.txt")
fname
In [ ]:
!du -h {fname} # big file
In [ ]:
!head -c 1024 {fname}
In [ ]:
# next, load it into pandas
import pandas as pd

national = pd.read_csv("data/NationalFile_20210825.txt", delimiter="|")
national.head(2)
In [ ]:
national = national.rename(columns=dict([(col, col.lower().strip()) for col in national.columns]))
national.head(2)

Next, get a table sample in pandas.

In [ ]:
import numpy as np

p = 0.005 # fraction, not percentage

np.random.seed(42)
national['keep_bool'] = np.random.random(len(national)) < p
national['keep_bool'].value_counts()
In [ ]:
national = national[national['keep_bool']].drop(columns=['keep_bool'])
national

Now, export to SQL

In [ ]:
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis' 
In [ ]:
%reload_ext sql
%sql postgresql://127.0.0.1:5432/gnis
In [ ]:
%sql --persist-replace national
In [ ]:
%sql ALTER TABLE national DROP COLUMN index;

Now, export to a file with pg_dump

In [ ]:
!pg_dump -h localhost --encoding utf8 gnis -f data/gnis.sql 

Finally, run the beginning of this notebook again.

In [ ]:
!du -h data/gnis.sql

FIPS¶

Federal Information Processing System (FIPS) Codes for States and Counties

Manually download the file from this link (https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt) and save it in data/.

  • wget does not work here; likely the FCC website only accepts HTTPS connections, to deter server attacks.
In [ ]:
!wget https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt -P data/
In [ ]:
import pandas as pd
import re
In [ ]:
with open('data/fips.txt', 'r') as f:
    lines = f.readlines()
In [ ]:
COUNTY_STARTS = 69
OFFSET = 3 # the start of the data itself, after headers
In [ ]:
re.match(r'\s+(\d+)\s+(\w+)', lines[COUNTY_STARTS + OFFSET]).groups()
In [ ]:
splits = [re.match(r'\s+(\d+)\s+(.*)', line).groups()
          for line in lines[COUNTY_STARTS + OFFSET:]]
splits[0]

For later: There is a significant discrepancy between the number of counties created and the number of lines remaining in our dataset. We encourage you to investigate this!

In [ ]:
len(lines), len(splits)

FIPS codes are numbers which uniquely identify geographic areas. The number of digits in a FIPS code varies with the level of geography: state-level FIPS codes have two digits, and county-level FIPS codes have five digits, of which the first two are the FIPS code of the state to which the county belongs. When using the list below to look up county FIPS codes, it is advisable to first look up the FIPS code for the state to which the county belongs. This will help you identify the right section of the list while scrolling down, which can be important since there are over 3000 counties and county-equivalents (e.g. independent cities, parishes, boroughs) in the United States.

In [ ]:
fips_counties = pd.DataFrame(data=splits, columns=['fips', 'county'])
fips_counties['state_numeric'] = fips_counties['fips'].str[:2].astype(int)
fips_counties['fips'] = fips_counties['fips'].astype(int)
fips_counties = fips_counties.set_index('fips')
fips_counties
In [ ]:
%reload_ext sql
%sql postgresql://127.0.0.1:5432/gnis
In [ ]:
%sql --persist-replace fips_counties
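
With the table persisted, the digit structure described above can be checked in SQL directly: a county FIPS code has five digits whose leading two are the state code, so integer division by 1000 recovers state_numeric. A sketch:

In [ ]:
%%sql
SELECT fips, county, fips / 1000 AS state_from_fips
FROM fips_counties
LIMIT 5;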

Now, export to a file with pg_dump. This exports both the national and fips_counties relations to the same gnis.sql database dump.

In [ ]:
!pg_dump -h localhost --encoding utf8 gnis -f data/gnis.sql 

Finally, run the beginning of this notebook again.

In [ ]:
!du -h data/gnis.sql