Lecture 12: Data Preparation I¶

In [2]:
# to save space, do a symbolic link to previous lecture's data
!ln -sf ../lec11/data .

[Review] Structural Transformation: From Relations to Matrices and Back¶

  • Matrix $\rightarrow$ Relational works.
  • Relational $\rightarrow$ Matrix sometimes works!
  • But how?

To start, let's take our matrix in mm.txt and load it into Pandas.

In [2]:
import pandas as pd
In [4]:
mm = pd.read_csv('data/mm.txt', header=0)
mm
Out[4]:
Year OCT NOV DEC JAN FEB MAR APR MAY JUN JUL AUG SEP
0 2002 545.92 3115.08 3996.76 1815.74 1204.14 1644.02 795.92 540.24 112.62 79.52 22.20 171.70
1 2003 55.41 1242.23 2976.94 797.72 836.01 1026.11 1571.27 468.59 24.93 98.33 267.40 99.20
2 2004 55.90 834.40 2311.72 942.75 2019.22 399.52 339.18 251.64 72.38 55.57 116.74 97.48
3 2006 347.22 908.44 2981.16 1793.97 995.27 2031.19 1602.55 287.21 102.44 90.31 18.75 33.76
4 2005 1449.23 619.77 1789.93 1777.23 1055.41 1472.91 743.11 1113.26 309.19 46.61 86.51 93.98
5 2007 178.81 942.89 1279.33 320.66 1615.47 317.77 519.85 150.15 85.32 102.60 62.74 164.02
6 2008 612.92 329.64 1189.43 2153.17 1007.57 316.55 153.53 255.80 42.45 41.73 36.83 12.59
7 2009 272.27 777.81 1102.87 533.64 1479.80 881.30 297.80 580.28 320.84 35.95 53.25 42.99
8 2010 722.61 379.36 1029.03 1780.54 1136.93 814.83 1225.52 487.75 146.90 38.05 46.77 88.72
9 2011 1059.68 1016.81 2555.51 463.27 1059.94 2173.84 465.80 552.36 350.04 84.51 21.59 76.77
10 2020 56.21 311.24 1202.38 578.53 70.34 789.14 509.50 375.76 70.86 17.71 25.99 21.65
11 2012 527.67 523.53 156.69 1066.21 421.75 1643.46 828.51 115.19 164.73 98.85 92.70 45.65
12 2014 119.18 281.26 178.54 233.12 1314.08 978.52 376.71 179.59 42.21 96.86 163.17 244.05
13 2013 376.65 1131.15 1759.84 338.40 189.12 391.51 259.37 254.56 142.22 72.63 96.91 343.69
14 2015 342.81 551.60 1654.38 149.87 860.72 243.26 343.53 338.50 109.07 201.23 49.52 84.94
15 2016 305.11 635.79 1530.09 1754.74 367.08 1395.37 427.64 245.77 110.11 37.56 28.19 52.88
16 2017 1175.34 581.67 1155.19 2628.17 2110.15 751.90 825.07 153.14 118.32 53.72 72.88 156.63
17 2018 130.64 967.23 118.69 847.21 201.54 1540.98 578.42 201.33 43.35 68.03 22.84 14.06
18 2019 199.39 754.07 585.14 1282.06 2261.94 1090.67 442.48 743.01 48.86 25.14 48.34 208.95

[Review] What does an unpivot look like (Matrix -> Relational)?¶

In [5]:
mm_melted = mm.melt(id_vars=['Year'])
mm_melted
Out[5]:
Year variable value
0 2002 OCT 545.92
1 2003 OCT 55.41
2 2004 OCT 55.90
3 2006 OCT 347.22
4 2005 OCT 1449.23
... ... ... ...
223 2015 SEP 84.94
224 2016 SEP 52.88
225 2017 SEP 156.63
226 2018 SEP 14.06
227 2019 SEP 208.95

228 rows × 3 columns

Thanks to the id_vars parameter, the Year column is kept as an identifier and repeated for every (variable=column name, value=value) pair produced from the other columns.

In [ ]:
mm_melted[mm_melted['Year'] == 2002]

PIVOT(UNPIVOT) = ??¶

In [6]:
# how do we get back to something that resembles the original matrix?
# mm_melted.pivot(index='variable', columns='Year') # A
mm_melted.pivot(index='Year', columns='variable')#.reset_index() #B
Out[6]:
value
variable APR AUG DEC FEB JAN JUL JUN MAR MAY NOV OCT SEP
Year
2002 795.92 22.20 3996.76 1204.14 1815.74 79.52 112.62 1644.02 540.24 3115.08 545.92 171.70
2003 1571.27 267.40 2976.94 836.01 797.72 98.33 24.93 1026.11 468.59 1242.23 55.41 99.20
2004 339.18 116.74 2311.72 2019.22 942.75 55.57 72.38 399.52 251.64 834.40 55.90 97.48
2005 743.11 86.51 1789.93 1055.41 1777.23 46.61 309.19 1472.91 1113.26 619.77 1449.23 93.98
2006 1602.55 18.75 2981.16 995.27 1793.97 90.31 102.44 2031.19 287.21 908.44 347.22 33.76
2007 519.85 62.74 1279.33 1615.47 320.66 102.60 85.32 317.77 150.15 942.89 178.81 164.02
2008 153.53 36.83 1189.43 1007.57 2153.17 41.73 42.45 316.55 255.80 329.64 612.92 12.59
2009 297.80 53.25 1102.87 1479.80 533.64 35.95 320.84 881.30 580.28 777.81 272.27 42.99
2010 1225.52 46.77 1029.03 1136.93 1780.54 38.05 146.90 814.83 487.75 379.36 722.61 88.72
2011 465.80 21.59 2555.51 1059.94 463.27 84.51 350.04 2173.84 552.36 1016.81 1059.68 76.77
2012 828.51 92.70 156.69 421.75 1066.21 98.85 164.73 1643.46 115.19 523.53 527.67 45.65
2013 259.37 96.91 1759.84 189.12 338.40 72.63 142.22 391.51 254.56 1131.15 376.65 343.69
2014 376.71 163.17 178.54 1314.08 233.12 96.86 42.21 978.52 179.59 281.26 119.18 244.05
2015 343.53 49.52 1654.38 860.72 149.87 201.23 109.07 243.26 338.50 551.60 342.81 84.94
2016 427.64 28.19 1530.09 367.08 1754.74 37.56 110.11 1395.37 245.77 635.79 305.11 52.88
2017 825.07 72.88 1155.19 2110.15 2628.17 53.72 118.32 751.90 153.14 581.67 1175.34 156.63
2018 578.42 22.84 118.69 201.54 847.21 68.03 43.35 1540.98 201.33 967.23 130.64 14.06
2019 442.48 48.34 585.14 2261.94 1282.06 25.14 48.86 1090.67 743.01 754.07 199.39 208.95
2020 509.50 25.99 1202.38 70.34 578.53 17.71 70.86 789.14 375.76 311.24 56.21 21.65
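Option B gets close, but the round trip isn't exact: the columns come back as a MultiIndex ('value' over the month names), and the months are sorted alphabetically rather than in water-year order. A minimal sketch of recovering the original shape (assuming the melted frame above):

In [ ]:
# flatten the ('value', month) MultiIndex and restore water-year month order
months = ['OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR',
          'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP']
mm_rt = mm_melted.pivot(index='Year', columns='variable')
mm_rt.columns = mm_rt.columns.droplevel(0)  # drop the 'value' level
mm_rt = mm_rt[months].reset_index()         # reorder months, make Year a column again
mm_rt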

Data Unboxing, Part I¶

In [9]:
!du -h data/*
1.4M	data/jc1.txt
51M	data/jq2.txt
696K	data/mmp.txt
4.1M	data/mmr.txt
4.0K	data/mm.txt
824K	data/mpf.txt
4.0K	data/requirements.txt
In [14]:
!ls -lh data/*
-rw-r--r-- 1 jovyan jovyan 1.4M Sep 28 19:24 data/jc1.txt
-rw-r--r-- 1 jovyan jovyan  51M Sep 28 19:24 data/jq2.txt
-rw-r--r-- 1 jovyan jovyan 695K Sep 28 19:24 data/mmp.txt
-rw-r--r-- 1 jovyan jovyan 4.1M Sep 28 19:24 data/mmr.txt
-rw-r--r-- 1 jovyan jovyan 1.3K Sep 28 19:24 data/mm.txt
-rw-r--r-- 1 jovyan jovyan 822K Sep 28 19:24 data/mpf.txt
-rw-r--r-- 1 jovyan jovyan 2.1K Sep 28 19:24 data/requirements.txt

What category of data is the file below? Any observations about the data?

In [18]:
!head -c 1024 data/jc1.txt
_input,_num,_widgetName,_source,_resultNumber,_pageUrl,game_number,bio1,bio2,bio3,contestant1_name,contestant1_score,contestant2_name,contestant2_score,contestant3_name,contestant3_score
,1,Jeopardy_Winners,Jeopardy_Winners,1,http://www.j-archive.com/showgame.php?game_id=3350,Show #5883 - Wednesday March 24 2010,Derek Honoré an attorney from Inglewood California,Tatiana Walton a graphic designer from Cutler Bay Florida,Regina Robbins an arts teacher from New York New York (whose 1-day cash winnings total $38500),Regina,$19401,Tatiana,$7100,Derek,$11900
,2,Jeopardy_Winners,Jeopardy_Winners,2,http://www.j-archive.com/showgame.php?game_id=4400,Show #6756 - Monday January 20 2014,Jon McGuire a software-development manager from Matthews North Carolina,Blake Perkins an environmental scientist from Baton Rouge Louisiana,Sarah McNitt a study abroad adviser originally from Ann Arbor Michigan (whose 4-day cash winnings total $69199),Sarah,$20199,Blake,$0,Jon,$8380
,3,Jeopardy_Winners,Jeopardy_Winners,3,http://www.j

Exercise: What do you see this time? Category? Interesting features of the data?

In [19]:
!head -c 1024 data/jq2.txt
{"category":"HISTORY","air_date":"2004-12-31","question":"'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'","value":"$200","answer":"Copernicus","round":"Jeopardy!","show_number":"4680"}
{"category":"ESPN's TOP 10 ALL-TIME ATHLETES","air_date":"2004-12-31","question":"'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'","value":"$200","answer":"Jim Thorpe","round":"Jeopardy!","show_number":"4680"}
{"category":"EVERYBODY TALKS ABOUT IT...","air_date":"2004-12-31","question":"'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year'","value":"$200","answer":"Arizona","round":"Jeopardy!","show_number":"4680"}
{"category":"THE COMPANY LINE","air_date":"2004-12-31","question":"'In 1963, live on \"The Art Linkletter Show\", this company served its billionth burger'","value":"$200","answer":"McDonald\\'s","round":"Jeopardy!","show_number":"4680"}
{"category":"EPITAPHS & TRIBUTE

Data Unboxing, Part II¶

Let's explore the first file here:

In [ ]:
!head -c 1024 data/jc1.txt
In [ ]:
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd

jc1 = pd.read_csv('data/jc1.txt', header=0, sep=',')
jc1.head(3)

What data type is each column?

In [ ]:
jc1.dtypes

Currently, the score columns are objects (strings): the '$' prefix kept pandas from inferring them as numeric when it read the file. Let's convert the contestants' scores to numbers.

In [ ]:
# convert '$'-prefixed score strings to floats; anything non-string becomes NaN
for col in jc1.columns:
    if '_score' in col:
        jc1[col] = jc1[col].map(lambda x: float(x.strip('$')) if type(x) == str else np.nan)
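A more vectorized alternative sketch, using pd.to_numeric with errors='coerce' to turn unparseable entries into NaN (run it on the raw string columns, before any conversion):

In [ ]:
# vectorized alternative: strip the '$' prefix, coerce bad parses to NaN
for col in jc1.columns:
    if '_score' in col:
        jc1[col] = pd.to_numeric(jc1[col].str.lstrip('$'), errors='coerce')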

What are our dtypes now?

In [ ]:
jc1.dtypes

Can we visualize our contestant winnings?

In [ ]:
jc1.hist(column='contestant1_score')
jc1.hist(column='contestant2_score')
jc1.hist(column='contestant3_score')

CNRFC-NOAA Rainfall Dataset¶

  • There are four data files tied to the scrape of rainfall data from the NOAA (National Oceanic and Atmospheric Administration) link.
  • This was compiled by visiting each annual website of monthly precipitation by regions in the California Nevada River Forecast Center (CNRFC) area.
  • For more details on how to make this dataset yourself, check out the rainfallscrape folder. You may need to install additional Python packages via pip install -r rainfallscrape/requirements.txt.

Let's unbox this data too:

In [20]:
!ls -lh data/m*
-rw-r--r-- 1 jovyan jovyan 695K Sep 28 19:24 data/mmp.txt
-rw-r--r-- 1 jovyan jovyan 4.1M Sep 28 19:24 data/mmr.txt
-rw-r--r-- 1 jovyan jovyan 1.3K Sep 28 19:24 data/mm.txt
-rw-r--r-- 1 jovyan jovyan 822K Sep 28 19:24 data/mpf.txt

All of these files look rectangular. What data model does each follow, structurally?

Analyze the first few lines of each file, in the order below; a quick peek at the first three follows the list. Remember to adjust the -n (number of lines) flag as needed.

  1. mm.txt
  2. mmp.txt
  3. mmr.txt
  4. mpf.txt - may need to look at first 20 lines here
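For instance (mpf.txt gets a longer look in the next cell):

In [ ]:
!head -n 2 data/mm.txt
!head -n 2 data/mmp.txt
!head -n 2 data/mmr.txt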
In [24]:
!head -n 20 data/mpf.txt
2002, 'SOUTHERN OREGON COASTAL'
2002, 'ID', 'Location', 'OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'WY to Date', 'Pct Avg to Date', 'Pct Tot WY'
2002, 'ASHO3', 'ASHLAND', '0.86', '0.49', '2.12', '3.42', '1.38', '1.00', '0.36', '2.30', '1.54', '0.00', '0.00', '0.16', '13.63', '68', '68'
2002, 'CVJO3', 'CAVE JUNCTION', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M'
2002, 'GOLO3', 'GOLD BEACH', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M'
2002, 'GPSO3', 'GRANTS PASS KAJO', '0.61', '1.21', '4.19', '6.31', '0.24', '0.77', '0.58', '2.02', '0.87', '0.00', '0.00', '0.20', '17.00', '55', '55'
2002, 'GSPO3', 'GREEN SPRINGS PP', '0.35', '0.75', '2.44', '4.14', '0.66', 'M', '0.26', '2.59', 'M', 'M', '0.00', '0.20', 'M'
2002, 'LEMO3', 'LEMOLO LAKE', '3.68', '1.81', '5.59', '18.19', '4.34', '4.32', '3.37', '4.05', '2.52', '0.00', '0.02', '2.33', '50.22', '76', '76'
2002, 'MFR', 'MEDFORD', '0.65', '0.24', '2.86', '3.43', '0.51', '0.74', '0.46', '2.50', '1.20', '0.00', '0.00', '0.05', '12.64', '69', '69'
2002, 'OTH', 'NORTH BEND', '2.43', '2.10', '9.16', '7.73', '2.85', '3.71', '2.71', '4.66', '2.00', '0.04', '0.07', '1.37', '38.83', '60', '60'
2002, 'PRSO3', 'PROSPECT 2SW', '2.22', '1.95', '4.12', '9.19', '2.08', '3.27', '1.43', '4.06', '1.80', '0.00', '0.00', '0.77', '30.89', '74', '74'
2002, 'RIDO3', 'RIDDLE', '0.88', '1.73', '4.20', '7.34', '1.35', '1.74', '1.34', '2.16', '0.81', '0.00', '0.00', '0.72', '22.27', '71', '71'
2002, 'ROGO3', 'ROSEBURG', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M'
2002, 'SXT', 'SEXTON SUMMIT', 'M', '0.42', '5.34', '7.06', '0.73', '2.15', '0.78', '4.16', '0.92', '0.00', '0.05', '0.80', 'M'
2002, 'TKFO3', 'TOKETEE FALLS', '2.70', '1.54', '3.55', '10.37', '3.16', '4.31', '2.39', '4.26', '2.41', '0.00', '0.01', '1.54', '36.24', '75', '75'
2002, 'WINO3', 'WINCHESTER DAM', '1.28', '1.77', '4.14', '8.24', '1.60', '2.63', '1.69', '2.75', '1.33', '0.00', '0.00', '1.10', '26.53', '73', '73'
2002, 'WLMO3', 'WILLIAMS', '0.65', '0.85', '4.52', '7.00', '0.27', '1.40', '0.31', '2.70', '0.79', '0.00', '0.01', '0.08', '18.58', '57', '57'
2002, 'OREGON CLOSED BASINS'
2002, 'ID', 'Location', 'OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'WY to Date', 'Pct Avg to Date', 'Pct Tot WY'
2002, 'HMRO3', 'HART MOUNTAIN REF', '0.00', '0.00', 'M', '2.60', '0.08', '1.63', '0.69', '0.97', '2.59', '0.03', '0.02', '0.00', 'M'

Fulfilling Structural Transformation Requests¶

Let's start from the long (tidy, relational) file:

In [25]:
pd.set_option('display.min_rows', 15) # changes truncated view in pandas
In [7]:
mmr = pd.read_csv('data/mmr.txt')
mmr
Out[7]:
Year ID Location Station Month Inches of Precipitation
0 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL OCT 6.43
1 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL NOV 14.53
2 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL DEC 17.32
3 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL JAN 17.32
4 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL FEB 9.10
... ... ... ... ... ... ...
78751 2020 EMRU1 ENTERPRISE ESCALANTE DESERT MAY 0.07
78752 2020 EMRU1 ENTERPRISE ESCALANTE DESERT JUN 0.00
78753 2020 EMRU1 ENTERPRISE ESCALANTE DESERT JUL 0.00
78754 2020 EMRU1 ENTERPRISE ESCALANTE DESERT AUG 0.11
78755 2020 EMRU1 ENTERPRISE ESCALANTE DESERT SEP 0.00

78756 rows × 6 columns

A data analyst asks for just the precipitation grouped by year and month, with no location data. How do we do this?

  • What are they asking for? This is ambiguous, because we also have per-location data; presumably they want one aggregate (here, the mean across locations) per year-month pair.
  • How do we pivot the data into year x month form?
In [10]:
(mmr
 .groupby(['Year', 'Month'])
 ['Inches of Precipitation']  # keep just the measurement column
 .mean()                      # average over all locations in each (year, month)
 .reset_index()
 .pivot(index='Year', columns='Month')
)
Out[10]:
Inches of Precipitation
Month APR AUG DEC FEB JAN JUL JUN MAR MAY NOV OCT SEP
Year
2002 1.137029 0.031988 5.759020 1.715299 2.593914 0.113276 0.164169 2.341909 0.778444 4.450114 0.779886 0.245989
2003 4.328567 0.749020 8.223591 2.309420 2.203646 0.273139 0.069637 2.834558 1.287335 3.441080 0.152645 0.274792
2004 0.952753 0.348478 6.403657 5.577956 2.604282 0.167885 0.209191 1.106704 0.710847 2.330726 0.153994 0.303676
2005 2.147717 0.269502 5.158300 3.068052 5.242566 0.147035 0.954290 4.220372 3.293669 1.786081 4.117131 0.339278
2006 4.769494 0.063776 8.898985 2.988799 5.387297 0.286698 0.317152 6.027270 0.891957 2.744532 1.036478 0.108553
2007 1.589755 0.220915 3.888541 4.986019 0.974650 0.337500 0.272588 0.989938 0.464861 2.874665 0.546820 0.598613
2008 0.498474 0.128776 3.812276 3.281987 6.945710 0.144394 0.147396 1.021129 0.825161 1.039874 1.939620 0.048238
2009 0.954487 0.192935 3.569159 4.758199 1.710385 0.125699 1.091293 2.833762 1.842159 2.618889 0.895625 0.155199
2010 3.991922 0.175827 3.418704 3.679385 5.818758 0.137862 0.488040 2.636990 1.583604 1.223742 2.369213 0.342548
2011 1.589761 0.078509 8.518367 3.521395 1.544233 0.320114 1.291661 7.198146 2.008582 3.378106 3.544080 0.290795
2012 2.770936 0.337091 0.534778 1.429661 3.638942 0.350532 0.571979 5.496522 0.389155 1.792911 1.813299 0.172264
2013 0.936354 0.372731 6.666061 0.675429 1.230545 0.269000 0.509749 1.403262 0.922319 4.098370 1.359747 1.249782
2014 1.400409 0.608843 0.637643 4.676441 0.838561 0.365509 0.157500 3.519856 0.653055 1.026496 0.419648 0.924432
2015 1.281828 0.194196 6.060000 3.118551 0.546971 0.828107 0.432817 0.875036 1.398760 2.105344 1.251131 0.359915
2016 1.696984 0.120987 5.709291 1.450909 6.572060 0.155851 0.460711 5.603896 1.054807 2.399208 1.191836 0.216721
2017 3.496059 0.290359 4.936709 8.755809 10.512680 0.206615 0.475181 3.186017 0.595875 2.540044 4.917741 0.611836
2018 2.370574 0.098026 0.474760 0.790353 3.361944 0.276545 0.173400 6.066850 0.821755 3.838214 0.518413 0.056466
2019 1.762869 0.198930 2.589115 9.120726 5.341917 0.103884 0.191608 4.260430 3.057654 3.168361 0.830792 0.949773
2020 2.079592 0.103135 4.790359 0.278024 2.332782 0.071124 0.291605 3.194899 1.521296 1.383289 0.233237 0.087298
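As an aside, pivot_table collapses the groupby-then-pivot into a single call; a sketch that should match the result above, assuming the same mean aggregate:

In [ ]:
# pivot_table = groupby + pivot in one step
mmr.pivot_table(index='Year', columns='Month',
                values='Inches of Precipitation', aggfunc='mean')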

Scalar Functions and Query Plans¶

In [11]:
## we'll use the Lahman baseball database in our examples today.
## replace the database connection with a database of your own!
%reload_ext sql
%sql postgresql://localhost:5432/baseball
%config SqlMagic.displaylimit = 30
In [12]:
%%sql
WITH yearnum AS
  (SELECT yearid, (yearid % 100) as year
     FROM batting
  )
SELECT yearid, CONCAT('''', LPAD(year::text, 2, '0')) as year
  FROM yearnum
 LIMIT 5;
Running query in 'postgresql://localhost:5432/baseball'
5 rows affected.
Out[12]:
yearid year
2004 '04
2007 '07
2009 '09
2010 '10
2012 '12

Let's analyze the query below (we've inlined the CTE from above for convenience):

In [13]:
%%sql
EXPLAIN (VERBOSE true)
SELECT yearid,
       CONCAT('''', LPAD((yearid % 100)::text, 2, '0')) AS year
FROM batting;
Running query in 'postgresql://localhost:5432/baseball'
2 rows affected.
Out[13]:
QUERY PLAN
Seq Scan on public.batting (cost=0.00..3922.29 rows=104324 width=36)
  Output: yearid, concat('''', lpad(((yearid % 100))::text, 2, '0'::text))

What if scalar functions mention multiple tables?

The query below computes an arbitrary statistic for pitchers:

  • +1 point for every batter they strike out as a pitcher
  • −1 point for every time they themselves strike out as a batter

If the notebook-like output is hard to read, try out the query in psql. Note that notebooks don't preserve whitespace when displaying dataframes.

In [14]:
%%sql
EXPLAIN (VERBOSE true)
SELECT p.playerid, p.so - b.so
  FROM pitching p
  INNER JOIN batting b
  ON p.playerid=b.playerid;
Running query in 'postgresql://localhost:5432/baseball'
11 rows affected.
Out[14]:
QUERY PLAN
Nested Loop (cost=0.43..13004.27 rows=339358 width=13)
  Output: p.playerid, (p.so - b.so)
  -> Seq Scan on public.pitching p (cost=0.00..1374.06 rows=45806 width=13)
        Output: p.playerid, p.yearid, p.stint, p.teamid, p.lgid, p.w, p.l, p.g, p.gs, p.cg, p.sho, p.sv, p.ipouts, p.h, p.er, p.hr, p.bb, p.so, p.baopp, p.era, p.ibb, p.wp, p.hbp, p.bk, p.bfp, p.gf, p.r, p.sh, p.sf, p.gidp
  -> Memoize (cost=0.43..0.73 rows=7 width=13)
        Output: b.so, b.playerid
        Cache Key: p.playerid
        Cache Mode: logical
        -> Index Scan using batting_pkey on public.batting b (cost=0.42..0.72 rows=7 width=13)
              Output: b.so, b.playerid
              Index Cond: ((b.playerid)::text = (p.playerid)::text)

[Extra] Recreating the CNRFC-NOAA data¶

  • There are four data files tied to the scrape of rainfall data from the NOAA (National Oceanic and Atmospheric Administration) link.
  • This was compiled by visiting each annual website of monthly precipitation by regions in the California Nevada River Forecast Center (CNRFC) area.
  • For more details on how to make this dataset yourself, check out the rainfallscrape folder. You may need to install additional Python packages via pip install -r rainfallscrape/requirements.txt. We'll assume you've already run simple_scrape.py; see the sketch below.
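A hypothetical invocation (the script name comes from the bullet above; the exact entry point may differ):

In [ ]:
# hypothetical: install the scraper's dependencies, then run the scrape script
!pip install -r rainfallscrape/requirements.txt
!python rainfallscrape/simple_scrape.py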
In [ ]:
!head -c 4096 data/mpf.txt

Messy! You can poke around in bash or pandas if you like. Let's clean this up a bit. First, note that the structure of the file is off: multiple tables are interleaved in a single CSV. It's hard to read multiple dataframes from one file like this, so let's split it into multiple files and then concatenate them into one dataframe.

  • Iterate through the lines in the file
  • Identify the lines that carry region names (exactly two comma-separated fields)
  • Print the region names to see what they look like
In [ ]:
# lines with exactly two comma-separated fields are region headers
with open('data/mpf.txt') as f:
    for line in f.readlines():
        splits = line.split(', ')
        if len(splits) == 2:
            print(splits[1])

Okay, first let's create some files, one for each segment, and rename the segments into something more conventional.

  • Iterate through the lines in the file
  • Identify the lines that carry region names
  • Print the transformed filenames to see what they look like
In [ ]:
with open('data/mpf.txt') as f:  # note: data/, consistent with the other cells
    for line in f.readlines():
        splits = line.split(', ')
        if len(splits) == 2:
            # strip quotes and whitespace, then normalize the name for use in a filename
            print(splits[1].strip(" '\n").replace('...', '-').replace(' ', '_'))

That's probably more of what we want! Now, let's actually write our new files...

In [ ]:
curr_file = None
with open('data/mpf.txt') as master_file:
    for line in master_file.readlines():
        splits = line.split(', ')
        if len(splits) == 2:
            if curr_file is not None:
                curr_file.close()
            filename = splits[0] + '_' + splits[1].strip(" '\n ")
            filename = filename.replace('...', '-')
            filename = filename.replace(' ', '_')
            curr_file = open('data/subfiles/' + filename + '.txt', 'w')
        curr_file.write(line)
curr_file.close()
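A quick sanity check that the split produced one file per region segment (assuming the cell above ran):

In [ ]:
!ls data/subfiles/ | head -n 5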

Now, let's read these files into a dataframe...and we'll need to do a lot of preprocessing!

In [ ]:
import os
mpf = pd.DataFrame()
parent_dir = os.getcwd() + '/data/subfiles/'
for filename in os.listdir(parent_dir):
    # read from the first header. not the second!
    region_df = pd.read_csv(parent_dir + '/' + filename, header=1)
    # BE CAREFUL! Let's rename our schema...because right now, this is problematic.
    # The first column is just labeled as the year itself (e.g. 2007). Not 'Year'.
    region_df = region_df.rename(columns={filename.split('_')[0]: 'Year'})
    mpf = mpf.append(region_df)
mpf = mpf.sort_values('Year')
mpf

Now, let's figure out what our data looks like in Pandas.

In [ ]:
mpf
In [ ]:
mpf.dtypes

Let's rename our columns!

In [ ]:
# strip stray quotes and spaces from the column names (but leave 'Year' alone)
col_renames = {}
for column in mpf.columns:
    if column != 'Year':
        col_renames[column] = column.strip(" '")
mpf = mpf.rename(columns=col_renames)
mpf
In [ ]:
# Let's replace the 'M' markers with NaNs and strip the stray quotes
def _process_data(x):
    if isinstance(x, float):       # already numeric (including NaN)
        return x
    x = x.strip("' ")
    if x == 'M' or x == 'NA':      # 'M' marks a missing observation
        return np.nan
    else:
        return float(x)

mpf = mpf[mpf['Year'] == 2002].copy()  # restrict to 2002; .copy() avoids SettingWithCopyWarning
for col in mpf.columns:
    if col in ["ID", "Location"]:
        mpf[col] = mpf[col].map(lambda x: str(x).strip("'"))
    elif col != 'Year':
        mpf[col] = mpf[col].map(_process_data)
mpf
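From here, one could melt the cleaned wide table into the long (tidy) form we saw in mmr.txt. A minimal sketch, assuming the month columns survived cleaning under the names below:

In [ ]:
# unpivot the month columns into (Month, Inches of Precipitation) pairs
month_cols = ['OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR',
              'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP']
mpf_long = mpf.melt(id_vars=['Year', 'ID', 'Location'],
                    value_vars=month_cols,
                    var_name='Month',
                    value_name='Inches of Precipitation')
mpf_long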