Lecture 13: Data Preparation I¶

In [1]:
# to save space, do a symbolic link to previous lecture's data
!ln -sf ../lec11/data .
In [2]:
import pandas as pd
In [3]:
mm = pd.read_csv('data/mm.txt', header=0)
mm
Out[3]:
Year OCT NOV DEC JAN FEB MAR APR MAY JUN JUL AUG SEP
0 2002 545.92 3115.08 3996.76 1815.74 1204.14 1644.02 795.92 540.24 112.62 79.52 22.20 171.70
1 2003 55.41 1242.23 2976.94 797.72 836.01 1026.11 1571.27 468.59 24.93 98.33 267.40 99.20
2 2004 55.90 834.40 2311.72 942.75 2019.22 399.52 339.18 251.64 72.38 55.57 116.74 97.48
3 2006 347.22 908.44 2981.16 1793.97 995.27 2031.19 1602.55 287.21 102.44 90.31 18.75 33.76
4 2005 1449.23 619.77 1789.93 1777.23 1055.41 1472.91 743.11 1113.26 309.19 46.61 86.51 93.98
5 2007 178.81 942.89 1279.33 320.66 1615.47 317.77 519.85 150.15 85.32 102.60 62.74 164.02
6 2008 612.92 329.64 1189.43 2153.17 1007.57 316.55 153.53 255.80 42.45 41.73 36.83 12.59
7 2009 272.27 777.81 1102.87 533.64 1479.80 881.30 297.80 580.28 320.84 35.95 53.25 42.99
8 2010 722.61 379.36 1029.03 1780.54 1136.93 814.83 1225.52 487.75 146.90 38.05 46.77 88.72
9 2011 1059.68 1016.81 2555.51 463.27 1059.94 2173.84 465.80 552.36 350.04 84.51 21.59 76.77
10 2020 56.21 311.24 1202.38 578.53 70.34 789.14 509.50 375.76 70.86 17.71 25.99 21.65
11 2012 527.67 523.53 156.69 1066.21 421.75 1643.46 828.51 115.19 164.73 98.85 92.70 45.65
12 2014 119.18 281.26 178.54 233.12 1314.08 978.52 376.71 179.59 42.21 96.86 163.17 244.05
13 2013 376.65 1131.15 1759.84 338.40 189.12 391.51 259.37 254.56 142.22 72.63 96.91 343.69
14 2015 342.81 551.60 1654.38 149.87 860.72 243.26 343.53 338.50 109.07 201.23 49.52 84.94
15 2016 305.11 635.79 1530.09 1754.74 367.08 1395.37 427.64 245.77 110.11 37.56 28.19 52.88
16 2017 1175.34 581.67 1155.19 2628.17 2110.15 751.90 825.07 153.14 118.32 53.72 72.88 156.63
17 2018 130.64 967.23 118.69 847.21 201.54 1540.98 578.42 201.33 43.35 68.03 22.84 14.06
18 2019 199.39 754.07 585.14 1282.06 2261.94 1090.67 442.48 743.01 48.86 25.14 48.34 208.95
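Notice the Year values aren't in order (2006 comes before 2005, and 2020 sits between 2011 and 2012) — the file preserves whatever order the scrape produced. A quick `sort_values` fixes the row order; a sketch on the first few rows from the table above:

```python
import pandas as pd

# First few Year/OCT values from the table above, in the file's original order
mini = pd.DataFrame({'Year': [2002, 2003, 2004, 2006, 2005],
                     'OCT': [545.92, 55.41, 55.90, 347.22, 1449.23]})
print(mini.sort_values('Year'))
```

On the real frame this is just `mm.sort_values('Year')`.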

Data Unboxing, Part I¶

In [4]:
!du -h data/*
1.4M	data/jc1.txt
51M	data/jq2.txt
600K	data/mmp.txt
4.3M	data/mmr.txt
4.0K	data/mm.txt
628K	data/mpf.txt
4.0K	data/requirements.txt
In [5]:
!ls -lh data/*
-rw-r--r-- 1 jovyan jovyan 1.4M Feb 28 20:14 data/jc1.txt
-rw-r--r-- 1 jovyan jovyan  51M Feb 28 20:14 data/jq2.txt
-rw-r--r-- 1 jovyan jovyan 600K Feb 28 20:14 data/mmp.txt
-rw-r--r-- 1 jovyan jovyan 4.3M Feb 28 20:14 data/mmr.txt
-rw-r--r-- 1 jovyan jovyan 4.0K Feb 28 20:14 data/mm.txt
-rw-r--r-- 1 jovyan jovyan 627K Feb 28 20:14 data/mpf.txt
-rw-r--r-- 1 jovyan jovyan 2.1K Feb 28 20:14 data/requirements.txt

What category of data is the file below? Any observations about the data?

In [6]:
!head -c 1024 data/jc1.txt
_input,_num,_widgetName,_source,_resultNumber,_pageUrl,game_number,bio1,bio2,bio3,contestant1_name,contestant1_score,contestant2_name,contestant2_score,contestant3_name,contestant3_score
,1,Jeopardy_Winners,Jeopardy_Winners,1,http://www.j-archive.com/showgame.php?game_id=3350,Show #5883 - Wednesday March 24 2010,Derek Honoré an attorney from Inglewood California,Tatiana Walton a graphic designer from Cutler Bay Florida,Regina Robbins an arts teacher from New York New York (whose 1-day cash winnings total $38500),Regina,$19401,Tatiana,$7100,Derek,$11900
,2,Jeopardy_Winners,Jeopardy_Winners,2,http://www.j-archive.com/showgame.php?game_id=4400,Show #6756 - Monday January 20 2014,Jon McGuire a software-development manager from Matthews North Carolina,Blake Perkins an environmental scientist from Baton Rouge Louisiana,Sarah McNitt a study abroad adviser originally from Ann Arbor Michigan (whose 4-day cash winnings total $69199),Sarah,$20199,Blake,$0,Jon,$8380
,3,Jeopardy_Winners,Jeopardy_Winners,3,http://www.j

Exercise: What do you see this time? Category? Interesting features of the data?

In [7]:
!head -c 1024 data/jq2.txt
{"category":"HISTORY","air_date":"2004-12-31","question":"'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'","value":"$200","answer":"Copernicus","round":"Jeopardy!","show_number":"4680"}
{"category":"ESPN's TOP 10 ALL-TIME ATHLETES","air_date":"2004-12-31","question":"'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'","value":"$200","answer":"Jim Thorpe","round":"Jeopardy!","show_number":"4680"}
{"category":"EVERYBODY TALKS ABOUT IT...","air_date":"2004-12-31","question":"'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year'","value":"$200","answer":"Arizona","round":"Jeopardy!","show_number":"4680"}
{"category":"THE COMPANY LINE","air_date":"2004-12-31","question":"'In 1963, live on \"The Art Linkletter Show\", this company served its billionth burger'","value":"$200","answer":"McDonald\\'s","round":"Jeopardy!","show_number":"4680"}
{"category":"EPITAPHS & TRIBUTE
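Each line here is a complete JSON object — newline-delimited JSON (JSON Lines). pandas parses this format directly with `read_json(..., lines=True)`; a minimal sketch on one record copied from above (the full file would be read with `pd.read_json('data/jq2.txt', lines=True)`):

```python
import io

import pandas as pd

# One record from data/jq2.txt, wrapped in StringIO so the sketch is self-contained
sample = io.StringIO(
    '{"category":"HISTORY","air_date":"2004-12-31",'
    '"question":"...","value":"$200","answer":"Copernicus",'
    '"round":"Jeopardy!","show_number":"4680"}\n'
)
jq2 = pd.read_json(sample, lines=True)
print(jq2[['category', 'answer', 'value']])
```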

[On your own] Data Unboxing, Part II¶

Let's explore the first file here:

In [8]:
!head -c 1024 data/jc1.txt
_input,_num,_widgetName,_source,_resultNumber,_pageUrl,game_number,bio1,bio2,bio3,contestant1_name,contestant1_score,contestant2_name,contestant2_score,contestant3_name,contestant3_score
,1,Jeopardy_Winners,Jeopardy_Winners,1,http://www.j-archive.com/showgame.php?game_id=3350,Show #5883 - Wednesday March 24 2010,Derek Honoré an attorney from Inglewood California,Tatiana Walton a graphic designer from Cutler Bay Florida,Regina Robbins an arts teacher from New York New York (whose 1-day cash winnings total $38500),Regina,$19401,Tatiana,$7100,Derek,$11900
,2,Jeopardy_Winners,Jeopardy_Winners,2,http://www.j-archive.com/showgame.php?game_id=4400,Show #6756 - Monday January 20 2014,Jon McGuire a software-development manager from Matthews North Carolina,Blake Perkins an environmental scientist from Baton Rouge Louisiana,Sarah McNitt a study abroad adviser originally from Ann Arbor Michigan (whose 4-day cash winnings total $69199),Sarah,$20199,Blake,$0,Jon,$8380
,3,Jeopardy_Winners,Jeopardy_Winners,3,http://www.j
In [9]:
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd

jc1 = pd.read_csv('data/jc1.txt', header=0, sep=',')
jc1.head(3)
Out[9]:
_input _num _widgetName _source _resultNumber _pageUrl game_number bio1 bio2 bio3 contestant1_name contestant1_score contestant2_name contestant2_score contestant3_name contestant3_score
0 NaN 1 Jeopardy_Winners Jeopardy_Winners 1 http://www.j-archive.com/showgame.php?game_id=... Show #5883 - Wednesday March 24 2010 Derek Honoré an attorney from Inglewood Califo... Tatiana Walton a graphic designer from Cutler ... Regina Robbins an arts teacher from New York N... Regina $19401 Tatiana $7100 Derek $11900
1 NaN 2 Jeopardy_Winners Jeopardy_Winners 2 http://www.j-archive.com/showgame.php?game_id=... Show #6756 - Monday January 20 2014 Jon McGuire a software-development manager fro... Blake Perkins an environmental scientist from ... Sarah McNitt a study abroad adviser originally... Sarah $20199 Blake $0 Jon $8380
2 NaN 3 Jeopardy_Winners Jeopardy_Winners 3 http://www.j-archive.com/showgame.php?game_id=... Show #5754 - Thursday September 24 2009 Christopher Nold an economics and film studies... Rachel Landau a museum night security guard fr... Joann Mariani a high school English teacher fr... Joann $99 Rachel NaN Christopher $23000

What data type does each column have?

In [10]:
jc1.dtypes
Out[10]:
_input               float64
_num                   int64
_widgetName           object
_source               object
_resultNumber          int64
_pageUrl              object
game_number           object
bio1                  object
bio2                  object
bio3                  object
contestant1_name      object
contestant1_score     object
contestant2_name      object
contestant2_score     object
contestant3_name      object
contestant3_score     object
dtype: object

Right now most of these columns are object dtype, because pandas couldn't infer anything more specific when it read the file. Let's convert the contestant score columns to numbers by stripping the $ prefix.

In [11]:
for col in jc1.columns:
    if '_score' in col:
        # strip the leading '$' and convert; anything non-string (e.g. NaN) becomes NaN
        jc1[col] = jc1[col].map(lambda x: float(x.strip('$')) if isinstance(x, str) else np.nan)
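An equivalent, more vectorized approach is `pd.to_numeric` with `errors='coerce'`, which maps anything unparseable to NaN in one pass. A sketch on a few scores from the table above:

```python
import pandas as pd

# '$'-prefixed score strings -> floats; the missing value coerces to NaN
scores = pd.Series(['$19401', '$7100', None, '$0'])
cleaned = pd.to_numeric(scores.str.lstrip('$'), errors='coerce')
print(cleaned)
```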

What are our dtypes now?

In [12]:
jc1.dtypes
Out[12]:
_input               float64
_num                   int64
_widgetName           object
_source               object
_resultNumber          int64
_pageUrl              object
game_number           object
bio1                  object
bio2                  object
bio3                  object
contestant1_name      object
contestant1_score    float64
contestant2_name      object
contestant2_score    float64
contestant3_name      object
contestant3_score    float64
dtype: object

Can we visualize our contestant winnings?

In [13]:
jc1.hist(column='contestant1_score')
jc1.hist(column='contestant2_score')
jc1.hist(column='contestant3_score')
Out[13]:
array([[<Axes: title={'center': 'contestant3_score'}>]], dtype=object)
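Three separate figures are hard to compare. `DataFrame.plot.hist` overlays all three columns on one set of axes with shared bins — a sketch on the scores from the first two games shown above (on the real data it would be `jc1[[c for c in jc1.columns if '_score' in c]].plot.hist(...)`):

```python
import matplotlib
matplotlib.use('Agg')  # headless-safe backend for this sketch
import pandas as pd

# Scores from the first two games in jc1 above
df = pd.DataFrame({'contestant1_score': [19401.0, 20199.0],
                   'contestant2_score': [7100.0, 0.0],
                   'contestant3_score': [11900.0, 8380.0]})
ax = df.plot.hist(alpha=0.5, bins=10)
ax.set_xlabel('Final score ($)')
```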

CNRFC-NOAA Rainfall Dataset¶

  • There are four data files tied to the scrape of rainfall data from the NOAA (National Oceanic and Atmospheric Administration) link.
  • This was compiled by visiting each annual website of monthly precipitation by regions in the California Nevada River Forecast Center (CNRFC) area.
  • For more details on how to make this dataset yourself, check out the rainfallscrape folder. You may need to install additional Python packages via pip install -r rainfallscrape/requirements.txt.

Let's unbox this data too:

In [14]:
!ls -lh data/m*
-rw-r--r-- 1 jovyan jovyan 600K Feb 28 20:14 data/mmp.txt
-rw-r--r-- 1 jovyan jovyan 4.3M Feb 28 20:14 data/mmr.txt
-rw-r--r-- 1 jovyan jovyan 4.0K Feb 28 20:14 data/mm.txt
-rw-r--r-- 1 jovyan jovyan 627K Feb 28 20:14 data/mpf.txt

All of these files look rectangular. What data model does each follow, structurally?

Analyze the first few lines of each file in order below. Remember to adjust the line count passed to -n as needed.

  1. mm.txt
  2. mmp.txt
  3. mmr.txt
  4. mpf.txt - may need to look at first 20 lines here
In [15]:
!head -n 20 data/mpf.txt
Year,ID,Location,Station,OCT,NOV,DEC,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP
2002,4BK,BROOKINGS,SOUTHERN OREGON COASTAL,6.43,14.53,17.32,17.32,9.1,6.05,6.62,3.65,3.68,0.02,0.03,1.45
2002,ASHO3,ASHLAND,SOUTHERN OREGON COASTAL,0.38,3.5,3.41,1.32,1.29,1.29,2.92,1.88,0.08,0.28,0,0.2
2002,COPO3,COPPER 4NE,SOUTHERN OREGON COASTAL,0.29,6.68,6.98,3.42,1.99,1.8,,,,,,0.27
2002,CVJO3,CAVE JUNCTION,SOUTHERN OREGON COASTAL,2.46,13.6,14.81,9.76,6.46,4.63,1.94,0.89,0,0,0,0.33
2002,GOLO3,GOLD BEACH,SOUTHERN OREGON COASTAL,4.63,11.72,16.59,14.58,8.89,6.62,4.73,1.5,2.09,0.02,0,0.62
2002,GPSO3,GRANTS PASS KAJO,SOUTHERN OREGON COASTAL,0.39,6.37,6.94,4.31,2.89,1.36,0.87,0.62,0.02,0,0,0.05
2002,GSPO3,GREEN SPRINGS PP,SOUTHERN OREGON COASTAL,0.36,4.79,5.9,2.52,1.47,1.74,2.41,1.15,0.11,0.01,0,0.2
2002,LEMO3,LEMOLO LAKE,SOUTHERN OREGON COASTAL,5.13,10.3,12.72,11.48,4.99,6.32,5.49,1.14,2.22,0.4,0.2,2.13
2002,MFR,MEDFORD,SOUTHERN OREGON COASTAL,0.19,4.16,4.34,1.59,1.65,1.33,1.49,0.53,0.03,0.08,0,0.54
2002,OTH,NORTH BEND,SOUTHERN OREGON COASTAL,4.45,10.25,14.1,15.79,4.14,6.67,4.56,1.48,1.42,0.01,0.22,1.33
2002,PRSO3,PROSPECT 2SW,SOUTHERN OREGON COASTAL,1.78,7.17,9.1,5.86,3.45,3.51,2.88,0.65,0.34,0.03,0,0.82
2002,RIDO3,RIDDLE,SOUTHERN OREGON COASTAL,1.02,6.84,6.71,4.81,2.76,1.71,1.23,1.3,0.26,0.02,0,0.24
2002,ROGO3,ROSEBURG,SOUTHERN OREGON COASTAL,1.91,6.88,6.65,4.83,2.4,0.85,1.54,1.13,0.19,0,0,0.32
2002,SXT,SEXTON SUMMIT,SOUTHERN OREGON COASTAL,0.28,5.13,5.75,5.75,3.03,2.9,1.63,1.08,0.07,0.01,0,0.2
2002,TKFO3,TOKETEE FALLS,SOUTHERN OREGON COASTAL,3.23,7.45,9.59,6.94,3.54,4.06,3.31,1.22,1.17,0.04,0.02,1.99
2002,WINO3,WINCHESTER DAM,SOUTHERN OREGON COASTAL,1.66,7.37,7.39,5.32,2.77,2.93,1.46,1.29,0.25,0,0.08,0.65
2002,WLMO3,WILLIAMS,SOUTHERN OREGON COASTAL,0.4,7.11,7.98,7.98,2.68,2.68,1.79,0.46,0,0,0,0.08
2002,DNON2,DENIO,OREGON CLOSED BASINS,0.25,1.54,1.37,0.69,0.91,0.56,0.83,0.15,0.12,0,0.08,0.53
2002,HMRO3,HART MOUNTAIN REF,OREGON CLOSED BASINS,0.34,1.37,0.61,0.67,0.63,0.72,1.45,0.6,0.66,0,0.15,0.57

Fulfilling Structural Transformation Requests¶

Let's start from the long (tidy, relational) file:

In [16]:
pd.set_option('display.min_rows', 15) # changes truncated view in pandas
In [17]:
mmr = pd.read_csv('data/mmr.txt')
mmr
Out[17]:
Year ID Location Station Month Inches of Precipitation
0 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL OCT 6.43
1 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL NOV 14.53
2 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL DEC 17.32
3 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL JAN 17.32
4 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL FEB 9.10
5 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL MAR 6.05
6 2002 4BK BROOKINGS SOUTHERN OREGON COASTAL APR 6.62
... ... ... ... ... ... ...
78749 2020 EMRU1 ENTERPRISE ESCALANTE DESERT MAR 3.65
78750 2020 EMRU1 ENTERPRISE ESCALANTE DESERT APR 1.30
78751 2020 EMRU1 ENTERPRISE ESCALANTE DESERT MAY 0.07
78752 2020 EMRU1 ENTERPRISE ESCALANTE DESERT JUN 0.00
78753 2020 EMRU1 ENTERPRISE ESCALANTE DESERT JUL 0.00
78754 2020 EMRU1 ENTERPRISE ESCALANTE DESERT AUG 0.11
78755 2020 EMRU1 ENTERPRISE ESCALANTE DESERT SEP 0.00

78756 rows × 6 columns

A data analyst asks for just the precipitation grouped by year and month, with no location data. How do we do this?

  • What are they asking for? This isn't intuitive, because we have location data as well. What do we do?
  • How do we pivot the data into year x month form?
In [18]:
(mmr
 .groupby(['Year', 'Month'])
 [['Inches of Precipitation']]
 .mean()
)
Out[18]:
Inches of Precipitation
Year Month
2002 APR 1.137029
AUG 0.031988
DEC 5.759020
FEB 1.715299
JAN 2.593914
JUL 0.113276
JUN 0.164169
... ... ...
2020 JUL 0.071124
JUN 0.291605
MAR 3.194899
MAY 1.521296
NOV 1.383289
OCT 0.233237
SEP 0.087298

228 rows × 1 columns

In [19]:
(mmr
 .groupby(['Year', 'Month'])
 [['Inches of Precipitation']]
 .mean()
 .reset_index()
)
Out[19]:
Year Month Inches of Precipitation
0 2002 APR 1.137029
1 2002 AUG 0.031988
2 2002 DEC 5.759020
3 2002 FEB 1.715299
4 2002 JAN 2.593914
5 2002 JUL 0.113276
6 2002 JUN 0.164169
... ... ... ...
221 2020 JUL 0.071124
222 2020 JUN 0.291605
223 2020 MAR 3.194899
224 2020 MAY 1.521296
225 2020 NOV 1.383289
226 2020 OCT 0.233237
227 2020 SEP 0.087298

228 rows × 3 columns

In [20]:
(mmr
 .groupby(['Year', 'Month'])
 [['Inches of Precipitation']]
 .mean()
 .reset_index()
 .pivot(index='Year', columns='Month')
)
Out[20]:
Inches of Precipitation
Month APR AUG DEC FEB JAN JUL JUN MAR MAY NOV OCT SEP
Year
2002 1.137029 0.031988 5.759020 1.715299 2.593914 0.113276 0.164169 2.341909 0.778444 4.450114 0.779886 0.245989
2003 4.328567 0.749020 8.223591 2.309420 2.203646 0.273139 0.069637 2.834558 1.287335 3.441080 0.152645 0.274792
2004 0.952753 0.348478 6.403657 5.577956 2.604282 0.167885 0.209191 1.106704 0.710847 2.330726 0.153994 0.303676
2005 2.147717 0.269502 5.158300 3.068052 5.242566 0.147035 0.954290 4.220372 3.293669 1.786081 4.117131 0.339278
2006 4.769494 0.063776 8.898985 2.988799 5.387297 0.286698 0.317152 6.027270 0.891957 2.744532 1.036478 0.108553
2007 1.589755 0.220915 3.888541 4.986019 0.974650 0.337500 0.272588 0.989938 0.464861 2.874665 0.546820 0.598613
2008 0.498474 0.128776 3.812276 3.281987 6.945710 0.144394 0.147396 1.021129 0.825161 1.039874 1.939620 0.048238
2009 0.954487 0.192935 3.569159 4.758199 1.710385 0.125699 1.091293 2.833762 1.842159 2.618889 0.895625 0.155199
2010 3.991922 0.175827 3.418704 3.679385 5.818758 0.137862 0.488040 2.636990 1.583604 1.223742 2.369213 0.342548
2011 1.589761 0.078509 8.518367 3.521395 1.544233 0.320114 1.291661 7.198146 2.008582 3.378106 3.544080 0.290795
2012 2.770936 0.337091 0.534778 1.429661 3.638942 0.350532 0.571979 5.496522 0.389155 1.792911 1.813299 0.172264
2013 0.936354 0.372731 6.666061 0.675429 1.230545 0.269000 0.509749 1.403262 0.922319 4.098370 1.359747 1.249782
2014 1.400409 0.608843 0.637643 4.676441 0.838561 0.365509 0.157500 3.519856 0.653055 1.026496 0.419648 0.924432
2015 1.281828 0.194196 6.060000 3.118551 0.546971 0.828107 0.432817 0.875036 1.398760 2.105344 1.251131 0.359915
2016 1.696984 0.120987 5.709291 1.450909 6.572060 0.155851 0.460711 5.603896 1.054807 2.399208 1.191836 0.216721
2017 3.496059 0.290359 4.936709 8.755809 10.512680 0.206615 0.475181 3.186017 0.595875 2.540044 4.917741 0.611836
2018 2.370574 0.098026 0.474760 0.790353 3.361944 0.276545 0.173400 6.066850 0.821755 3.838214 0.518413 0.056466
2019 1.762869 0.198930 2.589115 9.120726 5.341917 0.103884 0.191608 4.260430 3.057654 3.168361 0.830792 0.949773
2020 2.079592 0.103135 4.790359 0.278024 2.332782 0.071124 0.291605 3.194899 1.521296 1.383289 0.233237 0.087298
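The groupby-then-pivot chain above can also be collapsed into a single `pivot_table` call, which aggregates and reshapes in one step. A sketch on a few rows shaped like mmr (precipitation values copied from the files above); on the real data it's just `mmr.pivot_table(...)`:

```python
import pandas as pd

# A handful of long-form rows shaped like mmr
mini = pd.DataFrame({
    'Year': [2002, 2002, 2002, 2002],
    'Month': ['OCT', 'NOV', 'OCT', 'NOV'],
    'Inches of Precipitation': [6.43, 14.53, 0.38, 3.50],
})
# One call: group by (Year, Month), take the mean, pivot Month into columns
wide = mini.pivot_table(index='Year', columns='Month',
                        values='Inches of Precipitation', aggfunc='mean')
print(wide)
```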

[Extra] Recreating the CNRFC-NOAA data¶

  • There are four data files tied to the scrape of rainfall data from the NOAA (National Oceanic and Atmospheric Administration) link.
  • This was compiled by visiting each annual website of monthly precipitation by regions in the California Nevada River Forecast Center (CNRFC) area.
  • For more details on how to make this dataset yourself, check out the rainfallscrape folder. You may need to install additional Python packages via pip install -r rainfallscrape/requirements.txt. We'll assume you've already run simple_scrape.py.
In [ ]:
!head -c 4096 data/mpf.txt

Messy! You can explore it in bash or pandas if you like. Let's clean this up a bit. The structure of the file is off: several tables are interleaved in a single CSV, and pandas can't read multiple dataframes out of one file like this. So let's split it into one file per segment, then concatenate them into a single dataframe.

  • Iterate through the lines in this file...
  • Identify the header lines that name a location
  • Print the location names to see what they look like
In [ ]:
with open('data/mpf.txt') as f:
    for line in f.readlines():
        splits = line.split(', ')
        if len(splits) == 2:  # segment headers look like "<year>, '<LOCATION>'"
            print(splits[1])

Okay, we want one file per segment, but first let's transform the location names into something more conventional for filenames.

  • Iterate through the lines in this file again...
  • Identify the header lines that name a location
  • Strip quotes and whitespace, replace '...' with '-' and spaces with underscores, then print the result
In [ ]:
with open('data/mpf.txt') as f:
    for line in f.readlines():
        splits = line.split(', ')
        if len(splits) == 2:
            # strip quotes/newlines, then make the name filename-friendly
            print(splits[1].strip(" '\n").replace('...', '-').replace(' ', '_'))

That's probably more of what we want! Now, let's actually write our new files...

In [ ]:
import os
os.makedirs('data/subfiles', exist_ok=True)  # make sure the output directory exists

curr_file = None
with open('data/mpf.txt') as master_file:
    for line in master_file.readlines():
        splits = line.split(', ')
        if len(splits) == 2:
            if curr_file is not None:
                curr_file.close()
            filename = splits[0] + '_' + splits[1].strip(" '\n ")
            filename = filename.replace('...', '-')
            filename = filename.replace(' ', '_')
            curr_file = open('data/subfiles/' + filename + '.txt', 'w')
        curr_file.write(line)
curr_file.close()

Now, let's read these files into a dataframe...and we'll need to do a lot of preprocessing!

In [ ]:
import os

frames = []
parent_dir = os.getcwd() + '/data/subfiles/'
for filename in os.listdir(parent_dir):
    # read from the first header, not the second!
    region_df = pd.read_csv(parent_dir + '/' + filename, header=1)
    # BE CAREFUL! Rename our schema: right now the first column is labeled
    # with the year itself (e.g. 2007), not 'Year'.
    region_df = region_df.rename(columns={filename.split('_')[0]: 'Year'})
    frames.append(region_df)
# DataFrame.append was removed in pandas 2.0; concatenate once at the end instead
mpf = pd.concat(frames, ignore_index=True)
mpf = mpf.sort_values('Year')
mpf

Now, let's figure out what our data looks like in Pandas.

In [ ]:
mpf
In [ ]:
mpf.dtypes

Let's rename our columns!

In [ ]:
col_map = {}  # old column name -> name with quotes/whitespace stripped
for column in mpf.columns:
    if column != 'Year':
        col_map[column] = column.strip(" '")
mpf = mpf.rename(columns=col_map)
mpf
In [ ]:
# Let's replace the Ms (missing-data markers) with NaNs, and strip the quotes
def _process_data(x):
    if isinstance(x, float):   # already numeric, including NaN
        return x
    x = str(x).strip("' ")
    if x in ('M', 'NA'):
        return np.nan
    return float(x)

mpf = mpf[mpf['Year'] == 2002]  # keep just the 2002 season for this walkthrough
for col in mpf.columns:
    if col in ('ID', 'Location'):
        mpf[col] = mpf[col].map(lambda x: str(x).strip("'"))
    elif col != 'Year':
        mpf[col] = mpf[col].map(_process_data)
mpf
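Finally, the reverse structural transformation: `melt` takes a wide year-by-month table (like mm or the cleaned mpf) back to the long form of mmr. A sketch using the column names from the files above, on a single wide row:

```python
import pandas as pd

# One wide row shaped like mpf, values copied from the rainfall file above
wide = pd.DataFrame({'Year': [2002], 'ID': ['4BK'],
                     'Location': ['BROOKINGS'],
                     'OCT': [6.43], 'NOV': [14.53]})
# id_vars stay fixed; every other column becomes a (Month, value) pair
long = wide.melt(id_vars=['Year', 'ID', 'Location'],
                 var_name='Month', value_name='Inches of Precipitation')
print(long)
```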