# to save space, do a symbolic link to previous lecture's data
!ln -sf ../lec11/data .
import pandas as pd
mm = pd.read_csv('data/mm.txt', header=0)
mm
 | Year | OCT | NOV | DEC | JAN | FEB | MAR | APR | MAY | JUN | JUL | AUG | SEP
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2002 | 545.92 | 3115.08 | 3996.76 | 1815.74 | 1204.14 | 1644.02 | 795.92 | 540.24 | 112.62 | 79.52 | 22.20 | 171.70 |
1 | 2003 | 55.41 | 1242.23 | 2976.94 | 797.72 | 836.01 | 1026.11 | 1571.27 | 468.59 | 24.93 | 98.33 | 267.40 | 99.20 |
2 | 2004 | 55.90 | 834.40 | 2311.72 | 942.75 | 2019.22 | 399.52 | 339.18 | 251.64 | 72.38 | 55.57 | 116.74 | 97.48 |
3 | 2006 | 347.22 | 908.44 | 2981.16 | 1793.97 | 995.27 | 2031.19 | 1602.55 | 287.21 | 102.44 | 90.31 | 18.75 | 33.76 |
4 | 2005 | 1449.23 | 619.77 | 1789.93 | 1777.23 | 1055.41 | 1472.91 | 743.11 | 1113.26 | 309.19 | 46.61 | 86.51 | 93.98 |
5 | 2007 | 178.81 | 942.89 | 1279.33 | 320.66 | 1615.47 | 317.77 | 519.85 | 150.15 | 85.32 | 102.60 | 62.74 | 164.02 |
6 | 2008 | 612.92 | 329.64 | 1189.43 | 2153.17 | 1007.57 | 316.55 | 153.53 | 255.80 | 42.45 | 41.73 | 36.83 | 12.59 |
7 | 2009 | 272.27 | 777.81 | 1102.87 | 533.64 | 1479.80 | 881.30 | 297.80 | 580.28 | 320.84 | 35.95 | 53.25 | 42.99 |
8 | 2010 | 722.61 | 379.36 | 1029.03 | 1780.54 | 1136.93 | 814.83 | 1225.52 | 487.75 | 146.90 | 38.05 | 46.77 | 88.72 |
9 | 2011 | 1059.68 | 1016.81 | 2555.51 | 463.27 | 1059.94 | 2173.84 | 465.80 | 552.36 | 350.04 | 84.51 | 21.59 | 76.77 |
10 | 2020 | 56.21 | 311.24 | 1202.38 | 578.53 | 70.34 | 789.14 | 509.50 | 375.76 | 70.86 | 17.71 | 25.99 | 21.65 |
11 | 2012 | 527.67 | 523.53 | 156.69 | 1066.21 | 421.75 | 1643.46 | 828.51 | 115.19 | 164.73 | 98.85 | 92.70 | 45.65 |
12 | 2014 | 119.18 | 281.26 | 178.54 | 233.12 | 1314.08 | 978.52 | 376.71 | 179.59 | 42.21 | 96.86 | 163.17 | 244.05 |
13 | 2013 | 376.65 | 1131.15 | 1759.84 | 338.40 | 189.12 | 391.51 | 259.37 | 254.56 | 142.22 | 72.63 | 96.91 | 343.69 |
14 | 2015 | 342.81 | 551.60 | 1654.38 | 149.87 | 860.72 | 243.26 | 343.53 | 338.50 | 109.07 | 201.23 | 49.52 | 84.94 |
15 | 2016 | 305.11 | 635.79 | 1530.09 | 1754.74 | 367.08 | 1395.37 | 427.64 | 245.77 | 110.11 | 37.56 | 28.19 | 52.88 |
16 | 2017 | 1175.34 | 581.67 | 1155.19 | 2628.17 | 2110.15 | 751.90 | 825.07 | 153.14 | 118.32 | 53.72 | 72.88 | 156.63 |
17 | 2018 | 130.64 | 967.23 | 118.69 | 847.21 | 201.54 | 1540.98 | 578.42 | 201.33 | 43.35 | 68.03 | 22.84 | 14.06 |
18 | 2019 | 199.39 | 754.07 | 585.14 | 1282.06 | 2261.94 | 1090.67 | 442.48 | 743.01 | 48.86 | 25.14 | 48.34 | 208.95 |
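One thing to notice in the table above: the scraped rows are not in chronological order (2006 appears before 2005, and 2020 is sandwiched between 2011 and 2012). A quick, non-mutating sort for readability:

mm.sort_values('Year')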
!du -h data/*
1.4M	data/jc1.txt
51M	data/jq2.txt
600K	data/mmp.txt
4.3M	data/mmr.txt
4.0K	data/mm.txt
628K	data/mpf.txt
4.0K	data/requirements.txt
!ls -lh data/*
-rw-r--r-- 1 jovyan jovyan 1.4M Feb 28 20:14 data/jc1.txt
-rw-r--r-- 1 jovyan jovyan  51M Feb 28 20:14 data/jq2.txt
-rw-r--r-- 1 jovyan jovyan 600K Feb 28 20:14 data/mmp.txt
-rw-r--r-- 1 jovyan jovyan 4.3M Feb 28 20:14 data/mmr.txt
-rw-r--r-- 1 jovyan jovyan 4.0K Feb 28 20:14 data/mm.txt
-rw-r--r-- 1 jovyan jovyan 627K Feb 28 20:14 data/mpf.txt
-rw-r--r-- 1 jovyan jovyan 2.1K Feb 28 20:14 data/requirements.txt
What category of data is the file below? Any observations about the data?
!head -c 1024 data/jc1.txt
_input,_num,_widgetName,_source,_resultNumber,_pageUrl,game_number,bio1,bio2,bio3,contestant1_name,contestant1_score,contestant2_name,contestant2_score,contestant3_name,contestant3_score
,1,Jeopardy_Winners,Jeopardy_Winners,1,http://www.j-archive.com/showgame.php?game_id=3350,Show #5883 - Wednesday March 24 2010,Derek Honoré an attorney from Inglewood California,Tatiana Walton a graphic designer from Cutler Bay Florida,Regina Robbins an arts teacher from New York New York (whose 1-day cash winnings total $38500),Regina,$19401,Tatiana,$7100,Derek,$11900
,2,Jeopardy_Winners,Jeopardy_Winners,2,http://www.j-archive.com/showgame.php?game_id=4400,Show #6756 - Monday January 20 2014,Jon McGuire a software-development manager from Matthews North Carolina,Blake Perkins an environmental scientist from Baton Rouge Louisiana,Sarah McNitt a study abroad adviser originally from Ann Arbor Michigan (whose 4-day cash winnings total $69199),Sarah,$20199,Blake,$0,Jon,$8380
,3,Jeopardy_Winners,Jeopardy_Winners,3,http://www.j
Exercise: What do you see this time? Category? Interesting features of the data?
!head -c 1024 data/jq2.txt
{"category":"HISTORY","air_date":"2004-12-31","question":"'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'","value":"$200","answer":"Copernicus","round":"Jeopardy!","show_number":"4680"} {"category":"ESPN's TOP 10 ALL-TIME ATHLETES","air_date":"2004-12-31","question":"'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'","value":"$200","answer":"Jim Thorpe","round":"Jeopardy!","show_number":"4680"} {"category":"EVERYBODY TALKS ABOUT IT...","air_date":"2004-12-31","question":"'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year'","value":"$200","answer":"Arizona","round":"Jeopardy!","show_number":"4680"} {"category":"THE COMPANY LINE","air_date":"2004-12-31","question":"'In 1963, live on \"The Art Linkletter Show\", this company served its billionth burger'","value":"$200","answer":"McDonald\\'s","round":"Jeopardy!","show_number":"4680"} {"category":"EPITAPHS & TRIBUTE
Let's explore the first file here:
!head -c 1024 data/jc1.txt
_input,_num,_widgetName,_source,_resultNumber,_pageUrl,game_number,bio1,bio2,bio3,contestant1_name,contestant1_score,contestant2_name,contestant2_score,contestant3_name,contestant3_score
,1,Jeopardy_Winners,Jeopardy_Winners,1,http://www.j-archive.com/showgame.php?game_id=3350,Show #5883 - Wednesday March 24 2010,Derek Honoré an attorney from Inglewood California,Tatiana Walton a graphic designer from Cutler Bay Florida,Regina Robbins an arts teacher from New York New York (whose 1-day cash winnings total $38500),Regina,$19401,Tatiana,$7100,Derek,$11900
,2,Jeopardy_Winners,Jeopardy_Winners,2,http://www.j-archive.com/showgame.php?game_id=4400,Show #6756 - Monday January 20 2014,Jon McGuire a software-development manager from Matthews North Carolina,Blake Perkins an environmental scientist from Baton Rouge Louisiana,Sarah McNitt a study abroad adviser originally from Ann Arbor Michigan (whose 4-day cash winnings total $69199),Sarah,$20199,Blake,$0,Jon,$8380
,3,Jeopardy_Winners,Jeopardy_Winners,3,http://www.j
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
jc1 = pd.read_csv('data/jc1.txt', header=0, sep=',')
jc1.head(3)
 | _input | _num | _widgetName | _source | _resultNumber | _pageUrl | game_number | bio1 | bio2 | bio3 | contestant1_name | contestant1_score | contestant2_name | contestant2_score | contestant3_name | contestant3_score
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | NaN | 1 | Jeopardy_Winners | Jeopardy_Winners | 1 | http://www.j-archive.com/showgame.php?game_id=... | Show #5883 - Wednesday March 24 2010 | Derek Honoré an attorney from Inglewood Califo... | Tatiana Walton a graphic designer from Cutler ... | Regina Robbins an arts teacher from New York N... | Regina | $19401 | Tatiana | $7100 | Derek | $11900 |
1 | NaN | 2 | Jeopardy_Winners | Jeopardy_Winners | 2 | http://www.j-archive.com/showgame.php?game_id=... | Show #6756 - Monday January 20 2014 | Jon McGuire a software-development manager fro... | Blake Perkins an environmental scientist from ... | Sarah McNitt a study abroad adviser originally... | Sarah | $20199 | Blake | $0 | Jon | $8380 |
2 | NaN | 3 | Jeopardy_Winners | Jeopardy_Winners | 3 | http://www.j-archive.com/showgame.php?game_id=... | Show #5754 - Thursday September 24 2009 | Christopher Nold an economics and film studies... | Rachel Landau a museum night security guard fr... | Joann Mariani a high school English teacher fr... | Joann | $99 | Rachel | NaN | Christopher | $23000 |
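Note that the scraper added bookkeeping columns (_input, _num, _widgetName, _source, _resultNumber) that carry no game content; _input appears entirely empty here. If you wanted a view without them, a sketch:

jc1.drop(columns=['_input', '_num', '_widgetName', '_source', '_resultNumber']).head(3)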
What data types are each column?
jc1.dtypes
_input               float64
_num                   int64
_widgetName           object
_source               object
_resultNumber          int64
_pageUrl              object
game_number           object
bio1                  object
bio2                  object
bio3                  object
contestant1_name      object
contestant1_score     object
contestant2_name      object
contestant2_score     object
contestant3_name      object
contestant3_score     object
dtype: object
Currently the score columns have object dtype, because pandas read the $-prefixed values as strings. Let's convert the contestants' scores into numbers.
jc1.columns
for col in jc1.columns:
    if '_score' in col:
        # strip the leading '$' and parse; non-strings (i.e. NaN) stay NaN
        jc1[col] = jc1[col].map(lambda x: float(x.strip('$')) if type(x) == str else np.nan)
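For reference, the same conversion can be done with vectorized string operations instead of a Python-level lambda. A sketch that would replace the loop above (errors='coerce' turns anything non-numeric into NaN):

for col in jc1.columns:
    if '_score' in col:
        jc1[col] = pd.to_numeric(jc1[col].str.lstrip('$'), errors='coerce')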
What are our dtypes now?
jc1.dtypes
_input               float64
_num                   int64
_widgetName           object
_source               object
_resultNumber          int64
_pageUrl              object
game_number           object
bio1                  object
bio2                  object
bio3                  object
contestant1_name      object
contestant1_score    float64
contestant2_name      object
contestant2_score    float64
contestant3_name      object
contestant3_score    float64
dtype: object
Can we visualize our contestant winnings?
jc1.hist(column='contestant1_score')
jc1.hist(column='contestant2_score')
jc1.hist(column='contestant3_score')
array([[<Axes: title={'center': 'contestant3_score'}>]], dtype=object)
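To compare the three distributions side by side instead of one at a time, a sketch:

jc1.hist(column=['contestant1_score', 'contestant2_score', 'contestant3_score'],
         layout=(1, 3), figsize=(12, 3), sharey=True)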
The scraping code for this rainfall data lives in the rainfallscrape folder. You may need to install additional Python packages via pip install -r rainfallscrape/requirements.txt. Let's unbox this data too:
!ls -lh data/m*
-rw-r--r-- 1 jovyan jovyan 600K Feb 28 20:14 data/mmp.txt
-rw-r--r-- 1 jovyan jovyan 4.3M Feb 28 20:14 data/mmr.txt
-rw-r--r-- 1 jovyan jovyan 4.0K Feb 28 20:14 data/mm.txt
-rw-r--r-- 1 jovyan jovyan 627K Feb 28 20:14 data/mpf.txt
All of these files look rectangular. What data model does each follow, structurally?

Analyze the first few lines of each file in the order below. Remember to adjust the -n number of lines as needed:

- mm.txt
- mmp.txt
- mmr.txt
- mpf.txt

We may need to look at the first 20 lines here:

!head -n 20 data/mmp.txt
Year,ID,Location,Station,OCT,NOV,DEC,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP
2002,4BK,BROOKINGS,SOUTHERN OREGON COASTAL,6.43,14.53,17.32,17.32,9.1,6.05,6.62,3.65,3.68,0.02,0.03,1.45
2002,ASHO3,ASHLAND,SOUTHERN OREGON COASTAL,0.38,3.5,3.41,1.32,1.29,1.29,2.92,1.88,0.08,0.28,0,0.2
2002,COPO3,COPPER 4NE,SOUTHERN OREGON COASTAL,0.29,6.68,6.98,3.42,1.99,1.8,,,,,,0.27
2002,CVJO3,CAVE JUNCTION,SOUTHERN OREGON COASTAL,2.46,13.6,14.81,9.76,6.46,4.63,1.94,0.89,0,0,0,0.33
2002,GOLO3,GOLD BEACH,SOUTHERN OREGON COASTAL,4.63,11.72,16.59,14.58,8.89,6.62,4.73,1.5,2.09,0.02,0,0.62
2002,GPSO3,GRANTS PASS KAJO,SOUTHERN OREGON COASTAL,0.39,6.37,6.94,4.31,2.89,1.36,0.87,0.62,0.02,0,0,0.05
2002,GSPO3,GREEN SPRINGS PP,SOUTHERN OREGON COASTAL,0.36,4.79,5.9,2.52,1.47,1.74,2.41,1.15,0.11,0.01,0,0.2
2002,LEMO3,LEMOLO LAKE,SOUTHERN OREGON COASTAL,5.13,10.3,12.72,11.48,4.99,6.32,5.49,1.14,2.22,0.4,0.2,2.13
2002,MFR,MEDFORD,SOUTHERN OREGON COASTAL,0.19,4.16,4.34,1.59,1.65,1.33,1.49,0.53,0.03,0.08,0,0.54
2002,OTH,NORTH BEND,SOUTHERN OREGON COASTAL,4.45,10.25,14.1,15.79,4.14,6.67,4.56,1.48,1.42,0.01,0.22,1.33
2002,PRSO3,PROSPECT 2SW,SOUTHERN OREGON COASTAL,1.78,7.17,9.1,5.86,3.45,3.51,2.88,0.65,0.34,0.03,0,0.82
2002,RIDO3,RIDDLE,SOUTHERN OREGON COASTAL,1.02,6.84,6.71,4.81,2.76,1.71,1.23,1.3,0.26,0.02,0,0.24
2002,ROGO3,ROSEBURG,SOUTHERN OREGON COASTAL,1.91,6.88,6.65,4.83,2.4,0.85,1.54,1.13,0.19,0,0,0.32
2002,SXT,SEXTON SUMMIT,SOUTHERN OREGON COASTAL,0.28,5.13,5.75,5.75,3.03,2.9,1.63,1.08,0.07,0.01,0,0.2
2002,TKFO3,TOKETEE FALLS,SOUTHERN OREGON COASTAL,3.23,7.45,9.59,6.94,3.54,4.06,3.31,1.22,1.17,0.04,0.02,1.99
2002,WINO3,WINCHESTER DAM,SOUTHERN OREGON COASTAL,1.66,7.37,7.39,5.32,2.77,2.93,1.46,1.29,0.25,0,0.08,0.65
2002,WLMO3,WILLIAMS,SOUTHERN OREGON COASTAL,0.4,7.11,7.98,7.98,2.68,2.68,1.79,0.46,0,0,0,0.08
2002,DNON2,DENIO,OREGON CLOSED BASINS,0.25,1.54,1.37,0.69,0.91,0.56,0.83,0.15,0.12,0,0.08,0.53
2002,HMRO3,HART MOUNTAIN REF,OREGON CLOSED BASINS,0.34,1.37,0.61,0.67,0.63,0.72,1.45,0.6,0.66,0,0.15,0.57
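Structurally, this is the wide (pivoted) layout: one row per station and year, one column per month. Melting it should reproduce the long layout of mmr.txt, which we read next. A sketch:

mmp = pd.read_csv('data/mmp.txt')
mmp.melt(id_vars=['Year', 'ID', 'Location', 'Station'],
         var_name='Month', value_name='Inches of Precipitation')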
Let's start from the long (tidy, relational) file:
pd.set_option('display.min_rows', 15) # changes truncated view in pandas
mmr = pd.read_csv('data/mmr.txt')
mmr
 | Year | ID | Location | Station | Month | Inches of Precipitation
---|---|---|---|---|---|---
0 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | OCT | 6.43 |
1 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | NOV | 14.53 |
2 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | DEC | 17.32 |
3 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | JAN | 17.32 |
4 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | FEB | 9.10 |
5 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | MAR | 6.05 |
6 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | APR | 6.62 |
... | ... | ... | ... | ... | ... | ... |
78749 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | MAR | 3.65 |
78750 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | APR | 1.30 |
78751 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | MAY | 0.07 |
78752 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | JUN | 0.00 |
78753 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | JUL | 0.00 |
78754 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | AUG | 0.11 |
78755 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | SEP | 0.00 |
78756 rows × 6 columns
A data analyst asks for just the precipitation grouped by year and month, with no location data. How do we do this?
(mmr
 .groupby(['Year', 'Month'])
 [['Inches of Precipitation']]
 .mean()
)
Year | Month | Inches of Precipitation
---|---|---
2002 | APR | 1.137029
 | AUG | 0.031988
 | DEC | 5.759020
 | FEB | 1.715299
 | JAN | 2.593914
 | JUL | 0.113276
 | JUN | 0.164169
... | ... | ...
2020 | JUL | 0.071124
 | JUN | 0.291605
 | MAR | 3.194899
 | MAY | 1.521296
 | NOV | 1.383289
 | OCT | 0.233237
 | SEP | 0.087298
228 rows × 1 columns
(mmr
 .groupby(['Year', 'Month'])
 [['Inches of Precipitation']]
 .mean()
 .reset_index()
)
 | Year | Month | Inches of Precipitation
---|---|---|---
0 | 2002 | APR | 1.137029 |
1 | 2002 | AUG | 0.031988 |
2 | 2002 | DEC | 5.759020 |
3 | 2002 | FEB | 1.715299 |
4 | 2002 | JAN | 2.593914 |
5 | 2002 | JUL | 0.113276 |
6 | 2002 | JUN | 0.164169 |
... | ... | ... | ... |
221 | 2020 | JUL | 0.071124 |
222 | 2020 | JUN | 0.291605 |
223 | 2020 | MAR | 3.194899 |
224 | 2020 | MAY | 1.521296 |
225 | 2020 | NOV | 1.383289 |
226 | 2020 | OCT | 0.233237 |
227 | 2020 | SEP | 0.087298 |
228 rows × 3 columns
(mmr
 .groupby(['Year', 'Month'])
 [['Inches of Precipitation']]
 .mean()
 .reset_index()
 .pivot(index='Year', columns='Month')
)
All twelve columns below share the top-level label Inches of Precipitation (a column MultiIndex), with one column per Month, indexed by Year:

Year | APR | AUG | DEC | FEB | JAN | JUL | JUN | MAR | MAY | NOV | OCT | SEP
---|---|---|---|---|---|---|---|---|---|---|---|---
2002 | 1.137029 | 0.031988 | 5.759020 | 1.715299 | 2.593914 | 0.113276 | 0.164169 | 2.341909 | 0.778444 | 4.450114 | 0.779886 | 0.245989 |
2003 | 4.328567 | 0.749020 | 8.223591 | 2.309420 | 2.203646 | 0.273139 | 0.069637 | 2.834558 | 1.287335 | 3.441080 | 0.152645 | 0.274792 |
2004 | 0.952753 | 0.348478 | 6.403657 | 5.577956 | 2.604282 | 0.167885 | 0.209191 | 1.106704 | 0.710847 | 2.330726 | 0.153994 | 0.303676 |
2005 | 2.147717 | 0.269502 | 5.158300 | 3.068052 | 5.242566 | 0.147035 | 0.954290 | 4.220372 | 3.293669 | 1.786081 | 4.117131 | 0.339278 |
2006 | 4.769494 | 0.063776 | 8.898985 | 2.988799 | 5.387297 | 0.286698 | 0.317152 | 6.027270 | 0.891957 | 2.744532 | 1.036478 | 0.108553 |
2007 | 1.589755 | 0.220915 | 3.888541 | 4.986019 | 0.974650 | 0.337500 | 0.272588 | 0.989938 | 0.464861 | 2.874665 | 0.546820 | 0.598613 |
2008 | 0.498474 | 0.128776 | 3.812276 | 3.281987 | 6.945710 | 0.144394 | 0.147396 | 1.021129 | 0.825161 | 1.039874 | 1.939620 | 0.048238 |
2009 | 0.954487 | 0.192935 | 3.569159 | 4.758199 | 1.710385 | 0.125699 | 1.091293 | 2.833762 | 1.842159 | 2.618889 | 0.895625 | 0.155199 |
2010 | 3.991922 | 0.175827 | 3.418704 | 3.679385 | 5.818758 | 0.137862 | 0.488040 | 2.636990 | 1.583604 | 1.223742 | 2.369213 | 0.342548 |
2011 | 1.589761 | 0.078509 | 8.518367 | 3.521395 | 1.544233 | 0.320114 | 1.291661 | 7.198146 | 2.008582 | 3.378106 | 3.544080 | 0.290795 |
2012 | 2.770936 | 0.337091 | 0.534778 | 1.429661 | 3.638942 | 0.350532 | 0.571979 | 5.496522 | 0.389155 | 1.792911 | 1.813299 | 0.172264 |
2013 | 0.936354 | 0.372731 | 6.666061 | 0.675429 | 1.230545 | 0.269000 | 0.509749 | 1.403262 | 0.922319 | 4.098370 | 1.359747 | 1.249782 |
2014 | 1.400409 | 0.608843 | 0.637643 | 4.676441 | 0.838561 | 0.365509 | 0.157500 | 3.519856 | 0.653055 | 1.026496 | 0.419648 | 0.924432 |
2015 | 1.281828 | 0.194196 | 6.060000 | 3.118551 | 0.546971 | 0.828107 | 0.432817 | 0.875036 | 1.398760 | 2.105344 | 1.251131 | 0.359915 |
2016 | 1.696984 | 0.120987 | 5.709291 | 1.450909 | 6.572060 | 0.155851 | 0.460711 | 5.603896 | 1.054807 | 2.399208 | 1.191836 | 0.216721 |
2017 | 3.496059 | 0.290359 | 4.936709 | 8.755809 | 10.512680 | 0.206615 | 0.475181 | 3.186017 | 0.595875 | 2.540044 | 4.917741 | 0.611836 |
2018 | 2.370574 | 0.098026 | 0.474760 | 0.790353 | 3.361944 | 0.276545 | 0.173400 | 6.066850 | 0.821755 | 3.838214 | 0.518413 | 0.056466 |
2019 | 1.762869 | 0.198930 | 2.589115 | 9.120726 | 5.341917 | 0.103884 | 0.191608 | 4.260430 | 3.057654 | 3.168361 | 0.830792 | 0.949773 |
2020 | 2.079592 | 0.103135 | 4.790359 | 0.278024 | 2.332782 | 0.071124 | 0.291605 | 3.194899 | 1.521296 | 1.383289 | 0.233237 | 0.087298 |
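For what it's worth, pivot_table collapses the groupby-mean-pivot chain above into a single call; a sketch:

mmr.pivot_table(index='Year', columns='Month',
                values='Inches of Precipitation', aggfunc='mean')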
Recall that the scraping code is in the rainfallscrape folder; you may need to install additional Python packages via pip install -r rainfallscrape/requirements.txt. We'll assume you've already run simple_scrape.py.

!head -c 4096 data/mpf.txt
Messy! You can poke around in bash or pandas if you like. Let's clean this up a bit. First, we note that the structure of the file is a bit off: it's hard to read multiple dataframes from a single CSV like this, so let's split it into multiple files and then concatenate them into one DataFrame.
# find the segment-header lines: they're the only lines of the form "<year>, '<region name>'"
with open('data/mpf.txt') as f:
    for line in f.readlines():
        splits = line.split(', ')
        if len(splits) == 2:
            print(splits[1])
Okay, those are the segment names. Before writing one file per segment, let's clean the names into something more conventional:
with open('data/mpf.txt') as f:
    for line in f.readlines():
        splits = line.split(', ')
        if len(splits) == 2:
            # strip quotes/whitespace and make the name filesystem-friendly
            print(splits[1].strip(" '\n ").replace('...', '-').replace(' ', '_'))
That's probably more of what we want! Now, let's actually write our new files...
import os
os.makedirs('data/subfiles', exist_ok=True)  # make sure the output directory exists

curr_file = None
with open('data/mpf.txt') as master_file:
    for line in master_file.readlines():
        splits = line.split(', ')
        if len(splits) == 2:
            # segment header: close the previous subfile and start a new one
            if curr_file is not None:
                curr_file.close()
            filename = splits[0] + '_' + splits[1].strip(" '\n ")
            filename = filename.replace('...', '-')
            filename = filename.replace(' ', '_')
            curr_file = open('data/subfiles/' + filename + '.txt', 'w')
        curr_file.write(line)
curr_file.close()
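A quick sanity check that the split worked (the exact filenames depend on the segment headers in your copy of the file):

sorted(os.listdir('data/subfiles'))[:5]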
Now, let's read these files into a dataframe...and we'll need to do a lot of preprocessing!
import os
mpf = pd.DataFrame()
parent_dir = os.getcwd() + '/data/subfiles/'
for filename in os.listdir(parent_dir):
    # skip the segment-header line; the column names are on the second line of each subfile
    region_df = pd.read_csv(parent_dir + filename, header=1)
    # BE CAREFUL! The first column is labeled with the year itself (e.g. 2007), not 'Year',
    # so rename it before concatenating.
    region_df = region_df.rename(columns={filename.split('_')[0]: 'Year'})
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    mpf = pd.concat([mpf, region_df], ignore_index=True)
mpf = mpf.sort_values('Year')
mpf
Now, let's figure out what our data looks like in pandas.
mpf
mpf.dtypes
Let's rename our columns!
col_set = {}
for column in mpf.columns:
if column != 'Year':
col_set.update({column: column.strip(" '")})
mpf = mpf.rename(columns=col_set)
mpf
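As an aside, rename also accepts a function, which makes this a one-liner; a sketch (safe for every column, since 'Year' has nothing to strip):

mpf = mpf.rename(columns=lambda c: c.strip(" '"))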
# Let's remove the Ms and replace them with NaNs, and strip the quotes
import math
def _process_data(x):
    if type(x) == float:
        return x
    if type(x) != str and math.isnan(x):
        return np.nan
    x = x.strip("' ")
    if x == 'M' or x == 'NA':
        return np.nan
    else:
        return float(x)
# restrict to a single year for this cleaning pass (keeps only the 2002 rows)
mpf = mpf[mpf['Year'] == 2002]
for col in mpf.columns:
    if col in ["ID", "Location"]:
        print(col)
        mpf[col] = mpf[col].map(lambda x: str(x).strip("'"))
    elif col != 'Year':
        mpf[col] = mpf[col].map(_process_data)
mpf
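For reference, the per-cell _process_data function can also be replaced with vectorized string operations; a sketch (errors='coerce' maps 'M', 'NA', and anything else non-numeric to NaN):

for col in mpf.columns:
    if col in ['ID', 'Location']:
        mpf[col] = mpf[col].astype(str).str.strip("'")
    elif col != 'Year':
        mpf[col] = pd.to_numeric(mpf[col].astype(str).str.strip("' "), errors='coerce')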