# to save space, do a symbolic link to previous lecture's data
!ln -sf ../lec11/data .
To start, let's take our matrix in mm.txt and load it into Pandas.
import pandas as pd
mm = pd.read_csv('data/mm.txt', header=0)
mm
  | Year | OCT | NOV | DEC | JAN | FEB | MAR | APR | MAY | JUN | JUL | AUG | SEP
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2002 | 545.92 | 3115.08 | 3996.76 | 1815.74 | 1204.14 | 1644.02 | 795.92 | 540.24 | 112.62 | 79.52 | 22.20 | 171.70 |
1 | 2003 | 55.41 | 1242.23 | 2976.94 | 797.72 | 836.01 | 1026.11 | 1571.27 | 468.59 | 24.93 | 98.33 | 267.40 | 99.20 |
2 | 2004 | 55.90 | 834.40 | 2311.72 | 942.75 | 2019.22 | 399.52 | 339.18 | 251.64 | 72.38 | 55.57 | 116.74 | 97.48 |
3 | 2006 | 347.22 | 908.44 | 2981.16 | 1793.97 | 995.27 | 2031.19 | 1602.55 | 287.21 | 102.44 | 90.31 | 18.75 | 33.76 |
4 | 2005 | 1449.23 | 619.77 | 1789.93 | 1777.23 | 1055.41 | 1472.91 | 743.11 | 1113.26 | 309.19 | 46.61 | 86.51 | 93.98 |
5 | 2007 | 178.81 | 942.89 | 1279.33 | 320.66 | 1615.47 | 317.77 | 519.85 | 150.15 | 85.32 | 102.60 | 62.74 | 164.02 |
6 | 2008 | 612.92 | 329.64 | 1189.43 | 2153.17 | 1007.57 | 316.55 | 153.53 | 255.80 | 42.45 | 41.73 | 36.83 | 12.59 |
7 | 2009 | 272.27 | 777.81 | 1102.87 | 533.64 | 1479.80 | 881.30 | 297.80 | 580.28 | 320.84 | 35.95 | 53.25 | 42.99 |
8 | 2010 | 722.61 | 379.36 | 1029.03 | 1780.54 | 1136.93 | 814.83 | 1225.52 | 487.75 | 146.90 | 38.05 | 46.77 | 88.72 |
9 | 2011 | 1059.68 | 1016.81 | 2555.51 | 463.27 | 1059.94 | 2173.84 | 465.80 | 552.36 | 350.04 | 84.51 | 21.59 | 76.77 |
10 | 2020 | 56.21 | 311.24 | 1202.38 | 578.53 | 70.34 | 789.14 | 509.50 | 375.76 | 70.86 | 17.71 | 25.99 | 21.65 |
11 | 2012 | 527.67 | 523.53 | 156.69 | 1066.21 | 421.75 | 1643.46 | 828.51 | 115.19 | 164.73 | 98.85 | 92.70 | 45.65 |
12 | 2014 | 119.18 | 281.26 | 178.54 | 233.12 | 1314.08 | 978.52 | 376.71 | 179.59 | 42.21 | 96.86 | 163.17 | 244.05 |
13 | 2013 | 376.65 | 1131.15 | 1759.84 | 338.40 | 189.12 | 391.51 | 259.37 | 254.56 | 142.22 | 72.63 | 96.91 | 343.69 |
14 | 2015 | 342.81 | 551.60 | 1654.38 | 149.87 | 860.72 | 243.26 | 343.53 | 338.50 | 109.07 | 201.23 | 49.52 | 84.94 |
15 | 2016 | 305.11 | 635.79 | 1530.09 | 1754.74 | 367.08 | 1395.37 | 427.64 | 245.77 | 110.11 | 37.56 | 28.19 | 52.88 |
16 | 2017 | 1175.34 | 581.67 | 1155.19 | 2628.17 | 2110.15 | 751.90 | 825.07 | 153.14 | 118.32 | 53.72 | 72.88 | 156.63 |
17 | 2018 | 130.64 | 967.23 | 118.69 | 847.21 | 201.54 | 1540.98 | 578.42 | 201.33 | 43.35 | 68.03 | 22.84 | 14.06 |
18 | 2019 | 199.39 | 754.07 | 585.14 | 1282.06 | 2261.94 | 1090.67 | 442.48 | 743.01 | 48.86 | 25.14 | 48.34 | 208.95 |
mm_melted = mm.melt(id_vars=['Year'])
mm_melted
  | Year | variable | value
---|---|---|---
0 | 2002 | OCT | 545.92 |
1 | 2003 | OCT | 55.41 |
2 | 2004 | OCT | 55.90 |
3 | 2006 | OCT | 347.22 |
4 | 2005 | OCT | 1449.23 |
... | ... | ... | ... |
223 | 2015 | SEP | 84.94 |
224 | 2016 | SEP | 52.88 |
225 | 2017 | SEP | 156.63 |
226 | 2018 | SEP | 14.06 |
227 | 2019 | SEP | 208.95 |
228 rows × 3 columns
Thanks to the id_vars parameter, the Year column is kept and repeated for every (variable=column name, value=cell value) pair produced from its row.
mm_melted[mm_melted['Year'] == 2002]
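As an aside, melt can name the generated columns directly instead of defaulting to variable/value. A minimal sketch (var_name and value_name are standard pandas parameters; mm_long and the labels are our own choices):

```python
# same melt, but with descriptive names for the generated columns
mm_long = mm.melt(id_vars=['Year'], var_name='Month', value_name='Inches')
mm_long.head()
```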
# how do we get back to something that resembles the original matrix?
# mm_melted.pivot(index='variable', columns='Year') # A
mm_melted.pivot(index='Year', columns='variable')#.reset_index() #B
(index: Year; columns: a MultiIndex with value on top and variable below)

Year | APR | AUG | DEC | FEB | JAN | JUL | JUN | MAR | MAY | NOV | OCT | SEP
---|---|---|---|---|---|---|---|---|---|---|---|---
2002 | 795.92 | 22.20 | 3996.76 | 1204.14 | 1815.74 | 79.52 | 112.62 | 1644.02 | 540.24 | 3115.08 | 545.92 | 171.70 |
2003 | 1571.27 | 267.40 | 2976.94 | 836.01 | 797.72 | 98.33 | 24.93 | 1026.11 | 468.59 | 1242.23 | 55.41 | 99.20 |
2004 | 339.18 | 116.74 | 2311.72 | 2019.22 | 942.75 | 55.57 | 72.38 | 399.52 | 251.64 | 834.40 | 55.90 | 97.48 |
2005 | 743.11 | 86.51 | 1789.93 | 1055.41 | 1777.23 | 46.61 | 309.19 | 1472.91 | 1113.26 | 619.77 | 1449.23 | 93.98 |
2006 | 1602.55 | 18.75 | 2981.16 | 995.27 | 1793.97 | 90.31 | 102.44 | 2031.19 | 287.21 | 908.44 | 347.22 | 33.76 |
2007 | 519.85 | 62.74 | 1279.33 | 1615.47 | 320.66 | 102.60 | 85.32 | 317.77 | 150.15 | 942.89 | 178.81 | 164.02 |
2008 | 153.53 | 36.83 | 1189.43 | 1007.57 | 2153.17 | 41.73 | 42.45 | 316.55 | 255.80 | 329.64 | 612.92 | 12.59 |
2009 | 297.80 | 53.25 | 1102.87 | 1479.80 | 533.64 | 35.95 | 320.84 | 881.30 | 580.28 | 777.81 | 272.27 | 42.99 |
2010 | 1225.52 | 46.77 | 1029.03 | 1136.93 | 1780.54 | 38.05 | 146.90 | 814.83 | 487.75 | 379.36 | 722.61 | 88.72 |
2011 | 465.80 | 21.59 | 2555.51 | 1059.94 | 463.27 | 84.51 | 350.04 | 2173.84 | 552.36 | 1016.81 | 1059.68 | 76.77 |
2012 | 828.51 | 92.70 | 156.69 | 421.75 | 1066.21 | 98.85 | 164.73 | 1643.46 | 115.19 | 523.53 | 527.67 | 45.65 |
2013 | 259.37 | 96.91 | 1759.84 | 189.12 | 338.40 | 72.63 | 142.22 | 391.51 | 254.56 | 1131.15 | 376.65 | 343.69 |
2014 | 376.71 | 163.17 | 178.54 | 1314.08 | 233.12 | 96.86 | 42.21 | 978.52 | 179.59 | 281.26 | 119.18 | 244.05 |
2015 | 343.53 | 49.52 | 1654.38 | 860.72 | 149.87 | 201.23 | 109.07 | 243.26 | 338.50 | 551.60 | 342.81 | 84.94 |
2016 | 427.64 | 28.19 | 1530.09 | 367.08 | 1754.74 | 37.56 | 110.11 | 1395.37 | 245.77 | 635.79 | 305.11 | 52.88 |
2017 | 825.07 | 72.88 | 1155.19 | 2110.15 | 2628.17 | 53.72 | 118.32 | 751.90 | 153.14 | 581.67 | 1175.34 | 156.63 |
2018 | 578.42 | 22.84 | 118.69 | 201.54 | 847.21 | 68.03 | 43.35 | 1540.98 | 201.33 | 967.23 | 130.64 | 14.06 |
2019 | 442.48 | 48.34 | 585.14 | 2261.94 | 1282.06 | 25.14 | 48.86 | 1090.67 | 743.01 | 754.07 | 199.39 | 208.95 |
2020 | 509.50 | 25.99 | 1202.38 | 70.34 | 578.53 | 17.71 | 70.86 | 789.14 | 375.76 | 311.24 | 56.21 | 21.65 |
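The pivot leaves a MultiIndex on the columns (value over variable). If we want something closer to the original flat matrix, a sketch: pass values= to pivot and then drop the leftover axis label (mm_wide is our own name):

```python
# pivot on the single 'value' column, then flatten back to a plain frame
mm_wide = mm_melted.pivot(index='Year', columns='variable', values='value').reset_index()
mm_wide.columns.name = None  # drop the leftover 'variable' axis label
mm_wide.head()
```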
!du -h data/*
1.4M    data/jc1.txt
51M     data/jq2.txt
696K    data/mmp.txt
4.1M    data/mmr.txt
4.0K    data/mm.txt
824K    data/mpf.txt
4.0K    data/requirements.txt
!ls -lh data/*
-rw-r--r-- 1 jovyan jovyan 1.4M Sep 28 19:24 data/jc1.txt
-rw-r--r-- 1 jovyan jovyan  51M Sep 28 19:24 data/jq2.txt
-rw-r--r-- 1 jovyan jovyan 695K Sep 28 19:24 data/mmp.txt
-rw-r--r-- 1 jovyan jovyan 4.1M Sep 28 19:24 data/mmr.txt
-rw-r--r-- 1 jovyan jovyan 1.3K Sep 28 19:24 data/mm.txt
-rw-r--r-- 1 jovyan jovyan 822K Sep 28 19:24 data/mpf.txt
-rw-r--r-- 1 jovyan jovyan 2.1K Sep 28 19:24 data/requirements.txt
What category of data is the file below? Any observations about the data?
!head -c 1024 data/jc1.txt
_input,_num,_widgetName,_source,_resultNumber,_pageUrl,game_number,bio1,bio2,bio3,contestant1_name,contestant1_score,contestant2_name,contestant2_score,contestant3_name,contestant3_score
,1,Jeopardy_Winners,Jeopardy_Winners,1,http://www.j-archive.com/showgame.php?game_id=3350,Show #5883 - Wednesday March 24 2010,Derek Honoré an attorney from Inglewood California,Tatiana Walton a graphic designer from Cutler Bay Florida,Regina Robbins an arts teacher from New York New York (whose 1-day cash winnings total $38500),Regina,$19401,Tatiana,$7100,Derek,$11900
,2,Jeopardy_Winners,Jeopardy_Winners,2,http://www.j-archive.com/showgame.php?game_id=4400,Show #6756 - Monday January 20 2014,Jon McGuire a software-development manager from Matthews North Carolina,Blake Perkins an environmental scientist from Baton Rouge Louisiana,Sarah McNitt a study abroad adviser originally from Ann Arbor Michigan (whose 4-day cash winnings total $69199),Sarah,$20199,Blake,$0,Jon,$8380
,3,Jeopardy_Winners,Jeopardy_Winners,3,http://www.j
Exercise: What do you see this time? Category? Interesting features of the data?
!head -c 1024 data/jq2.txt
{"category":"HISTORY","air_date":"2004-12-31","question":"'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'","value":"$200","answer":"Copernicus","round":"Jeopardy!","show_number":"4680"} {"category":"ESPN's TOP 10 ALL-TIME ATHLETES","air_date":"2004-12-31","question":"'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'","value":"$200","answer":"Jim Thorpe","round":"Jeopardy!","show_number":"4680"} {"category":"EVERYBODY TALKS ABOUT IT...","air_date":"2004-12-31","question":"'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year'","value":"$200","answer":"Arizona","round":"Jeopardy!","show_number":"4680"} {"category":"THE COMPANY LINE","air_date":"2004-12-31","question":"'In 1963, live on \"The Art Linkletter Show\", this company served its billionth burger'","value":"$200","answer":"McDonald\\'s","round":"Jeopardy!","show_number":"4680"} {"category":"EPITAPHS & TRIBUTE
Let's explore the first file here:
!head -c 1024 data/jc1.txt
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
jc1 = pd.read_csv('data/jc1.txt', header=0, sep=',')
jc1.head(3)
What are the data types of each column?
jc1.dtypes
Currently, every column is stored as object, because of how pandas inferred the dtypes on read. Let's convert the contestant score columns to numbers.
jc1.columns
for col in jc1.columns:
    if '_score' in col:
        # strip the leading '$' and parse; non-string cells become NaN
        jc1[col] = jc1[col].map(lambda x: float(x.strip('$')) if isinstance(x, str) else np.nan)
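An equivalent, vectorized sketch for the same cleanup: pd.to_numeric with errors='coerce' turns anything unparseable into NaN (the astype('string') step just makes the .str accessor safe regardless of what's in the column):

```python
# vectorized variant: strip '$' with .str ops, coerce failures to NaN
for col in jc1.columns:
    if '_score' in col:
        jc1[col] = pd.to_numeric(jc1[col].astype('string').str.lstrip('$'),
                                 errors='coerce')
```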
What are our dtypes now?
jc1.dtypes
Can we visualize our contestant winnings?
jc1.hist(column='contestant1_score')
jc1.hist(column='contestant2_score')
jc1.hist(column='contestant3_score')
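To compare the three distributions on one set of axes, a sketch using pandas' built-in plotting (same score columns as above):

```python
import matplotlib.pyplot as plt

score_cols = ['contestant1_score', 'contestant2_score', 'contestant3_score']
jc1[score_cols].plot.hist(bins=40, alpha=0.5)  # translucent overlay
plt.xlabel('final score ($)')
plt.show()
```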
This data was scraped with the code in the rainfallscrape folder. You may need to install additional Python packages via pip install -r rainfallscrape/requirements.txt. Let's unbox this data too:
!ls -lh data/m*
-rw-r--r-- 1 jovyan jovyan 695K Sep 28 19:24 data/mmp.txt
-rw-r--r-- 1 jovyan jovyan 4.1M Sep 28 19:24 data/mmr.txt
-rw-r--r-- 1 jovyan jovyan 1.3K Sep 28 19:24 data/mm.txt
-rw-r--r-- 1 jovyan jovyan 822K Sep 28 19:24 data/mpf.txt
All of these files look rectangular. What data model does each follow, structurally?
Analyze the first few lines of each file, in the order below. Remember to adjust the -n (number of lines) argument as needed.
- mm.txt
- mmp.txt
- mmr.txt
- mpf.txt (may need to look at the first 20 lines here)
!head -n 20 data/mpf.txt
2002, 'SOUTHERN OREGON COASTAL'
2002, 'ID', 'Location', 'OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'WY to Date', 'Pct Avg to Date', 'Pct Tot WY'
2002, 'ASHO3', 'ASHLAND', '0.86', '0.49', '2.12', '3.42', '1.38', '1.00', '0.36', '2.30', '1.54', '0.00', '0.00', '0.16', '13.63', '68', '68'
2002, 'CVJO3', 'CAVE JUNCTION', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M'
2002, 'GOLO3', 'GOLD BEACH', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M'
2002, 'GPSO3', 'GRANTS PASS KAJO', '0.61', '1.21', '4.19', '6.31', '0.24', '0.77', '0.58', '2.02', '0.87', '0.00', '0.00', '0.20', '17.00', '55', '55'
2002, 'GSPO3', 'GREEN SPRINGS PP', '0.35', '0.75', '2.44', '4.14', '0.66', 'M', '0.26', '2.59', 'M', 'M', '0.00', '0.20', 'M'
2002, 'LEMO3', 'LEMOLO LAKE', '3.68', '1.81', '5.59', '18.19', '4.34', '4.32', '3.37', '4.05', '2.52', '0.00', '0.02', '2.33', '50.22', '76', '76'
2002, 'MFR', 'MEDFORD', '0.65', '0.24', '2.86', '3.43', '0.51', '0.74', '0.46', '2.50', '1.20', '0.00', '0.00', '0.05', '12.64', '69', '69'
2002, 'OTH', 'NORTH BEND', '2.43', '2.10', '9.16', '7.73', '2.85', '3.71', '2.71', '4.66', '2.00', '0.04', '0.07', '1.37', '38.83', '60', '60'
2002, 'PRSO3', 'PROSPECT 2SW', '2.22', '1.95', '4.12', '9.19', '2.08', '3.27', '1.43', '4.06', '1.80', '0.00', '0.00', '0.77', '30.89', '74', '74'
2002, 'RIDO3', 'RIDDLE', '0.88', '1.73', '4.20', '7.34', '1.35', '1.74', '1.34', '2.16', '0.81', '0.00', '0.00', '0.72', '22.27', '71', '71'
2002, 'ROGO3', 'ROSEBURG', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M'
2002, 'SXT', 'SEXTON SUMMIT', 'M', '0.42', '5.34', '7.06', '0.73', '2.15', '0.78', '4.16', '0.92', '0.00', '0.05', '0.80', 'M'
2002, 'TKFO3', 'TOKETEE FALLS', '2.70', '1.54', '3.55', '10.37', '3.16', '4.31', '2.39', '4.26', '2.41', '0.00', '0.01', '1.54', '36.24', '75', '75'
2002, 'WINO3', 'WINCHESTER DAM', '1.28', '1.77', '4.14', '8.24', '1.60', '2.63', '1.69', '2.75', '1.33', '0.00', '0.00', '1.10', '26.53', '73', '73'
2002, 'WLMO3', 'WILLIAMS', '0.65', '0.85', '4.52', '7.00', '0.27', '1.40', '0.31', '2.70', '0.79', '0.00', '0.01', '0.08', '18.58', '57', '57'
2002, 'OREGON CLOSED BASINS'
2002, 'ID', 'Location', 'OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'WY to Date', 'Pct Avg to Date', 'Pct Tot WY'
2002, 'HMRO3', 'HART MOUNTAIN REF', '0.00', '0.00', 'M', '2.60', '0.08', '1.63', '0.69', '0.97', '2.59', '0.03', '0.02', '0.00', 'M'
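Rather than four separate head calls, a quick sketch to peek at the first couple of lines of each file from Python:

```python
# print the first two lines of each rainfall file
for fname in ['mm.txt', 'mmp.txt', 'mmr.txt', 'mpf.txt']:
    print('===', fname)
    with open('data/' + fname) as f:
        for _ in range(2):
            print(f.readline(), end='')
```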
Let's start from the long (tidy, relational) file:
pd.set_option('display.min_rows', 15) # changes truncated view in pandas
mmr = pd.read_csv('data/mmr.txt')
mmr
  | Year | ID | Location | Station | Month | Inches of Precipitation
---|---|---|---|---|---|---
0 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | OCT | 6.43 |
1 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | NOV | 14.53 |
2 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | DEC | 17.32 |
3 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | JAN | 17.32 |
4 | 2002 | 4BK | BROOKINGS | SOUTHERN OREGON COASTAL | FEB | 9.10 |
... | ... | ... | ... | ... | ... | ... |
78751 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | MAY | 0.07 |
78752 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | JUN | 0.00 |
78753 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | JUL | 0.00 |
78754 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | AUG | 0.11 |
78755 | 2020 | EMRU1 | ENTERPRISE | ESCALANTE DESERT | SEP | 0.00 |
78756 rows × 6 columns
A data analyst asks for just the mean precipitation grouped by year and month, with no location data. How do we do this?
(mmr
 .groupby(['Year', 'Month'])
 .mean(numeric_only=True)  # 'Inches of Precipitation' is the only non-grouping numeric column
 .reset_index()
 .pivot(index='Year', columns='Month')
)
(index: Year; columns: a MultiIndex with Inches of Precipitation on top and Month below)

Year | APR | AUG | DEC | FEB | JAN | JUL | JUN | MAR | MAY | NOV | OCT | SEP
---|---|---|---|---|---|---|---|---|---|---|---|---
2002 | 1.137029 | 0.031988 | 5.759020 | 1.715299 | 2.593914 | 0.113276 | 0.164169 | 2.341909 | 0.778444 | 4.450114 | 0.779886 | 0.245989 |
2003 | 4.328567 | 0.749020 | 8.223591 | 2.309420 | 2.203646 | 0.273139 | 0.069637 | 2.834558 | 1.287335 | 3.441080 | 0.152645 | 0.274792 |
2004 | 0.952753 | 0.348478 | 6.403657 | 5.577956 | 2.604282 | 0.167885 | 0.209191 | 1.106704 | 0.710847 | 2.330726 | 0.153994 | 0.303676 |
2005 | 2.147717 | 0.269502 | 5.158300 | 3.068052 | 5.242566 | 0.147035 | 0.954290 | 4.220372 | 3.293669 | 1.786081 | 4.117131 | 0.339278 |
2006 | 4.769494 | 0.063776 | 8.898985 | 2.988799 | 5.387297 | 0.286698 | 0.317152 | 6.027270 | 0.891957 | 2.744532 | 1.036478 | 0.108553 |
2007 | 1.589755 | 0.220915 | 3.888541 | 4.986019 | 0.974650 | 0.337500 | 0.272588 | 0.989938 | 0.464861 | 2.874665 | 0.546820 | 0.598613 |
2008 | 0.498474 | 0.128776 | 3.812276 | 3.281987 | 6.945710 | 0.144394 | 0.147396 | 1.021129 | 0.825161 | 1.039874 | 1.939620 | 0.048238 |
2009 | 0.954487 | 0.192935 | 3.569159 | 4.758199 | 1.710385 | 0.125699 | 1.091293 | 2.833762 | 1.842159 | 2.618889 | 0.895625 | 0.155199 |
2010 | 3.991922 | 0.175827 | 3.418704 | 3.679385 | 5.818758 | 0.137862 | 0.488040 | 2.636990 | 1.583604 | 1.223742 | 2.369213 | 0.342548 |
2011 | 1.589761 | 0.078509 | 8.518367 | 3.521395 | 1.544233 | 0.320114 | 1.291661 | 7.198146 | 2.008582 | 3.378106 | 3.544080 | 0.290795 |
2012 | 2.770936 | 0.337091 | 0.534778 | 1.429661 | 3.638942 | 0.350532 | 0.571979 | 5.496522 | 0.389155 | 1.792911 | 1.813299 | 0.172264 |
2013 | 0.936354 | 0.372731 | 6.666061 | 0.675429 | 1.230545 | 0.269000 | 0.509749 | 1.403262 | 0.922319 | 4.098370 | 1.359747 | 1.249782 |
2014 | 1.400409 | 0.608843 | 0.637643 | 4.676441 | 0.838561 | 0.365509 | 0.157500 | 3.519856 | 0.653055 | 1.026496 | 0.419648 | 0.924432 |
2015 | 1.281828 | 0.194196 | 6.060000 | 3.118551 | 0.546971 | 0.828107 | 0.432817 | 0.875036 | 1.398760 | 2.105344 | 1.251131 | 0.359915 |
2016 | 1.696984 | 0.120987 | 5.709291 | 1.450909 | 6.572060 | 0.155851 | 0.460711 | 5.603896 | 1.054807 | 2.399208 | 1.191836 | 0.216721 |
2017 | 3.496059 | 0.290359 | 4.936709 | 8.755809 | 10.512680 | 0.206615 | 0.475181 | 3.186017 | 0.595875 | 2.540044 | 4.917741 | 0.611836 |
2018 | 2.370574 | 0.098026 | 0.474760 | 0.790353 | 3.361944 | 0.276545 | 0.173400 | 6.066850 | 0.821755 | 3.838214 | 0.518413 | 0.056466 |
2019 | 1.762869 | 0.198930 | 2.589115 | 9.120726 | 5.341917 | 0.103884 | 0.191608 | 4.260430 | 3.057654 | 3.168361 | 0.830792 | 0.949773 |
2020 | 2.079592 | 0.103135 | 4.790359 | 0.278024 | 2.332782 | 0.071124 | 0.291605 | 3.194899 | 1.521296 | 1.383289 | 0.233237 | 0.087298 |
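As an aside, pivot_table collapses the groupby + pivot into a single call; a sketch of the same computation (the result carries the same numbers, with a flat column index):

```python
# group by Year/Month, aggregate with the mean, and pivot in one step
mmr.pivot_table(index='Year', columns='Month',
                values='Inches of Precipitation', aggfunc='mean')
```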
## we'll use the Lahman baseball database in our examples today.
## replace the database connection with a database of your own!
%reload_ext sql
%sql postgresql://localhost:5432/baseball
%config SqlMagic.displaylimit = 30
%%sql
WITH yearnum AS
(SELECT yearid, (yearid % 100) as year
FROM batting
)
SELECT yearid, CONCAT('''', LPAD(year::text, 2, '0')) as year
FROM yearnum
LIMIT 5;
yearid | year |
---|---|
2004 | '04 |
2007 | '07 |
2009 | '09 |
2010 | '10 |
2012 | '12 |
Let's analyze the below query (we've flattened it for convenience):
%%sql
EXPLAIN (VERBOSE true)
SELECT yearid,
CONCAT('''', LPAD((yearid % 100)::text, 2, '0')) AS year
FROM batting;
QUERY PLAN |
---|
Seq Scan on public.batting (cost=0.00..3922.29 rows=104324 width=36) |
Output: yearid, concat('''', lpad(((yearid % 100))::text, 2, '0'::text)) |
What if scalar functions mention multiple tables?
The query below computes an arbitrary statistic for pitchers.
If the notebook-style output is hard to read, try the query in psql. Note that notebooks don't preserve whitespace when displaying dataframes.
%%sql
EXPLAIN (VERBOSE true)
SELECT p.playerid, p.so - b.so
FROM pitching p
INNER JOIN batting b
ON p.playerid=b.playerid;
QUERY PLAN |
---|
Nested Loop (cost=0.43..13004.27 rows=339358 width=13) |
Output: p.playerid, (p.so - b.so) |
-> Seq Scan on public.pitching p (cost=0.00..1374.06 rows=45806 width=13) |
Output: p.playerid, p.yearid, p.stint, p.teamid, p.lgid, p.w, p.l, p.g, p.gs, p.cg, p.sho, p.sv, p.ipouts, p.h, p.er, p.hr, p.bb, p.so, p.baopp, p.era, p.ibb, p.wp, p.hbp, p.bk, p.bfp, p.gf, p.r, p.sh, p.sf, p.gidp |
-> Memoize (cost=0.43..0.73 rows=7 width=13) |
Output: b.so, b.playerid |
Cache Key: p.playerid |
Cache Mode: logical |
-> Index Scan using batting_pkey on public.batting b (cost=0.42..0.72 rows=7 width=13) |
Output: b.so, b.playerid |
Index Cond: ((b.playerid)::text = (p.playerid)::text) |
This data was scraped with the code in the rainfallscrape folder. You may need to install additional Python packages via pip install -r rainfallscrape/requirements.txt. We'll assume you've already run simple_scrape.py.
!head -c 4096 data/mpf.txt
Messy! You can play in bash or pandas if you like. Let's clean this up a bit. First, we note that the structure of the file is off: it's hard to read multiple dataframes from a single CSV like this. So let's split it into one file per segment, then read the pieces back and concatenate them into a single dataframe.
curr_file = None
with open('data/mpf.txt') as f:
    for line in f.readlines():
        splits = line.split(', ')
        # segment-header lines have exactly two comma-separated pieces:
        # the year and the region name
        if len(splits) == 2:
            print(splits[1])
Okay, first let's create some files, one for each segment, and rename the segments into something more conventional.
curr_file = None
with open('data/mpf.txt') as f:  # was 'lec12_data/mpf.txt'; our data lives in data/
    for num, line in enumerate(f.readlines()):
        splits = line.split(', ')
        if len(splits) == 2:
            # normalize the region name: strip quotes/whitespace,
            # then make it filesystem-friendly
            print(splits[1].strip(" '\n").replace('...', '-').replace(' ', '_'))
That's probably more of what we want! Now, let's actually write our new files...
import os
os.makedirs('data/subfiles', exist_ok=True)  # the target folder must exist

curr_file = None
with open('data/mpf.txt') as master_file:
    for line in master_file.readlines():
        splits = line.split(', ')
        if len(splits) == 2:
            # a new segment starts: close the previous file, open a new one
            if curr_file is not None:
                curr_file.close()
            filename = splits[0] + '_' + splits[1].strip(" '\n")
            filename = filename.replace('...', '-')
            filename = filename.replace(' ', '_')
            curr_file = open('data/subfiles/' + filename + '.txt', 'w')
        curr_file.write(line)
curr_file.close()
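A quick sanity check (sketch): confirm the per-segment files landed in data/subfiles/ with the names we expect:

```python
import os
sorted(os.listdir('data/subfiles'))[:5]  # peek at the first few filenames
```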
Now, let's read these files into a dataframe...and we'll need to do a lot of preprocessing!
import os

region_dfs = []
parent_dir = os.getcwd() + '/data/subfiles/'
for filename in os.listdir(parent_dir):
    # read from the first header, not the second!
    region_df = pd.read_csv(parent_dir + filename, header=1)
    # BE CAREFUL! Let's rename our schema: right now the first column is
    # labeled with the year itself (e.g. 2007), not 'Year'.
    region_df = region_df.rename(columns={filename.split('_')[0]: 'Year'})
    region_dfs.append(region_df)
mpf = pd.concat(region_dfs)  # DataFrame.append was removed in pandas 2.0
mpf = mpf.sort_values('Year')
mpf
Now, let's figure out what our data looks like in Pandas.
mpf
mpf.dtypes
Let's rename our columns!
col_set = {}
for column in mpf.columns:
if column != 'Year':
col_set.update({column: column.strip(" '")})
mpf = mpf.rename(columns=col_set)
mpf
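For reference, the same rename as a one-liner, passing a callable instead of building the dict (a sketch; rename applies the function to every column label):

```python
# strip quotes/spaces from every column name except 'Year'
mpf = mpf.rename(columns=lambda c: c if c == 'Year' else c.strip(" '"))
```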
# Let's remove the Ms and replace them with NaNs, and strip the quotes
import math

def _process_data(x):
    if isinstance(x, float):
        return x
    if not isinstance(x, str) and math.isnan(x):
        return np.nan
    x = x.strip("' ")
    if x == 'M' or x == 'NA':
        return np.nan
    else:
        return float(x)

mpf = mpf[mpf['Year'] == 2002]  # filter to the 2002 data here
for col in mpf.columns:
    if col in ["ID", "Location"]:
        print(col)
        mpf[col] = mpf[col].map(lambda x: str(x).strip("'"))
    elif col != 'Year':
        mpf[col] = mpf[col].map(_process_data)
mpf
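A final sanity check (sketch): count how many missing values the 'M'/'NA' replacement introduced per column:

```python
# each 'M' or 'NA' cell should now show up as a NaN
mpf.isna().sum()
```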