import numpy as np
import pandas as pd
%reload_ext sql
%sql postgresql://127.0.0.1:5432/postgres
%config SqlMagic.displaylimit = None
%%sql
EXPLAIN ANALYZE
SELECT *
FROM actor, cast_info, movie
WHERE actor.id = cast_info.person_id
AND movie.id = cast_info.movie_id
LIMIT 10;
QUERY PLAN
Limit  (cost=0.15..18.45 rows=2 width=80) (actual time=0.026..0.031 rows=2 loops=1)
  ->  Nested Loop  (cost=0.15..18.45 rows=2 width=80) (actual time=0.025..0.029 rows=2 loops=1)
        ->  Nested Loop  (cost=0.00..2.09 rows=2 width=44) (actual time=0.017..0.019 rows=2 loops=1)
              Join Filter: (actor.id = cast_info.person_id)
              Rows Removed by Join Filter: 1
              ->  Seq Scan on cast_info  (cost=0.00..1.02 rows=2 width=8) (actual time=0.008..0.009 rows=2 loops=1)
              ->  Materialize  (cost=0.00..1.03 rows=2 width=36) (actual time=0.003..0.003 rows=2 loops=2)
                    ->  Seq Scan on actor  (cost=0.00..1.02 rows=2 width=36) (actual time=0.002..0.002 rows=2 loops=1)
        ->  Index Scan using movie_pkey on movie  (cost=0.15..8.17 rows=1 width=36) (actual time=0.004..0.004 rows=1 loops=2)
              Index Cond: (id = cast_info.movie_id)
Planning Time: 0.477 ms
Execution Time: 0.066 ms
%sql CREATE INDEX actoridindex ON actor(id);
%%sql
EXPLAIN ANALYZE
SELECT *
FROM actor, cast_info, movie
WHERE actor.id = cast_info.person_id
AND movie.id = cast_info.movie_id
LIMIT 10;
QUERY PLAN
Limit  (cost=0.15..18.45 rows=2 width=80) (actual time=0.024..0.031 rows=2 loops=1)
  ->  Nested Loop  (cost=0.15..18.45 rows=2 width=80) (actual time=0.023..0.029 rows=2 loops=1)
        ->  Nested Loop  (cost=0.00..2.09 rows=2 width=44) (actual time=0.015..0.018 rows=2 loops=1)
              Join Filter: (actor.id = cast_info.person_id)
              Rows Removed by Join Filter: 1
              ->  Seq Scan on cast_info  (cost=0.00..1.02 rows=2 width=8) (actual time=0.007..0.008 rows=2 loops=1)
              ->  Materialize  (cost=0.00..1.03 rows=2 width=36) (actual time=0.002..0.002 rows=2 loops=2)
                    ->  Seq Scan on actor  (cost=0.00..1.02 rows=2 width=36) (actual time=0.002..0.002 rows=2 loops=1)
        ->  Index Scan using movie_pkey on movie  (cost=0.15..8.17 rows=1 width=36) (actual time=0.004..0.004 rows=1 loops=2)
              Index Cond: (id = cast_info.movie_id)
Planning Time: 0.369 ms
Execution Time: 0.067 ms
%sql CREATE INDEX movieid_castinfoindex ON cast_info(movie_id);
%%sql
EXPLAIN ANALYZE
SELECT *
FROM actor, cast_info, movie
WHERE actor.id = cast_info.person_id
AND movie.id = cast_info.movie_id
LIMIT 10;
QUERY PLAN
Limit  (cost=0.15..18.45 rows=2 width=80) (actual time=0.022..0.029 rows=2 loops=1)
  ->  Nested Loop  (cost=0.15..18.45 rows=2 width=80) (actual time=0.021..0.027 rows=2 loops=1)
        ->  Nested Loop  (cost=0.00..2.09 rows=2 width=44) (actual time=0.014..0.017 rows=2 loops=1)
              Join Filter: (actor.id = cast_info.person_id)
              Rows Removed by Join Filter: 1
              ->  Seq Scan on cast_info  (cost=0.00..1.02 rows=2 width=8) (actual time=0.005..0.006 rows=2 loops=1)
              ->  Materialize  (cost=0.00..1.03 rows=2 width=36) (actual time=0.002..0.003 rows=2 loops=2)
                    ->  Seq Scan on actor  (cost=0.00..1.02 rows=2 width=36) (actual time=0.003..0.003 rows=2 loops=1)
        ->  Index Scan using movie_pkey on movie  (cost=0.15..8.17 rows=1 width=36) (actual time=0.003..0.003 rows=1 loops=2)
              Index Cond: (id = cast_info.movie_id)
Planning Time: 0.397 ms
Execution Time: 0.059 ms
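Notice that all three plans are identical: with only a couple of rows in each table, the planner estimates that sequential scans are cheaper than using the new indexes, so they are never touched. As a sketch, you can nudge the planner into showing that it *would* use the indexes by discouraging sequential scans for the session (enable_seqscan is a standard PostgreSQL setting):

```python
%%sql
-- Sketch: discourage seq scans so the planner reaches for the indexes,
-- then re-check the plan (plan output omitted here).
SET enable_seqscan = off;
EXPLAIN ANALYZE
SELECT *
FROM actor, cast_info, movie
WHERE actor.id = cast_info.person_id
AND movie.id = cast_info.movie_id
LIMIT 10;
-- Restore the default afterwards.
SET enable_seqscan = on;
```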
%sql DROP INDEX actoridindex;
%sql DROP INDEX movieid_castinfoindex;
To start, let's take our matrix in mm.txt and load it into Pandas.
mm = pd.read_csv('data/mm.txt', header=0)
mm
Year | OCT | NOV | DEC | JAN | FEB | MAR | APR | MAY | JUN | JUL | AUG | SEP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2002 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
1 | 2003 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
2 | 2004 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
3 | 2005 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
4 | 2006 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
5 | 2007 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
6 | 2008 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
7 | 2009 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
8 | 2010 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
9 | 2011 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
10 | 2012 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
11 | 2013 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
12 | 2014 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
13 | 2015 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
14 | 2016 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
15 | 2017 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
16 | 2018 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
17 | 2019 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
18 | 2020 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
mm_melted = mm.melt(id_vars=['Year'])
mm_melted
Year | variable | value | |
---|---|---|---|
0 | 2002 | OCT | 0.0 |
1 | 2003 | OCT | 0.0 |
2 | 2004 | OCT | 0.0 |
3 | 2005 | OCT | 0.0 |
4 | 2006 | OCT | 0.0 |
... | ... | ... | ... |
223 | 2016 | SEP | 0.0 |
224 | 2017 | SEP | 0.0 |
225 | 2018 | SEP | 0.0 |
226 | 2019 | SEP | 0.0 |
227 | 2020 | SEP | 0.0 |
228 rows × 3 columns
Thanks to the id_vars parameter, the Year column is named and repeated for all other (variable=column name, value=value) elements in the row.
mm_melted[mm_melted['Year'] == 2002]
Year | variable | value | |
---|---|---|---|
0 | 2002 | OCT | 0.00 |
19 | 2002 | NOV | 6.03 |
38 | 2002 | DEC | 7.18 |
57 | 2002 | JAN | 0.82 |
76 | 2002 | FEB | 2.01 |
95 | 2002 | MAR | 9.96 |
114 | 2002 | APR | 4.74 |
133 | 2002 | MAY | 0.78 |
152 | 2002 | JUN | 0.13 |
171 | 2002 | JUL | 0.00 |
190 | 2002 | AUG | 0.09 |
209 | 2002 | SEP | 0.00 |
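As an aside, melt's var_name and value_name parameters let you name the unpivoted columns something friendlier than the default variable/value. A minimal sketch:

```python
# Sketch: name the unpivoted columns at melt time instead of accepting
# the default 'variable'/'value' labels.
mm.melt(id_vars=['Year'], var_name='Month', value_name='Precip').head()
```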
#mm_melted.pivot(index='variable', columns='Year')
mm_melted.pivot(index='Year', columns='variable')
value | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | APR | AUG | DEC | FEB | JAN | JUL | JUN | MAR | MAY | NOV | OCT | SEP |
Year | ||||||||||||
2002 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2003 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2004 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2005 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2006 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2007 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2008 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2009 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2010 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2011 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2012 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2013 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2014 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2015 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2016 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2017 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2018 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2019 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2020 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
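One quirk: pivot sorts the new column labels alphabetically, which scrambles the water-year month order. A sketch that restores the order with reindex:

```python
# Sketch: put the months back in water-year order after pivoting.
month_order = ['OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR',
               'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP']
mm_melted.pivot(index='Year', columns='variable') \
         .reindex(columns=month_order, level='variable')
```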
Let's go back to mmp.txt.
mmp = pd.read_csv('data/mmp.txt', header=0)
mmp
Year | Location | Station Name | OCT | NOV | DEC | JAN | FEB | MAR | APR | MAY | JUN | JUL | AUG | SEP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2002 | ASHLAND | SOUTHERN OREGON COASTAL | 0.86 | 0.49 | 2.12 | 3.42 | 1.38 | 1.00 | 0.36 | 2.30 | 1.54 | 0.00 | 0.00 | 0.16 |
1 | 2002 | CAVE JUNCTION | SOUTHERN OREGON COASTAL | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2002 | GOLD BEACH | SOUTHERN OREGON COASTAL | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2002 | GRANTS PASS KAJO | SOUTHERN OREGON COASTAL | 0.61 | 1.21 | 4.19 | 6.31 | 0.24 | 0.77 | 0.58 | 2.02 | 0.87 | 0.00 | 0.00 | 0.20 |
4 | 2002 | GREEN SPRINGS PP | SOUTHERN OREGON COASTAL | 0.35 | 0.75 | 2.44 | 4.14 | 0.66 | NaN | 0.26 | 2.59 | NaN | NaN | 0.00 | 0.20 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5695 | 2020 | WENDOVER | GREAT SALT LAKE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5696 | 2020 | GREAT BASIN N P | GREAT SALT LAKE | 0.13 | 2.52 | 0.84 | 0.20 | NaN | 2.94 | 0.97 | 0.07 | 0.44 | 0.43 | 0.00 | 0.02 |
5697 | 2020 | MONTELLO | GREAT SALT LAKE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5698 | 2020 | CEDAR CITY 5E | ESCALANTE DESERT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5699 | 2020 | ENTERPRISE | ESCALANTE DESERT | 0.00 | 3.51 | 1.33 | 0.20 | 0.97 | 3.65 | 1.30 | 0.07 | 0.00 | 0.00 | 0.11 | 0.00 |
5700 rows × 15 columns
# Unpivot
mmp_melted = mmp.melt(id_vars=['Location', 'Station Name', 'Year'])
mmp_melted
Location | Station Name | Year | variable | value | |
---|---|---|---|---|---|
0 | ASHLAND | SOUTHERN OREGON COASTAL | 2002 | OCT | 0.86 |
1 | CAVE JUNCTION | SOUTHERN OREGON COASTAL | 2002 | OCT | NaN |
2 | GOLD BEACH | SOUTHERN OREGON COASTAL | 2002 | OCT | NaN |
3 | GRANTS PASS KAJO | SOUTHERN OREGON COASTAL | 2002 | OCT | 0.61 |
4 | GREEN SPRINGS PP | SOUTHERN OREGON COASTAL | 2002 | OCT | 0.35 |
... | ... | ... | ... | ... | ... |
68395 | WENDOVER | GREAT SALT LAKE | 2020 | SEP | NaN |
68396 | GREAT BASIN N P | GREAT SALT LAKE | 2020 | SEP | 0.02 |
68397 | MONTELLO | GREAT SALT LAKE | 2020 | SEP | NaN |
68398 | CEDAR CITY 5E | ESCALANTE DESERT | 2020 | SEP | NaN |
68399 | ENTERPRISE | ESCALANTE DESERT | 2020 | SEP | 0.00 |
68400 rows × 5 columns
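Note that the NaN readings from the wide table survive the melt. If only reported measurements matter, a sketch that drops them after unpivoting:

```python
# Sketch: keep only rows with an actual reading.
mmp_reported = mmp_melted.dropna(subset=['value'])
len(mmp_reported)   # fewer than 68400 rows
```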
# Repivot the unpivot
# mmp_melted.pivot(index='Year', columns='variable')
mmp_tt = mmp_melted.pivot(index=['Location', 'Station Name', 'Year'], columns='variable')
mmp_tt
value | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | APR | AUG | DEC | FEB | JAN | JUL | JUN | MAR | MAY | NOV | OCT | SEP | ||
Location | Station Name | Year | ||||||||||||
"LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2002 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
2003 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 | ||
2004 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 | ||
2005 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 | ||
2006 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 | ||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
YREKA | LOWER KLAMATH | 2016 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
2017 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 | ||
2018 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 | ||
2019 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 | ||
2020 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5700 rows × 12 columns
mmp_tt.reset_index()
Location | Station Name | Year | value | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | APR | AUG | DEC | FEB | JAN | JUL | JUN | MAR | MAY | NOV | OCT | SEP | |||
0 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2002 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
1 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2003 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
2 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2004 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
3 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2005 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
4 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2006 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5695 | YREKA | LOWER KLAMATH | 2016 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5696 | YREKA | LOWER KLAMATH | 2017 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5697 | YREKA | LOWER KLAMATH | 2018 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5698 | YREKA | LOWER KLAMATH | 2019 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5699 | YREKA | LOWER KLAMATH | 2020 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5700 rows × 15 columns
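The reset frame above still carries a two-level column header ('value' over each month name). A sketch that flattens it back to plain month columns:

```python
# Sketch: drop the outer 'value' level so columns are single-level again.
mmp_flat = mmp_tt.droplevel(0, axis=1).reset_index()
mmp_flat.head()
```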
%reload_ext sql
%sql postgresql://127.0.0.1:5432/postgres
import pandas as pd
%%sql
drop table if exists blue;
drop table if exists red;
create table blue (last text, first text);
create table red (last text, first text);
insert into blue values ('Wang', 'Daisy');
insert into blue values ('Wang', 'Daisy');
insert into blue values ('Wang', 'Xin');
insert into red values ('Wang', 'Daisy');
insert into red values ('Wang', 'Xin');
insert into red values ('Wang', 'Xin');
select * from blue;
last | first |
---|---|
Wang | Daisy |
Wang | Daisy |
Wang | Xin |
%sql select * from red;
last | first |
---|---|
Wang | Daisy |
Wang | Xin |
Wang | Xin |
%%sql
drop table if exists bluem;
create table bluem as
select *, count(*) as multiplicity
from blue
group by last, first;
select * from bluem;
last | first | multiplicity |
---|---|---|
Wang | Xin | 1 |
Wang | Daisy | 2 |
%%sql
drop table if exists redm;
create table redm as
select *, count(*) as multiplicity
from red
group by last, first;
select * from redm;
last | first | multiplicity |
---|---|---|
Wang | Xin | 2 |
Wang | Daisy | 1 |
This works exactly the same in both cases; there's nothing special here. Applying a WHERE filter to a counted set always yields a set back, because you're only removing rows from a set. By definition, this cannot create an entity that is not a set.
%%sql
-- sigma on multiset
select * from blue
where first = 'Daisy';
last | first |
---|---|
Wang | Daisy |
Wang | Daisy |
%%sql
-- sigma on counted set
select * from bluem where first = 'Daisy';
last | first | multiplicity |
---|---|---|
Wang | Daisy | 2 |
We might want to be a bit careful here. What defines a set uniquely is its key, and in this case the key is the combination of (last, first). Having just last or just first is not enough to uniquely identify a row.
%%sql
-- pi on multiset
select last from blue;
last |
---|
Wang |
Wang |
Wang |
In fact, you can see that if you simply selected last from a counted set, you'd get a multiset as your output.
%%sql
select last from bluem;
last |
---|
Wang |
Wang |
To convert this to a counted set again, you need to sum up the multiplicities of the tuples that the last field came from.
%%sql
-- pi on counted set
select last, SUM(multiplicity) from bluem group by last;
last | sum |
---|---|
Wang | 3 |
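As a sanity check (a sketch), computing the same projection directly on the original multiset should agree with summing the multiplicities:

```python
%%sql
-- Sketch: the same pi computed on the multiset itself.
-- Expected: Wang | 3, matching SUM(multiplicity) above.
select last, count(*) from blue group by last;
```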
%%sql
-- x on multiset
select * from blue, red;
last | first | last_1 | first_1 |
---|---|---|---|
Wang | Daisy | Wang | Daisy |
Wang | Daisy | Wang | Xin |
Wang | Daisy | Wang | Xin |
Wang | Daisy | Wang | Daisy |
Wang | Daisy | Wang | Xin |
Wang | Daisy | Wang | Xin |
Wang | Xin | Wang | Daisy |
Wang | Xin | Wang | Xin |
Wang | Xin | Wang | Xin |
Next, convert the output of a multiset cross-product to a counted set as we did before. This is our desired result:
%%sql
-- convert multiset x to counted set
with cte(blast, bfirst, rlast, rfirst)
as (select * from blue, red)
select *, count(*)
from cte
group by blast, bfirst, rlast, rfirst;
blast | bfirst | rlast | rfirst | count |
---|---|---|---|---|
Wang | Daisy | Wang | Daisy | 2 |
Wang | Xin | Wang | Xin | 2 |
Wang | Daisy | Wang | Xin | 4 |
Wang | Xin | Wang | Daisy | 1 |
Now, what went on in the arithmetic here? We can think this through by pushing the arithmetic into the query!
First, what do you get with a naive cross-product of counted sets? You get the names from each table, along with the number of times each name showed up in its respective table. So, for example, ('Wang', 'Xin') showed up once in blue and twice in red.
%%sql
select * from bluem, redm;
last | first | multiplicity | last_1 | first_1 | multiplicity_1 |
---|---|---|---|---|---|
Wang | Xin | 1 | Wang | Xin | 2 |
Wang | Xin | 1 | Wang | Daisy | 1 |
Wang | Daisy | 2 | Wang | Xin | 2 |
Wang | Daisy | 2 | Wang | Daisy | 1 |
What does each row tell us individually? Each row tells us the number of times that the name from the left must be matched with the name from the right in the original cross product between blue and red. So if you multiply the multiplicities together, you'll get the number of instances of each ordered pair of names in the final cross product.
%%sql
-- fix multiplicity per row
select b.last, b.first, r.last, r.first,
       b.multiplicity * r.multiplicity as multiplicity
from bluem b, redm r;
last | first | last_1 | first_1 | multiplicity |
---|---|---|---|---|
Wang | Xin | Wang | Xin | 2 |
Wang | Xin | Wang | Daisy | 1 |
Wang | Daisy | Wang | Xin | 4 |
Wang | Daisy | Wang | Daisy | 2 |
Notice that these per-pair counts match the grouped CTE result above exactly. If we simply wanted to drop duplicates instead of tracking how many there were (the point of a counted set), our life would have been a lot easier...
%%sql
select distinct b.last, b.first, r.last, r.first
from blue b, red r;
last | first | last_1 | first_1 |
---|---|---|---|
Wang | Daisy | Wang | Daisy |
Wang | Xin | Wang | Xin |
Wang | Daisy | Wang | Xin |
Wang | Xin | Wang | Daisy |
%reload_ext sql
%sql postgresql://127.0.0.1:5432/postgres
import pandas as pd
%%sql
drop table if exists example;
create table example(name text, age integer, gpa float);
insert into example values
('Patty Perfect', 22, 4.0),
('Sameer Soclose', 20, 3.99),
('Jacob Excellent', 21, 3.93);
df = %sql select * from example;
df = df.DataFrame()
df
name | age | gpa | |
---|---|---|---|
0 | Patty Perfect | 22 | 4.00 |
1 | Sameer Soclose | 20 | 3.99 |
2 | Jacob Excellent | 21 | 3.93 |
df.dtypes
name     object
age       int64
gpa     float64
dtype: object
dft = df.transpose()
dft
0 | 1 | 2 | |
---|---|---|---|
name | Patty Perfect | Sameer Soclose | Jacob Excellent |
age | 22 | 20 | 21 |
gpa | 4.0 | 3.99 | 3.93 |
dft.dtypes
0    object
1    object
2    object
dtype: object

Every column is now object: transposing mixed a string and numbers into each column, so pandas falls back to the generic object dtype.
df2 = df.transpose().transpose()
df2
name | age | gpa | |
---|---|---|---|
0 | Patty Perfect | 22 | 4.0 |
1 | Sameer Soclose | 20 | 3.99 |
2 | Jacob Excellent | 21 | 3.93 |
df2.dtypes
name    object
age     object
gpa     object
dtype: object

Even after transposing back, pandas does not re-infer the numeric dtypes on its own; everything stays object.
df2['age'] = df2['age'].astype(int)
df2['gpa'] = df2['gpa'].astype(float)
df2.dtypes
name     object
age       int64
gpa     float64
dtype: object
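Casting column-by-column works; as an alternative sketch, infer_objects() asks pandas to re-infer better dtypes in one call after the round trip:

```python
# Sketch: let pandas re-infer dtypes after the double transpose
# instead of casting each column by hand.
df.transpose().transpose().infer_objects().dtypes
```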
mat = np.array([[1, 0, 0, 1, 20000],
[0, 1, 0, 2, 10011],
[0, 0, 1, 3, 50000],
[0, 0, 1, 3, 10000]])
mat
array([[    1,     0,     0,     1, 20000],
       [    0,     1,     0,     2, 10011],
       [    0,     0,     1,     3, 50000],
       [    0,     0,     1,     3, 10000]])
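A NumPy array carries a single dtype for every cell, which is why the company names had to be pre-encoded as integers here; mixing the strings in would coerce the whole array to object. A quick check:

```python
# The whole matrix shares one dtype (likely int64 on most platforms).
mat.dtype
```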
df = pd.DataFrame({'CompanyName': ['VW', 'Acura', 'Hona', 'Honda'],
'Categorical Value': [1, 2, 3, 3],
'Price': [20000, 10011, 50000, 10000]})
df
CompanyName | Categorical Value | Price | |
---|---|---|---|
0 | VW | 1 | 20000 |
1 | Acura | 2 | 10011 |
2 | Hona | 3 | 50000 |
3 | Honda | 3 | 10000 |
df.dtypes
CompanyName          object
Categorical Value     int64
Price                 int64
dtype: object
# %sql --persist df
ValueError: Table 'df' already exists. Consider using --persist-replace to drop the table before persisting the data frame.
If you need help solving this issue, send us a message: https://ploomber.io/community
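As the error message itself suggests, JupySQL's --persist-replace variant drops the existing table before persisting the DataFrame:

```python
%sql --persist-replace df
```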