import numpy as np
import pandas as pd
%reload_ext sql
%sql postgresql://127.0.0.1:5432/postgres
%config SqlMagic.displaylimit = None
%%sql
EXPLAIN ANALYZE
SELECT *
FROM actor, cast_info, movie
WHERE actor.id = cast_info.person_id
AND movie.id = cast_info.movie_id
LIMIT 10;
QUERY PLAN
Limit  (cost=0.15..18.45 rows=2 width=80) (actual time=0.026..0.031 rows=2 loops=1)
  ->  Nested Loop  (cost=0.15..18.45 rows=2 width=80) (actual time=0.025..0.029 rows=2 loops=1)
        ->  Nested Loop  (cost=0.00..2.09 rows=2 width=44) (actual time=0.017..0.019 rows=2 loops=1)
              Join Filter: (actor.id = cast_info.person_id)
              Rows Removed by Join Filter: 1
              ->  Seq Scan on cast_info  (cost=0.00..1.02 rows=2 width=8) (actual time=0.008..0.009 rows=2 loops=1)
              ->  Materialize  (cost=0.00..1.03 rows=2 width=36) (actual time=0.003..0.003 rows=2 loops=2)
                    ->  Seq Scan on actor  (cost=0.00..1.02 rows=2 width=36) (actual time=0.002..0.002 rows=2 loops=1)
        ->  Index Scan using movie_pkey on movie  (cost=0.15..8.17 rows=1 width=36) (actual time=0.004..0.004 rows=1 loops=2)
              Index Cond: (id = cast_info.movie_id)
Planning Time: 0.477 ms
Execution Time: 0.066 ms
%sql CREATE INDEX actoridindex ON actor(id);
%%sql
EXPLAIN ANALYZE
SELECT *
FROM actor, cast_info, movie
WHERE actor.id = cast_info.person_id
AND movie.id = cast_info.movie_id
LIMIT 10;
QUERY PLAN
Limit  (cost=0.15..18.45 rows=2 width=80) (actual time=0.024..0.031 rows=2 loops=1)
  ->  Nested Loop  (cost=0.15..18.45 rows=2 width=80) (actual time=0.023..0.029 rows=2 loops=1)
        ->  Nested Loop  (cost=0.00..2.09 rows=2 width=44) (actual time=0.015..0.018 rows=2 loops=1)
              Join Filter: (actor.id = cast_info.person_id)
              Rows Removed by Join Filter: 1
              ->  Seq Scan on cast_info  (cost=0.00..1.02 rows=2 width=8) (actual time=0.007..0.008 rows=2 loops=1)
              ->  Materialize  (cost=0.00..1.03 rows=2 width=36) (actual time=0.002..0.002 rows=2 loops=2)
                    ->  Seq Scan on actor  (cost=0.00..1.02 rows=2 width=36) (actual time=0.002..0.002 rows=2 loops=1)
        ->  Index Scan using movie_pkey on movie  (cost=0.15..8.17 rows=1 width=36) (actual time=0.004..0.004 rows=1 loops=2)
              Index Cond: (id = cast_info.movie_id)
Planning Time: 0.369 ms
Execution Time: 0.067 ms
%sql CREATE INDEX movieid_castinfoindex ON cast_info(movie_id);
%%sql
EXPLAIN ANALYZE
SELECT *
FROM actor, cast_info, movie
WHERE actor.id = cast_info.person_id
AND movie.id = cast_info.movie_id
LIMIT 10;
QUERY PLAN
Limit  (cost=0.15..18.45 rows=2 width=80) (actual time=0.022..0.029 rows=2 loops=1)
  ->  Nested Loop  (cost=0.15..18.45 rows=2 width=80) (actual time=0.021..0.027 rows=2 loops=1)
        ->  Nested Loop  (cost=0.00..2.09 rows=2 width=44) (actual time=0.014..0.017 rows=2 loops=1)
              Join Filter: (actor.id = cast_info.person_id)
              Rows Removed by Join Filter: 1
              ->  Seq Scan on cast_info  (cost=0.00..1.02 rows=2 width=8) (actual time=0.005..0.006 rows=2 loops=1)
              ->  Materialize  (cost=0.00..1.03 rows=2 width=36) (actual time=0.002..0.003 rows=2 loops=2)
                    ->  Seq Scan on actor  (cost=0.00..1.02 rows=2 width=36) (actual time=0.003..0.003 rows=2 loops=1)
        ->  Index Scan using movie_pkey on movie  (cost=0.15..8.17 rows=1 width=36) (actual time=0.003..0.003 rows=1 loops=2)
              Index Cond: (id = cast_info.movie_id)
Planning Time: 0.397 ms
Execution Time: 0.059 ms
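Notice that all three plans are identical: with only a couple of rows in each table, the planner estimates that sequential scans are cheaper than using the new indexes, so they are never touched. As a sketch, you can nudge the planner into showing that it *would* use the indexes by discouraging sequential scans for the session (enable_seqscan is a standard PostgreSQL setting):

```python
%%sql
-- Sketch: discourage seq scans so the planner reaches for the indexes,
-- then re-check the plan (plan output omitted here).
SET enable_seqscan = off;
EXPLAIN ANALYZE
SELECT *
FROM actor, cast_info, movie
WHERE actor.id = cast_info.person_id
AND movie.id = cast_info.movie_id
LIMIT 10;
-- Restore the default afterwards.
SET enable_seqscan = on;
```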
%sql DROP INDEX actoridindex;
%sql DROP INDEX movieid_castinfoindex;
To start, let's take our matrix in mm.txt and load it into Pandas.
mm = pd.read_csv('data/mm.txt', header=0)
mm
Year | OCT | NOV | DEC | JAN | FEB | MAR | APR | MAY | JUN | JUL | AUG | SEP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2002 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
1 | 2003 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
2 | 2004 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
3 | 2005 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
4 | 2006 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
5 | 2007 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
6 | 2008 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
7 | 2009 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
8 | 2010 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
9 | 2011 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
10 | 2012 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
11 | 2013 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
12 | 2014 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
13 | 2015 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
14 | 2016 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
15 | 2017 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
16 | 2018 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
17 | 2019 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
18 | 2020 | 0.0 | 6.03 | 7.18 | 0.82 | 2.01 | 9.96 | 4.74 | 0.78 | 0.13 | 0.0 | 0.09 | 0.0 |
mm_melted = mm.melt(id_vars=['Year'])
mm_melted
Year | variable | value | |
---|---|---|---|
0 | 2002 | OCT | 0.0 |
1 | 2003 | OCT | 0.0 |
2 | 2004 | OCT | 0.0 |
3 | 2005 | OCT | 0.0 |
4 | 2006 | OCT | 0.0 |
... | ... | ... | ... |
223 | 2016 | SEP | 0.0 |
224 | 2017 | SEP | 0.0 |
225 | 2018 | SEP | 0.0 |
226 | 2019 | SEP | 0.0 |
227 | 2020 | SEP | 0.0 |
228 rows × 3 columns
Thanks to the id_vars parameter, the Year column is named and repeated for all other (variable=column name, value=value) elements in the row.
mm_melted[mm_melted['Year'] == 2002]
Year | variable | value | |
---|---|---|---|
0 | 2002 | OCT | 0.00 |
19 | 2002 | NOV | 6.03 |
38 | 2002 | DEC | 7.18 |
57 | 2002 | JAN | 0.82 |
76 | 2002 | FEB | 2.01 |
95 | 2002 | MAR | 9.96 |
114 | 2002 | APR | 4.74 |
133 | 2002 | MAY | 0.78 |
152 | 2002 | JUN | 0.13 |
171 | 2002 | JUL | 0.00 |
190 | 2002 | AUG | 0.09 |
209 | 2002 | SEP | 0.00 |
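As an aside, melt's var_name and value_name parameters let you name the unpivoted columns something friendlier than the default variable/value. A minimal sketch:

```python
# Sketch: name the unpivoted columns at melt time instead of accepting
# the default 'variable'/'value' labels.
mm.melt(id_vars=['Year'], var_name='Month', value_name='Precip').head()
```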
#mm_melted.pivot(index='variable', columns='Year')
mm_melted.pivot(index='Year', columns='variable')
value | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | APR | AUG | DEC | FEB | JAN | JUL | JUN | MAR | MAY | NOV | OCT | SEP |
Year | ||||||||||||
2002 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2003 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2004 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2005 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2006 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2007 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2008 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2009 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2010 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2011 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2012 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2013 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2014 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2015 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2016 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2017 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2018 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2019 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
2020 | 4.74 | 0.09 | 7.18 | 2.01 | 0.82 | 0.0 | 0.13 | 9.96 | 0.78 | 6.03 | 0.0 | 0.0 |
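One quirk: pivot sorts the new column labels alphabetically, which scrambles the water-year month order. A sketch that restores the order with reindex:

```python
# Sketch: put the months back in water-year order after pivoting.
month_order = ['OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR',
               'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP']
mm_melted.pivot(index='Year', columns='variable') \
         .reindex(columns=month_order, level='variable')
```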
Let's go back to mmp.txt.
mmp = pd.read_csv('data/mmp.txt', header=0)
mmp
Year | Location | Station Name | OCT | NOV | DEC | JAN | FEB | MAR | APR | MAY | JUN | JUL | AUG | SEP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2002 | ASHLAND | SOUTHERN OREGON COASTAL | 0.86 | 0.49 | 2.12 | 3.42 | 1.38 | 1.00 | 0.36 | 2.30 | 1.54 | 0.00 | 0.00 | 0.16 |
1 | 2002 | CAVE JUNCTION | SOUTHERN OREGON COASTAL | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2002 | GOLD BEACH | SOUTHERN OREGON COASTAL | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2002 | GRANTS PASS KAJO | SOUTHERN OREGON COASTAL | 0.61 | 1.21 | 4.19 | 6.31 | 0.24 | 0.77 | 0.58 | 2.02 | 0.87 | 0.00 | 0.00 | 0.20 |
4 | 2002 | GREEN SPRINGS PP | SOUTHERN OREGON COASTAL | 0.35 | 0.75 | 2.44 | 4.14 | 0.66 | NaN | 0.26 | 2.59 | NaN | NaN | 0.00 | 0.20 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5695 | 2020 | WENDOVER | GREAT SALT LAKE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5696 | 2020 | GREAT BASIN N P | GREAT SALT LAKE | 0.13 | 2.52 | 0.84 | 0.20 | NaN | 2.94 | 0.97 | 0.07 | 0.44 | 0.43 | 0.00 | 0.02 |
5697 | 2020 | MONTELLO | GREAT SALT LAKE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5698 | 2020 | CEDAR CITY 5E | ESCALANTE DESERT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5699 | 2020 | ENTERPRISE | ESCALANTE DESERT | 0.00 | 3.51 | 1.33 | 0.20 | 0.97 | 3.65 | 1.30 | 0.07 | 0.00 | 0.00 | 0.11 | 0.00 |
5700 rows × 15 columns
# Unpivot
mmp_melted = mmp.melt(id_vars=['Location', 'Station Name', 'Year'])
mmp_melted
Location | Station Name | Year | variable | value | |
---|---|---|---|---|---|
0 | ASHLAND | SOUTHERN OREGON COASTAL | 2002 | OCT | 0.86 |
1 | CAVE JUNCTION | SOUTHERN OREGON COASTAL | 2002 | OCT | NaN |
2 | GOLD BEACH | SOUTHERN OREGON COASTAL | 2002 | OCT | NaN |
3 | GRANTS PASS KAJO | SOUTHERN OREGON COASTAL | 2002 | OCT | 0.61 |
4 | GREEN SPRINGS PP | SOUTHERN OREGON COASTAL | 2002 | OCT | 0.35 |
... | ... | ... | ... | ... | ... |
68395 | WENDOVER | GREAT SALT LAKE | 2020 | SEP | NaN |
68396 | GREAT BASIN N P | GREAT SALT LAKE | 2020 | SEP | 0.02 |
68397 | MONTELLO | GREAT SALT LAKE | 2020 | SEP | NaN |
68398 | CEDAR CITY 5E | ESCALANTE DESERT | 2020 | SEP | NaN |
68399 | ENTERPRISE | ESCALANTE DESERT | 2020 | SEP | 0.00 |
68400 rows × 5 columns
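Note that the NaN readings from the wide table survive the melt. If only reported measurements matter, a sketch that drops them after unpivoting:

```python
# Sketch: keep only rows with an actual reading.
mmp_reported = mmp_melted.dropna(subset=['value'])
len(mmp_reported)   # fewer than 68400 rows
```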
# Repivot the unpivot
# mmp_melted.pivot(index='Year', columns='variable')
mmp_tt = mmp_melted.pivot(index=['Location', 'Station Name', 'Year'], columns='variable')
mmp_tt
value | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | APR | AUG | DEC | FEB | JAN | JUL | JUN | MAR | MAY | NOV | OCT | SEP | ||
Location | Station Name | Year | ||||||||||||
"LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2002 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
2003 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 | ||
2004 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 | ||
2005 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 | ||
2006 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 | ||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
YREKA | LOWER KLAMATH | 2016 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
2017 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 | ||
2018 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 | ||
2019 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 | ||
2020 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5700 rows × 12 columns
mmp_tt.reset_index()
Location | Station Name | Year | value | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | APR | AUG | DEC | FEB | JAN | JUL | JUN | MAR | MAY | NOV | OCT | SEP | |||
0 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2002 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
1 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2003 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
2 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2004 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
3 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2005 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
4 | "LA INT'L AIRPORT" | SOUTHERN CALIFORNIA COASTAL | 2006 | 2.68 | 0.00 | 4.42 | 0.00 | 0.38 | 0.00 | 0.00 | 4.11 | 0.12 | 1.43 | 0.00 | 0.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5695 | YREKA | LOWER KLAMATH | 2016 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5696 | YREKA | LOWER KLAMATH | 2017 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5697 | YREKA | LOWER KLAMATH | 2018 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5698 | YREKA | LOWER KLAMATH | 2019 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5699 | YREKA | LOWER KLAMATH | 2020 | 0.73 | 0.08 | 2.02 | 0.03 | 2.08 | 0.11 | 0.27 | 0.96 | 2.03 | 0.66 | 0.43 | 0.02 |
5700 rows × 15 columns
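The reset frame above still carries a two-level column header ('value' over each month name). A sketch that flattens it back to plain month columns:

```python
# Sketch: drop the outer 'value' level so columns are single-level again.
mmp_flat = mmp_tt.droplevel(0, axis=1).reset_index()
mmp_flat.head()
```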
%reload_ext sql
%sql postgresql://127.0.0.1:5432/postgres
import pandas as pd
%%sql
drop table if exists blue;
drop table if exists red;
create table blue (last text, first text);
create table red (last text, first text);
insert into blue values ('Wang', 'Daisy');
insert into blue values ('Wang', 'Daisy');
insert into blue values ('Wang', 'Xin');
insert into red values ('Wang', 'Daisy');
insert into red values ('Wang', 'Xin');
insert into red values ('Wang', 'Xin');
select * from blue;
last | first |
---|---|
Wang | Daisy |
Wang | Daisy |
Wang | Xin |
%sql select * from red;
last | first |
---|---|
Wang | Daisy |
Wang | Xin |
Wang | Xin |
%%sql
drop table if exists bluem;
create table bluem as
select *, count(*) as multiplicity
from blue
group by last, first;
select * from bluem;
last | first | multiplicity |
---|---|---|
Wang | Xin | 1 |
Wang | Daisy | 2 |
%%sql
drop table if exists redm;
create table redm as
select *, count(*) as multiplicity
from red
group by last, first;
select * from redm;
last | first | multiplicity |
---|---|---|
Wang | Xin | 2 |
Wang | Daisy | 1 |
This works exactly the same in both cases; there's nothing special here. Applying a WHERE filter to a counted set always yields a set back, because you're only removing rows from a set. By definition, this cannot create an entity that is not a set.
%%sql
-- sigma on multiset
select * from blue
where first = 'Daisy';
last | first |
---|---|
Wang | Daisy |
Wang | Daisy |
%%sql
-- sigma on counted set
select * from bluem where first = 'Daisy';
last | first | multiplicity |
---|---|---|
Wang | Daisy | 2 |
We might want to be a bit careful here. What defines a set uniquely is its key, and in this case the key is the combination of (last, first). Having just last or just first is not enough to uniquely identify a row.
%%sql
-- pi on multiset
select last from blue;
last |
---|
Wang |
Wang |
Wang |
In fact, you can see that if you simply selected last from a counted set, you'd get a multiset as your output.
%%sql
select last from bluem;
last |
---|
Wang |
Wang |
To convert this to a counted set again, you need to sum up the multiplicities of the tuples that the last field came from.
%%sql
-- pi on counted set
select last, SUM(multiplicity) from bluem group by last;
last | sum |
---|---|
Wang | 3 |
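As a sanity check (a sketch), computing the same projection directly on the original multiset should agree with summing the multiplicities:

```python
%%sql
-- Sketch: the same pi computed on the multiset itself.
-- Expected: Wang | 3, matching SUM(multiplicity) above.
select last, count(*) from blue group by last;
```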
%%sql
-- x on multiset
select * from blue, red;
last | first | last_1 | first_1 |
---|---|---|---|
Wang | Daisy | Wang | Daisy |
Wang | Daisy | Wang | Xin |
Wang | Daisy | Wang | Xin |
Wang | Daisy | Wang | Daisy |
Wang | Daisy | Wang | Xin |
Wang | Daisy | Wang | Xin |
Wang | Xin | Wang | Daisy |
Wang | Xin | Wang | Xin |
Wang | Xin | Wang | Xin |
Next, convert the output of a multiset cross-product to a counted set as we did before. This is our desired result:
%%sql
-- convert multiset x to counted set
with cte(blast, bfirst, rlast, rfirst)
as (select * from blue, red)
select *, count(*)
from cte
group by blast, bfirst, rlast, rfirst;
blast | bfirst | rlast | rfirst | count |
---|---|---|---|---|
Wang | Daisy | Wang | Daisy | 2 |
Wang | Xin | Wang | Xin | 2 |
Wang | Daisy | Wang | Xin | 4 |
Wang | Xin | Wang | Daisy | 1 |
Now, what went on in the arithmetic here? We can think this through by pushing the arithmetic into the query!
First, what do you get with a naive cross-product of counted sets? You get the names from each table, along with the number of times each name showed up in its respective table. So, for example, ('Wang', 'Xin') showed up once in blue and twice in red.
%%sql
select * from bluem, redm;
last | first | multiplicity | last_1 | first_1 | multiplicity_1 |
---|---|---|---|---|---|
Wang | Xin | 1 | Wang | Xin | 2 |
Wang | Xin | 1 | Wang | Daisy | 1 |
Wang | Daisy | 2 | Wang | Xin | 2 |
Wang | Daisy | 2 | Wang | Daisy | 1 |
What does each row tell us individually? Each row tells us the number of times that the name from the left must be matched with the name from the right in the original cross product between blue and red. So if you multiply the multiplicities together, you'll get the number of instances of each ordered pair of names in the final cross product.
%%sql
-- fix multiplicity per row
select b.last, b.first, r.last, r.first,
       b.multiplicity * r.multiplicity as multiplicity
from bluem b, redm r;
last | first | last_1 | first_1 | multiplicity |
---|---|---|---|---|
Wang | Xin | Wang | Xin | 2 |
Wang | Xin | Wang | Daisy | 1 |
Wang | Daisy | Wang | Xin | 4 |
Wang | Daisy | Wang | Daisy | 2 |
Notice that these per-pair counts match the grouped CTE result above exactly. If we simply wanted to drop duplicates instead of tracking how many there were (the point of a counted set), our life would have been a lot easier...
%%sql
select distinct b.last, b.first, r.last, r.first
from blue b, red r;
last | first | last_1 | first_1 |
---|---|---|---|
Wang | Daisy | Wang | Daisy |
Wang | Xin | Wang | Xin |
Wang | Daisy | Wang | Xin |
Wang | Xin | Wang | Daisy |
%reload_ext sql
%sql postgresql://127.0.0.1:5432/postgres
import pandas as pd
%%sql
drop table if exists example;
create table example(name text, age integer, gpa float);
insert into example values
('Patty Perfect', 22, 4.0),
('Sameer Soclose', 20, 3.99),
('Jacob Excellent', 21, 3.93);
df = %sql select * from example;
df = df.DataFrame()
df
name | age | gpa | |
---|---|---|---|
0 | Patty Perfect | 22 | 4.00 |
1 | Sameer Soclose | 20 | 3.99 |
2 | Jacob Excellent | 21 | 3.93 |
df.dtypes
name     object
age       int64
gpa     float64
dtype: object
dft = df.transpose()
dft
0 | 1 | 2 | |
---|---|---|---|
name | Patty Perfect | Sameer Soclose | Jacob Excellent |
age | 22 | 20 | 21 |
gpa | 4.0 | 3.99 | 3.93 |
dft.dtypes
0    object
1    object
2    object
dtype: object

Every column is now object: transposing mixed a string and numbers into each column, so pandas falls back to the generic object dtype.
df2 = df.transpose().transpose()
df2
name | age | gpa | |
---|---|---|---|
0 | Patty Perfect | 22 | 4.0 |
1 | Sameer Soclose | 20 | 3.99 |
2 | Jacob Excellent | 21 | 3.93 |
df2.dtypes
name    object
age     object
gpa     object
dtype: object

Even after transposing back, pandas does not re-infer the numeric dtypes on its own; everything stays object.
df2['age'] = df2['age'].astype(int)
df2['gpa'] = df2['gpa'].astype(float)
df2.dtypes
name     object
age       int64
gpa     float64
dtype: object
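Casting column-by-column works; as an alternative sketch, infer_objects() asks pandas to re-infer better dtypes in one call after the round trip:

```python
# Sketch: let pandas re-infer dtypes after the double transpose
# instead of casting each column by hand.
df.transpose().transpose().infer_objects().dtypes
```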
mat = np.array([[1, 0, 0, 1, 20000],
[0, 1, 0, 2, 10011],
[0, 0, 1, 3, 50000],
[0, 0, 1, 3, 10000]])
mat
array([[    1,     0,     0,     1, 20000],
       [    0,     1,     0,     2, 10011],
       [    0,     0,     1,     3, 50000],
       [    0,     0,     1,     3, 10000]])
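A NumPy array carries a single dtype for every cell, which is why the company names had to be pre-encoded as integers here; mixing the strings in would coerce the whole array to object. A quick check:

```python
# The whole matrix shares one dtype (likely int64 on most platforms).
mat.dtype
```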
df = pd.DataFrame({'CompanyName': ['VW', 'Acura', 'Hona', 'Honda'],
'Categorical Value': [1, 2, 3, 3],
'Price': [20000, 10011, 50000, 10000]})
df
CompanyName | Categorical Value | Price | |
---|---|---|---|
0 | VW | 1 | 20000 |
1 | Acura | 2 | 10011 |
2 | Hona | 3 | 50000 |
3 | Honda | 3 | 10000 |
df.dtypes
CompanyName          object
Categorical Value     int64
Price                 int64
dtype: object
# %sql --persist df
ValueError: Table 'df' already exists. Consider using --persist-replace to drop the table before persisting the data frame.
If you need help solving this issue, send us a message: https://ploomber.io/community
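As the error message itself suggests, JupySQL's --persist-replace variant drops the existing table before persisting the DataFrame:

```python
%sql --persist-replace df
```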