# Run this cell to set up imports
import numpy as np
import pandas as pd

# Symlink the shared lecture data directory into the current directory
# (-s symbolic, -f replace an existing link), then extract the database dump.
# unzip -u only extracts files that are missing or newer than what is on disk.
# NOTE(review): the path points at lec06 while the headings say Lecture 08 - confirm this is intended.
!ln -sf ../../lec/lec06/data .
!unzip -u data/imdb_perf_lecture.zip -d data/

Archive:  data/imdb_perf_lecture.zip

# Rebuild the database from scratch: drop any previous copy, create an empty
# database, then load the tables and rows from the extracted SQL dump.
!psql -h localhost -c 'DROP DATABASE IF EXISTS imdb_perf_lecture'
!psql -h localhost -c 'CREATE DATABASE imdb_perf_lecture' 
!psql -h localhost -d imdb_perf_lecture -f data/imdb_perf_lecture.sql

DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 845888
COPY 2211936
COPY 656453
ALTER TABLE
ALTER TABLE
ALTER TABLE
ALTER TABLE

# Load (or reload) the `sql` IPython extension that provides %sql and %%sql.
%reload_ext sql

# Point the extension at the local imdb_perf_lecture database created above.
%sql postgresql://127.0.0.1:5432/imdb_perf_lecture

# Run this cell to raise the result display limit from 10 rows to 20.
# This does not remove the limit: results longer than 20 rows are still cut off.
%config SqlMagic.displaylimit = 20

%%sql
/* 1 */
/* Ids of actors named Tom Hanks whose id is above 4000000
   (a range predicate on id combined with an equality predicate on name). */
EXPLAIN ANALYZE
SELECT id FROM actors
WHERE id > 4000000 AND
name='Tom Hanks';

%%sql
/* 2 */
/* Ids of the first ten actors when sorted by name. */
EXPLAIN ANALYZE
SELECT id FROM actors
ORDER BY name
LIMIT 10;

%%sql
/* 3 */
/* Same shape as query 2, but sorted by id instead of name. */
EXPLAIN ANALYZE
SELECT id FROM actors
ORDER BY id
LIMIT 10;

%%sql
EXPLAIN ANALYZE
SELECT *
FROM actors, cast_info
WHERE actors.id = cast_info.person_id;

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                          QUERY PLAN
0  Hash Join  (cost=30867.48..92474.12 rows=22120...
1       Hash Cond: (cast_info.person_id = actors.id)
2    ->  Seq Scan on cast_info  (cost=0.00..31908...
3    ->  Hash  (cost=13684.88..13684.88 rows=8458...
4          Buckets: 65536  Batches: 32  Memory Us...
5          ->  Seq Scan on actors  (cost=0.00..13...
6                            Planning Time: 0.221 ms
7                        Execution Time: 1449.252 ms

%%sql
EXPLAIN ANALYZE
SELECT *
FROM actors, cast_info
WHERE actors.id = cast_info.person_id
LIMIT 10;

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                          QUERY PLAN
0  Limit  (cost=0.42..5.13 rows=10 width=44) (act...
1    ->  Nested Loop  (cost=0.42..1040953.73 rows...
2          ->  Seq Scan on cast_info  (cost=0.00....
3          ->  Index Scan using actor_pkey on act...
4                Index Cond: (id = cast_info.pers...
5                            Planning Time: 0.112 ms
6                           Execution Time: 0.185 ms

%%sql
EXPLAIN ANALYZE
SELECT name, movie_id
FROM actors, cast_info
WHERE actors.id = cast_info.person_id;

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                          QUERY PLAN
0  Hash Join  (cost=30867.48..92474.12 rows=22120...
1       Hash Cond: (cast_info.person_id = actors.id)
2    ->  Seq Scan on cast_info  (cost=0.00..31908...
3    ->  Hash  (cost=13684.88..13684.88 rows=8458...
4          Buckets: 65536  Batches: 32  Memory Us...
5          ->  Seq Scan on actors  (cost=0.00..13...
6                            Planning Time: 0.092 ms
7                        Execution Time: 1761.153 ms

%%sql
EXPLAIN ANALYZE
SELECT name, movie_id
FROM actors, cast_info
WHERE actors.id = cast_info.person_id AND actors.id > 4000000;

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                           QUERY PLAN
0   Hash Join  (cost=19763.71..76964.36 rows=73736...
1        Hash Cond: (cast_info.person_id = actors.id)
2     ->  Seq Scan on cast_info  (cost=0.00..31908...
3     ->  Hash  (cost=14036.18..14036.18 rows=2819...
4           Buckets: 65536  Batches: 8  Memory Usa...
5           ->  Bitmap Heap Scan on actors  (cost=...
6                        Recheck Cond: (id > 4000000)
7                             Heap Blocks: exact=3088
8                 ->  Bitmap Index Scan on actor_p...
9                          Index Cond: (id > 4000000)
10                            Planning Time: 0.123 ms
11                        Execution Time: 1330.343 ms

%%sql 
EXPLAIN ANALYZE
SELECT *
FROM actors, cast_info, movies
WHERE actors.id = cast_info.person_id
    AND movies.id = cast_info.movie_id
LIMIT 10;

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                          QUERY PLAN
0  Limit  (cost=0.85..10.10 rows=10 width=74) (ac...
1    ->  Nested Loop  (cost=0.85..2046318.54 rows...
2          ->  Nested Loop  (cost=0.42..1040884.9...
3                ->  Seq Scan on cast_info  (cost...
4                ->  Index Scan using actor_pkey ...
5                      Index Cond: (id = cast_inf...
6          ->  Index Scan using movie_pkey on mov...
7                Index Cond: (id = cast_info.movi...
8                            Planning Time: 0.336 ms
9                           Execution Time: 0.138 ms

%%sql
EXPLAIN ANALYZE
SELECT *
FROM actors, cast_info, movies
WHERE actors.id = cast_info.person_id
    AND movies.id = cast_info.movie_id
    AND name = 'Tom Hanks';

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                           QUERY PLAN
0   Gather  (cost=10654.12..35277.87 rows=11059 wi...
1                                  Workers Planned: 2
2                                 Workers Launched: 2
3     ->  Nested Loop  (cost=9654.12..33171.97 row...
4           ->  Parallel Hash Join  (cost=9653.69....
5                 Hash Cond: (cast_info.person_id ...
6                 ->  Parallel Seq Scan on cast_in...
7                 ->  Parallel Hash  (cost=9631.67...
8                       Buckets: 8192  Batches: 1 ...
9                       ->  Parallel Seq Scan on a...
10                            Filter: (name = 'Tom...
11                            Rows Removed by Filt...
12          ->  Index Scan using movie_pkey on mov...
13                Index Cond: (id = cast_info.movi...
14                            Planning Time: 0.155 ms
15                         Execution Time: 394.368 ms

%%sql
EXPLAIN ANALYZE
SELECT *
FROM actors, cast_info, movies
WHERE actors.id = cast_info.person_id
    AND movies.id = cast_info.movie_id
    AND title LIKE 'Snakes on a Plane';

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                           QUERY PLAN
0   Gather  (cost=9279.46..30704.82 rows=7 width=7...
1                                  Workers Planned: 2
2                                 Workers Launched: 2
3     ->  Nested Loop  (cost=8279.46..29704.12 row...
4           ->  Parallel Hash Join  (cost=8279.04....
5                 Hash Cond: (cast_info.movie_id =...
6                 ->  Parallel Seq Scan on cast_in...
7                 ->  Parallel Hash  (cost=8279.03...
8                       Buckets: 1024  Batches: 1 ...
9                       ->  Parallel Seq Scan on m...
10                            Filter: (title ~~ 'S...
11                            Rows Removed by Filt...
12          ->  Index Scan using actor_pkey on act...
13                Index Cond: (id = cast_info.pers...
14                            Planning Time: 0.278 ms
15                         Execution Time: 405.404 ms

%%sql
EXPLAIN ANALYZE
SELECT *
FROM actors, cast_info, movies
WHERE actors.id = cast_info.person_id
    AND movies.id = cast_info.movie_id
LIMIT 10;

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                           QUERY PLAN
0   Limit  (cost=0.86..9.60 rows=10 width=74) (act...
1     ->  Nested Loop  (cost=0.86..1932285.82 rows...
2           ->  Nested Loop  (cost=0.43..926852.23...
3                 ->  Seq Scan on cast_info  (cost...
4                 ->  Memoize  (cost=0.43..0.47 ro...
5                       Cache Key: cast_info.perso...
6                                 Cache Mode: logical
7                       Hits: 2  Misses: 8  Evicti...
8                       ->  Index Scan using actor...
9                             Index Cond: (id = ca...
10          ->  Index Scan using movie_pkey on mov...
11                Index Cond: (id = cast_info.movi...
12                            Planning Time: 0.232 ms
13                           Execution Time: 1.233 ms

# Drop the primary key on actors, which also removes the actor_pkey index the
# earlier plans used. CASCADE additionally drops objects that depend on it.
%sql ALTER TABLE actors DROP CONSTRAINT actor_pkey CASCADE;

%%sql
EXPLAIN ANALYZE
SELECT *
FROM actors, cast_info, movies
WHERE actors.id = cast_info.person_id
    AND movies.id = cast_info.movie_id
LIMIT 10;

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                           QUERY PLAN
0   Limit  (cost=0.42..25271.13 rows=10 width=74) ...
1     ->  Nested Loop  (cost=0.42..35381485254.95 ...
2           ->  Nested Loop  (cost=0.00..353751211...
3                 Join Filter: (actors.id = cast_i...
4                 Rows Removed by Join Filter: 111...
5                 ->  Seq Scan on actors  (cost=0....
6                 ->  Materialize  (cost=0.00..516...
7                       ->  Seq Scan on cast_info ...
8           ->  Index Scan using movie_pkey on mov...
9                 Index Cond: (id = cast_info.movi...
10                            Planning Time: 0.401 ms
11                         Execution Time: 300.628 ms

# Drop the primary key on movies as well, which removes the movie_pkey index
# the earlier plans used. CASCADE additionally drops objects that depend on it.
%sql ALTER TABLE movies DROP CONSTRAINT movie_pkey CASCADE;

%%sql
EXPLAIN ANALYZE
SELECT *
FROM actors, cast_info, movies
WHERE actors.id = cast_info.person_id
    AND movies.id = cast_info.movie_id
LIMIT 10;

result = _.DataFrame()
result.style.set_properties(**{'text-align': 'left'})
print(result)

                                           QUERY PLAN
0   Limit  (cost=13884.25..44371.17 rows=10 width=...
1     ->  Nested Loop  (cost=13884.25..42684723061...
2           Join Filter: (cast_info.person_id = ac...
3                Rows Removed by Join Filter: 7743513
4           ->  Gather  (cost=13884.25..270450.57 ...
5                                  Workers Planned: 2
6                                 Workers Launched: 2
7                 ->  Parallel Hash Join  (cost=12...
8                       Hash Cond: (cast_info.movi...
9                       ->  Parallel Seq Scan on c...
10                      ->  Parallel Hash  (cost=7...
11                            Buckets: 65536  Batc...
12                            ->  Parallel Seq Sca...
13          ->  Materialize  (cost=0.00..24523.32 ...
14                ->  Seq Scan on actors  (cost=0....
15                            Planning Time: 0.187 ms
16                        Execution Time: 1822.068 ms

# Close the notebook's connection first: PostgreSQL refuses to drop a database
# that still has open sessions.
%sql --close postgresql://127.0.0.1:5432/imdb_perf_lecture

# Remove the demo database.
!psql -h localhost -c 'DROP DATABASE IF EXISTS imdb_perf_lecture'

DROP DATABASE

Lecture 08

Load in the IMDB Performance database

Start `jupysql`

Matching

Two-table demo: LIMIT

Two-table demo: Projection

Three-way joins

Three-way joins with Indexes

Cleanup

QUERY PLAN
Limit (cost=17366.94..17368.11 rows=10 width=36) (actual time=140.249..142.642 rows=10 loops=1)
-> Gather Merge (cost=17366.94..99611.72 rows=704906 width=36) (actual time=140.248..142.637 rows=10 loops=1)
Workers Planned: 2
Workers Launched: 2
-> Sort (cost=16366.92..17248.05 rows=352453 width=36) (actual time=137.363..137.365 rows=8 loops=3)
Sort Key: name
Sort Method: top-N heapsort Memory: 26kB
Worker 0: Sort Method: top-N heapsort Memory: 26kB
Worker 1: Sort Method: top-N heapsort Memory: 26kB
-> Parallel Seq Scan on actors (cost=0.00..8750.53 rows=352453 width=36) (actual time=0.021..49.143 rows=281963 loops=3)
Planning Time: 0.108 ms
Execution Time: 142.692 ms

QUERY PLAN
Hash Join (cost=30867.48..92474.12 rows=2212088 width=36) (actual time=258.110..1695.561 rows=2211936 loops=1)
Hash Cond: (cast_info.person_id = actors.id)
-> Seq Scan on cast_info (cost=0.00..31908.88 rows=2212088 width=8) (actual time=0.012..293.502 rows=2211936 loops=1)
-> Hash (cost=13684.88..13684.88 rows=845888 width=36) (actual time=257.724..257.726 rows=845888 loops=1)
Buckets: 65536 Batches: 32 Memory Usage: 1848kB
-> Seq Scan on actors (cost=0.00..13684.88 rows=845888 width=36) (actual time=0.007..102.280 rows=845888 loops=1)
Planning Time: 0.092 ms
Execution Time: 1761.153 ms

Lecture 08

Load in the IMDB Performance database

Start jupysql

Matching

Two-table demo: LIMIT

Two-table demo: Projection

Three-way joins

Three-way joins with Indexes

Cleanup

Start `jupysql`