!unzip -u ../lec07/data/imdb_perf_lecture.zip -d ../lec07/data/

Archive:  ../lec07/data/imdb_perf_lecture.zip


!psql -h localhost -c 'DROP DATABASE IF EXISTS imdb_perf_lecture'
!psql -h localhost -c 'CREATE DATABASE imdb_perf_lecture' 
!psql -h localhost -d imdb_perf_lecture -f ../lec07/data/imdb_perf_lecture.sql

DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
psql:../lec07/data/imdb_perf_lecture.sql:33: ERROR:  role "yanlisa" does not exist
CREATE TABLE
psql:../lec07/data/imdb_perf_lecture.sql:45: ERROR:  role "yanlisa" does not exist
CREATE TABLE
psql:../lec07/data/imdb_perf_lecture.sql:59: ERROR:  role "yanlisa" does not exist
COPY 845888
COPY 2211936
COPY 656453
ALTER TABLE
ALTER TABLE
ALTER TABLE
ALTER TABLE


%reload_ext sql

There's a new jupysql version available (0.10.1), you're running 0.10.0. To upgrade: pip install jupysql --upgrade


%config SqlMagic.displaylimit = None


%sql postgresql://127.0.0.1:5432/imdb_perf_lecture


%%sql
explain analyze
select *
from actor, cast_info
where actor.id = cast_info.person_id;


%%sql
explain analyze
select actor.name,movie_id
from actor, cast_info
where actor.id = cast_info.person_id
limit 10 ;


%%sql
explain analyze
select actor.name,movie_id
from actor, cast_info
where actor.id = cast_info.person_id;


%%sql
explain analyze
select actor.name,movie_id
from actor, cast_info
where actor.id = cast_info.person_id and actor.id > 4000000;


%%sql
-- repeat previous query
explain analyze
select actor.name,movie_id
from actor, cast_info
where actor.id = cast_info.person_id
limit 10 ;


%%sql
explain analyze
select actor.name,movie_id,actor.id
from actor, cast_info
where actor.id = cast_info.person_id
limit 10 ;


%%sql
explain analyze
select *
from actor, cast_info, movie
where actor.id = cast_info.person_id and movie.id = cast_info.movie_id
limit 10;


# ^ note 2 hash joins where actor and cast_info are joined, followed by a join with movie


%%sql
explain analyze
select *
from actor, cast_info, movie
where actor.id = cast_info.person_id and movie.id = cast_info.movie_id and name='Hanks, Tom' ;


# ^ got rid of limit; same as before, except notice the push down of seq scan


%%sql explain analyze
select *
from actor, cast_info, movie
where actor.id = cast_info.person_id and movie.id = cast_info.movie_id and title='Snakes on a Plane';


# repeat

# note 2 hash joins where actor and cast_info are joined, followed by a join with movie


%%sql
explain analyze
select *
from actor, cast_info, movie
where actor.id = cast_info.person_id and movie.id = cast_info.movie_id
limit 10;


%sql create index actoridindex on actor(id);


%%sql
explain analyze
select *
from actor, cast_info, movie
where actor.id = cast_info.person_id and movie.id = cast_info.movie_id
limit 10;


%sql create index movieid_castinfoindex on cast_info(movie_id);


%%sql
explain analyze
select *
from actor, cast_info, movie
where actor.id = cast_info.person_id and movie.id = cast_info.movie_id
limit 10;


%sql drop index actoridindex;
%sql drop index movieid_castinfoindex;

RuntimeError: (psycopg2.errors.UndefinedObject) index "actoridindex" does not exist

[SQL: drop index actoridindex;]
(Background on this error at: https://sqlalche.me/e/20/f405)
If you need help solving this issue, send us a message: https://ploomber.io/community

Lecture 09: Query Optimization II¶

New IMDB Performance database¶

Demo¶

Multi-table: impact of indexes¶

QUERY PLAN
Hash Join (cost=29215.48..89168.21 rows=2211936 width=26) (actual time=1204.592..2899.130 rows=2211936 loops=1)
Hash Cond: (cast_info.person_id = actor.id)
-> Seq Scan on cast_info (cost=0.00..31907.36 rows=2211936 width=8) (actual time=32.664..576.889 rows=2211936 loops=1)
-> Hash (cost=13684.88..13684.88 rows=845888 width=18) (actual time=1171.392..1171.394 rows=845888 loops=1)
Buckets: 65536 Batches: 16 Memory Usage: 3114kB
-> Seq Scan on actor (cost=0.00..13684.88 rows=845888 width=18) (actual time=21.178..1044.234 rows=845888 loops=1)
Planning Time: 219.930 ms
Execution Time: 2965.872 ms

QUERY PLAN
Limit (cost=0.43..4.45 rows=10 width=18) (actual time=0.038..0.096 rows=10 loops=1)
-> Nested Loop (cost=0.43..888554.34 rows=2211936 width=18) (actual time=0.036..0.093 rows=10 loops=1)
-> Seq Scan on cast_info (cost=0.00..31907.36 rows=2211936 width=8) (actual time=0.014..0.016 rows=10 loops=1)
-> Memoize (cost=0.43..0.47 rows=1 width=18) (actual time=0.007..0.007 rows=1 loops=10)
Cache Key: cast_info.person_id
Cache Mode: logical
Hits: 2 Misses: 8 Evictions: 0 Overflows: 0 Memory Usage: 1kB
-> Index Scan using actor_pkey on actor (cost=0.42..0.46 rows=1 width=18) (actual time=0.007..0.007 rows=1 loops=8)
Index Cond: (id = cast_info.person_id)
Planning Time: 0.241 ms
Execution Time: 0.470 ms

QUERY PLAN
Hash Join (cost=29215.48..89168.21 rows=2211936 width=18) (actual time=287.396..1435.852 rows=2211936 loops=1)
Hash Cond: (cast_info.person_id = actor.id)
-> Seq Scan on cast_info (cost=0.00..31907.36 rows=2211936 width=8) (actual time=0.013..223.331 rows=2211936 loops=1)
-> Hash (cost=13684.88..13684.88 rows=845888 width=18) (actual time=287.183..287.185 rows=845888 loops=1)
Buckets: 65536 Batches: 16 Memory Usage: 3191kB
-> Seq Scan on actor (cost=0.00..13684.88 rows=845888 width=18) (actual time=0.067..108.863 rows=845888 loops=1)
Planning Time: 0.212 ms
Execution Time: 1502.998 ms

QUERY PLAN
Hash Join (cost=23971.10..81574.83 rows=1163852 width=18) (actual time=173.505..1135.131 rows=634763 loops=1)
Hash Cond: (cast_info.person_id = actor.id)
-> Seq Scan on cast_info (cost=0.00..31907.36 rows=2211936 width=8) (actual time=0.008..210.031 rows=2211936 loops=1)
-> Hash (cost=15799.60..15799.60 rows=445080 width=18) (actual time=172.984..172.985 rows=444781 loops=1)
Buckets: 65536 Batches: 8 Memory Usage: 3335kB
-> Seq Scan on actor (cost=0.00..15799.60 rows=445080 width=18) (actual time=0.293..90.924 rows=444781 loops=1)
Filter: (id > 4000000)
Rows Removed by Filter: 401107
Planning Time: 0.163 ms
Execution Time: 1155.587 ms