Lecture 05: Advanced SQL¶

In [1]:
# Run this cell to set up imports
import numpy as np
import pandas as pd

If you haven't yet, load in the imdb_lecture and stops_lecture databases.

In [19]:
!mkdir data
!cp -v -r ../lec02/data .
!cp -v -r ../lec04/data/stops_lecture.sql data/.
mkdir: cannot create directory ‘data’: File exists
'../lec02/data/imdb_lecture.zip' -> './data/imdb_lecture.zip'
'../lec02/data/imdb_lecture.sql' -> './data/imdb_lecture.sql'
'../lec02/data/imdb_lecture_local.sql' -> './data/imdb_lecture_local.sql'
'../lec04/data/stops_lecture.sql' -> 'data/./stops_lecture.sql'
In [11]:
!psql -h localhost -c 'DROP DATABASE IF EXISTS imdb_lecture'
!psql -h localhost -c 'CREATE DATABASE imdb_lecture' 
!psql -h localhost -d imdb_lecture -f data/imdb_lecture.sql
DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 289
COPY 6792
COPY 6201
COPY 1438
COPY 1911
COPY 537
COPY 1814
In [3]:
!psql -h localhost -c 'DROP DATABASE IF EXISTS stops_lecture'
!psql -h localhost -c 'CREATE DATABASE stops_lecture' 
!psql -h localhost -d imdb_lecture -f data/stops_lecture.sql
DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 84779
COPY 211
CREATE INDEX
CREATE INDEX

Exercise #1: String manipulation¶

In [13]:
%load_ext sql
The sql extension is already loaded. To reload it, use:
  %reload_ext sql
In [50]:
%%sql
SELECT 'Hello' || 'World',
       STRPOS('Hello', 'el'),
       SUBSTRING('Hello', 2, 3);
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1 rows affected.
Out[50]:
?column? strpos substring
HelloWorld 2 ell
In [55]:
%%sql
SELECT REGEXP_REPLACE('Hannah Montana', '(.*)', '\1') as ex1,
       REGEXP_REPLACE('Hannah Montana', '(.*) (.*)', '\2, \1') as ex2,
       REGEXP_REPLACE('Phone Number 510 642 3214', '[a-zA-Z ]', '') as ex3,
       REGEXP_REPLACE('Phone Number 510 642 3214', '[a-zA-Z ]', '', 'g') as ex4
;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1 rows affected.
Out[55]:
ex1 ex2 ex3 ex4
Hannah Montana Montana, Hannah hone Number 510 642 3214 5106423214

Exercise 2¶

In [13]:
%load_ext sql
The sql extension is already loaded. To reload it, use:
  %reload_ext sql

Choice A¶

In [58]:
%%sql
SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\2') as lastname,
       REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
FROM people;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1911 rows affected.
Out[58]:
lastname firstname
Stains Sun
Samuels Jeremiah
Waymouth Louis
Gresham John
Spottiswoode Roger
Deibert Amanda
Cuervo Cayetana Guillén
Symonds Craig
Olías Tito
Nascimento Décio Garcia
Truncated to displaylimit of 10.

Choice B¶

In [60]:
%%sql
SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname,
       COUNT(*) as countname
FROM people
GROUP BY firstname
ORDER BY countname desc;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1198 rows affected.
Out[60]:
firstname countname
Chris 69
Michael 34
David 25
John 23
Robert 14
James 13
Scott 13
Mark 13
Peter 13
Daniel 11
Truncated to displaylimit of 10.

Choice C¶

In [72]:
%%sql
WITH firstnames AS (
    SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
    FROM people )
SELECT MAX(LENGTH(firstname))
FROM firstnames;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1 rows affected.
Out[72]:
max
21

What is this first name? Why doesn't the below query work?

In [74]:
%%sql
WITH firstnames AS (
    SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
    FROM people )
SELECT MAX(LENGTH(firstname)), firstname
FROM firstnames;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
RuntimeError: (psycopg2.errors.GroupingError) column "firstnames.firstname" must appear in the GROUP BY clause or be used in an aggregate function
LINE 4: SELECT MAX(LENGTH(firstname)), firstname
                                       ^

[SQL: WITH firstnames AS (
    SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
    FROM people )
SELECT MAX(LENGTH(firstname)), firstname
FROM firstnames;]
(Background on this error at: https://sqlalche.me/e/20/f405)
If you need help solving this issue, send us a message: https://ploomber.io/community

Solution:

In [71]:
%%sql
WITH firstnames AS (
    SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
    FROM people )
SELECT MAX(LENGTH(firstname)), firstname
FROM firstnames
GROUP BY firstname
ORDER BY max DESC;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1198 rows affected.
Out[71]:
max firstname
21 Princess Charlotte of
20 Ismaelpeter Casillas
19 Vincent Del Rosario
19 Rebecca Steel Roven
18 Danno Kristoper C.
17 Branford Marsalis
17 Arlene Newman-Van
17 Blind Mississippi
17 The Piedmont Bird
16 Cayetana Guillén
Truncated to displaylimit of 10.

Exercise #3: Grouping¶

In [1]:
%reload_ext sql
In [6]:
!psql -h localhost -c 'DROP DATABASE IF EXISTS stops_lecture'
!psql -h localhost -c 'CREATE DATABASE stops_lecture' 
!psql -h localhost -d stops_lecture -f data/stops_lecture.sql
DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 84779
COPY 211
CREATE INDEX
CREATE INDEX
In [7]:
%sql postgresql://127.0.0.1:5432/stops_lecture
Connecting and switching to connection postgresql://127.0.0.1:5432/stops_lecture
In [10]:
%%sql
SELECT COUNT(*)
FROM stops;
Running query in 'postgresql://127.0.0.1:5432/stops_lecture'
1 rows affected.
Out[10]:
count
84779
In [13]:
%%sql
SELECT DISTINCT COUNT(*)
FROM Zips;
Running query in 'postgresql://127.0.0.1:5432/stops_lecture'
1 rows affected.
Out[13]:
count
211
In [9]:
%%sql
SELECT COUNT(*)
FROM Stops
    INNER JOIN Zips
    ON Stops.location = Zips.location;
Running query in 'postgresql://127.0.0.1:5432/stops_lecture'
1 rows affected.
Out[9]:
count
15743

Demo: Windows and Casting¶

In [15]:
%reload_ext sql
In [7]:
%sql --close postgresql://127.0.0.1:5432/stops_lecture
Connecting and switching to connection postgresql://127.0.0.1:5432/stops_lecture
In [16]:
%sql postgresql://127.0.0.1:5432/imdb_lecture
Connecting and switching to connection postgresql://127.0.0.1:5432/imdb_lecture

Make a temporary other table

In [26]:
%%sql
DROP TABLE IF EXISTS actor_title;
CREATE TABLE actor_title AS (
    SELECT t1.primary_title AS title, t1.title_id, a1.name
    FROM titles t1, people a1, crew c1
    WHERE t1.title_id = c1.title_id AND a1.person_id = c1.person_id
);
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
5068 rows affected.
Out[26]:
In [32]:
%%sql
SELECT *
FROM actor_title
TABLESAMPLE BERNOULLI (5);
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
247 rows affected.
Out[32]:
title title_id name
O Rei das Berlengas tt0078148 Zita Duarte
Joe Versus the Volcano tt0099892 Richard Halsey
Police Story 3: Super Cop tt0104558 Willie Chan
The Power of One tt0105159 Dean Semler
7 jin gong tt0110923 Michelle Yeoh
Vault of Horror I tt0111594 Kyle MacLachlan
AFI Life Achievement Award: A Tribute to Steven Spielberg tt0114330 Robert Shrum
Toy Story tt0114709 Tom Hanks
Cosmic Voyage tt0115952 Michael Miner
That Thing You Do! tt0117887 Tom Everett Scott
Truncated to displaylimit of 10.

Choice A

In [21]:
%%sql
SELECT title_id, name, title,
       AVG(LENGTH(name)) OVER (PARTITION BY title)
         AS avg_name_length
FROM actor_title
WHERE title LIKE 'The %'
ORDER BY title;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
362 rows affected.
Out[21]:
title_id name title avg_name_length
tt2174102 Antwone Fisher The 14th Annual Producers Guild of America Awards 11.6666666666666667
tt2174102 Adrien Brody The 14th Annual Producers Guild of America Awards 11.6666666666666667
tt2174102 Tom Hanks The 14th Annual Producers Guild of America Awards 11.6666666666666667
tt2262378 Tony Barbieri The 64th Primetime Emmy Awards 10.6666666666666667
tt2262378 Aaron Paul The 64th Primetime Emmy Awards 10.6666666666666667
tt2262378 Tom Hanks The 64th Primetime Emmy Awards 10.6666666666666667
tt0471382 Brendan Fraser The 8th Shanghai International Film Festival 12.4000000000000000
tt0471382 Karen Mok The 8th Shanghai International Film Festival 12.4000000000000000
tt0471382 Jianxin Huang The 8th Shanghai International Film Festival 12.4000000000000000
tt0471382 Tatsuya Fuji The 8th Shanghai International Film Festival 12.4000000000000000
Truncated to displaylimit of 10.

Choice B

In [24]:
%%sql
SELECT title_id, name, title,
       CAST(AVG(LENGTH(name)) OVER (PARTITION BY title)
            AS INTEGER)
         AS avg_name_length
FROM actor_title
WHERE title LIKE 'The %'
ORDER BY title;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
362 rows affected.
Out[24]:
title_id name title avg_name_length
tt2174102 Antwone Fisher The 14th Annual Producers Guild of America Awards 12
tt2174102 Adrien Brody The 14th Annual Producers Guild of America Awards 12
tt2174102 Tom Hanks The 14th Annual Producers Guild of America Awards 12
tt2262378 Tony Barbieri The 64th Primetime Emmy Awards 11
tt2262378 Aaron Paul The 64th Primetime Emmy Awards 11
tt2262378 Tom Hanks The 64th Primetime Emmy Awards 11
tt0471382 Brendan Fraser The 8th Shanghai International Film Festival 12
tt0471382 Karen Mok The 8th Shanghai International Film Festival 12
tt0471382 Jianxin Huang The 8th Shanghai International Film Festival 12
tt0471382 Tatsuya Fuji The 8th Shanghai International Film Festival 12
Truncated to displaylimit of 10.

Choice C

In [25]:
%%sql
SELECT title_id, name, title,
       LENGTH(name),
       RANK() OVER (PARTITION BY title ORDER BY LENGTH(name) DESC)
         AS name_rank
FROM actor_title
WHERE title LIKE 'A %'
ORDER BY title, name_rank;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
38 rows affected.
Out[25]:
title_id name title length name_rank
tt2611968 Claude Julius A Girl's Affair 13 1
tt2611968 Bernard Susse A Girl's Affair 13 1
tt2611968 Chris Evans A Girl's Affair 11 3
tt2980210 Arcadiy Golubovich A Hologram for the King 18 1
tt2980210 Ben Whishaw A Hologram for the King 11 2
tt2980210 Tim O'Hair A Hologram for the King 10 3
tt2980210 Omar Elba A Hologram for the King 9 4
tt2980210 Tom Hanks A Hologram for the King 9 4
tt7405458 Cameron Britton A Man Called Ove 15 1
tt7405458 Marc Forster A Man Called Ove 12 2
Truncated to displaylimit of 10.
In [ ]: