Lecture 05: Advanced SQL¶

In [1]:
# Run this cell to set up imports
import numpy as np
import pandas as pd

If you haven't yet, load in the imdb_lecture database.

In [2]:
!cp -v -r ../lec02/data .
'../lec02/data/imdb_lecture.sql' -> './data/imdb_lecture.sql'
'../lec02/data/imdb_lecture_local.sql' -> './data/imdb_lecture_local.sql'
In [3]:
%load_ext sql
In [4]:
!psql -h localhost -c 'DROP DATABASE IF EXISTS imdb_lecture'
!psql -h localhost -c 'CREATE DATABASE imdb_lecture' 
!psql -h localhost -d imdb_lecture -f data/imdb_lecture.sql
DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 8405
COPY 4043
COPY 4923
COPY 1223
COPY 820
COPY 2420
In [5]:
%sql postgresql://127.0.0.1:5432/imdb_lecture

Exercise #1: String manipulation¶

In [6]:
%%sql
SELECT 'Hello' || 'World',
       STRPOS('Hello', 'el'),
       SUBSTRING('Hello', 2, 3);
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1 rows affected.
Out[6]:
?column? strpos substring
HelloWorld 2 ell
In [7]:
%%sql
SELECT REGEXP_REPLACE('Hannah Montana', '(.*)', '\1') as ex1,
       REGEXP_REPLACE('Hannah Montana', '(.*) (.*)', '\2, \1') as ex2,
       REGEXP_REPLACE('Phone Number 510 642 3214', '[a-zA-Z ]', '') as ex3,
       REGEXP_REPLACE('Phone Number 510 642 3214', '[a-zA-Z ]', '', 'g') as ex4
;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1 rows affected.
Out[7]:
ex1 ex2 ex3 ex4
Hannah Montana Montana, Hannah hone Number 510 642 3214 5106423214

Exercise 2¶

Choice A¶

In [8]:
%sql postgresql://127.0.0.1:5432/imdb_lecture
In [9]:
%%sql
SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\2') as lastname,
       REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
FROM people;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1223 rows affected.
Out[9]:
lastname firstname
Hill Dwayne
Hardman Dave
Pryce-Jones Rich
Sadler William
Laurentiis Giada De
Hartmann Janine
Taliánová Tereza
Heywood Chris
Zhou Chengao
Langley Mark
Truncated to displaylimit of 10.

Choice B¶

In [10]:
%%sql
SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname,
       COUNT(*) as countname
FROM people
GROUP BY firstname
ORDER BY countname desc;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
871 rows affected.
Out[10]:
firstname countname
Chris 79
David 18
Michael 16
Steve 10
Peter 9
John 9
Tom 8
Mark 8
Dan 8
Paul 6
Truncated to displaylimit of 10.

Choice C¶

In [11]:
%%sql
WITH firstnames AS (
    SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
    FROM people )
SELECT MAX(LENGTH(firstname))
FROM firstnames;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
1 rows affected.
Out[11]:
max
20

What is this first name? Why doesn't the below query work?

In [12]:
%%sql
WITH firstnames AS (
    SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
    FROM people )
SELECT MAX(LENGTH(firstname)), firstname
FROM firstnames;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
RuntimeError: (psycopg2.errors.GroupingError) column "firstnames.firstname" must appear in the GROUP BY clause or be used in an aggregate function
LINE 4: SELECT MAX(LENGTH(firstname)), firstname
                                       ^

[SQL: WITH firstnames AS (
    SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
    FROM people )
SELECT MAX(LENGTH(firstname)), firstname
FROM firstnames;]
(Background on this error at: https://sqlalche.me/e/20/f405)
If you need help solving this issue, send us a message: https://ploomber.io/community

Solution:

In [13]:
%%sql
WITH firstnames AS (
    SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
    FROM people )
SELECT MAX(LENGTH(firstname)), firstname
FROM firstnames
GROUP BY firstname
ORDER BY max DESC;
Running query in 'postgresql://127.0.0.1:5432/imdb_lecture'
871 rows affected.
Out[13]:
max firstname
20 Juan Francisco Pérez
19 José María González
19 Margarida Mercês de
16 Dadinha Monteiro
16 Sreemoolanagaram
16 Michelle van der
15 Jonathan Taylor
15 Deirdre Coleman
15 Nam Hoi Sap-Sam
15 Michael Richard
Truncated to displaylimit of 10.
In [14]:
%sql --close postgresql://127.0.0.1:5432/imdb_lecture

Exercise #3: Grouping¶

In [15]:
%reload_ext sql
In [16]:
!psql -h localhost -c 'DROP DATABASE IF EXISTS stops_lecture'
!psql -h localhost -c 'CREATE DATABASE stops_lecture' 
!psql -h localhost -d stops_lecture -f data/stops_lecture.sql
DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 84779
COPY 211
CREATE INDEX
CREATE INDEX
In [17]:
%sql postgresql://127.0.0.1:5432/stops_lecture
Connecting and switching to connection postgresql://127.0.0.1:5432/stops_lecture
In [18]:
%%sql
SELECT COUNT(*)
FROM stops;
Running query in 'postgresql://127.0.0.1:5432/stops_lecture'
1 rows affected.
Out[18]:
count
84779
In [19]:
%%sql
SELECT DISTINCT COUNT(*)
FROM Zips;
Running query in 'postgresql://127.0.0.1:5432/stops_lecture'
1 rows affected.
Out[19]:
count
211
In [20]:
%%sql
SELECT COUNT(*)
FROM Stops
    INNER JOIN Zips
    ON Stops.location = Zips.location;
Running query in 'postgresql://127.0.0.1:5432/stops_lecture'
1 rows affected.
Out[20]:
count
15743
In [21]:
%%sql
SELECT COUNT(*)
FROM Stops
    FULL OUTER JOIN Zips
    ON Stops.location = Zips.location;
Running query in 'postgresql://127.0.0.1:5432/stops_lecture'
1 rows affected.
Out[21]:
count
87291
In [22]:
%%sql
SELECT COUNT(*)
FROM Stops
    LEFT OUTER JOIN Zips
    ON Stops.location = Zips.location;
Running query in 'postgresql://127.0.0.1:5432/stops_lecture'
1 rows affected.
Out[22]:
count
87115
In [23]:
%%sql
SELECT COUNT(*)
FROM Stops
    RIGHT OUTER JOIN Zips
    ON Stops.location = Zips.location;
Running query in 'postgresql://127.0.0.1:5432/stops_lecture'
1 rows affected.
Out[23]:
count
15919
In [24]:
%sql --close postgresql://127.0.0.1:5432/stops_lecture