# Run this cell to set up imports
import numpy as np
import pandas as pd
If you haven't yet, load in the imdb_lecture
and stops_lecture
databases.
!mkdir data
!cp -v -r ../lec02/data .
!cp -v -r ../lec04/data/stops_lecture.sql data/.
mkdir: cannot create directory ‘data’: File exists '../lec02/data/imdb_lecture.zip' -> './data/imdb_lecture.zip' '../lec02/data/imdb_lecture.sql' -> './data/imdb_lecture.sql' '../lec02/data/imdb_lecture_local.sql' -> './data/imdb_lecture_local.sql' '../lec04/data/stops_lecture.sql' -> 'data/./stops_lecture.sql'
!psql -h localhost -c 'DROP DATABASE IF EXISTS imdb_lecture'
!psql -h localhost -c 'CREATE DATABASE imdb_lecture'
!psql -h localhost -d imdb_lecture -f data/imdb_lecture.sql
DROP DATABASE CREATE DATABASE SET SET SET SET SET set_config ------------ (1 row) SET SET SET SET SET SET CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE COPY 289 COPY 6792 COPY 6201 COPY 1438 COPY 1911 COPY 537 COPY 1814
!psql -h localhost -c 'DROP DATABASE IF EXISTS stops_lecture'
!psql -h localhost -c 'CREATE DATABASE stops_lecture'
!psql -h localhost -d imdb_lecture -f data/stops_lecture.sql
DROP DATABASE CREATE DATABASE SET SET SET SET SET set_config ------------ (1 row) SET SET SET SET SET SET CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE COPY 84779 COPY 211 CREATE INDEX CREATE INDEX
%load_ext sql
The sql extension is already loaded. To reload it, use: %reload_ext sql
%%sql
SELECT 'Hello' || 'World',
STRPOS('Hello', 'el'),
SUBSTRING('Hello', 2, 3);
?column? | strpos | substring |
---|---|---|
HelloWorld | 2 | ell |
%%sql
SELECT REGEXP_REPLACE('Hannah Montana', '(.*)', '\1') as ex1,
REGEXP_REPLACE('Hannah Montana', '(.*) (.*)', '\2, \1') as ex2,
REGEXP_REPLACE('Phone Number 510 642 3214', '[a-zA-Z ]', '') as ex3,
REGEXP_REPLACE('Phone Number 510 642 3214', '[a-zA-Z ]', '', 'g') as ex4
;
ex1 | ex2 | ex3 | ex4 |
---|---|---|---|
Hannah Montana | Montana, Hannah | hone Number 510 642 3214 | 5106423214 |
%load_ext sql
The sql extension is already loaded. To reload it, use: %reload_ext sql
%%sql
SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\2') as lastname,
REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
FROM people;
lastname | firstname |
---|---|
Stains | Sun |
Samuels | Jeremiah |
Waymouth | Louis |
Gresham | John |
Spottiswoode | Roger |
Deibert | Amanda |
Cuervo | Cayetana Guillén |
Symonds | Craig |
Olías | Tito |
Nascimento | Décio Garcia |
%%sql
SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname,
COUNT(*) as countname
FROM people
GROUP BY firstname
ORDER BY countname desc;
firstname | countname |
---|---|
Chris | 69 |
Michael | 34 |
David | 25 |
John | 23 |
Robert | 14 |
James | 13 |
Scott | 13 |
Mark | 13 |
Peter | 13 |
Daniel | 11 |
%%sql
WITH firstnames AS (
SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
FROM people )
SELECT MAX(LENGTH(firstname))
FROM firstnames;
max |
---|
21 |
What is this first name? Why doesn't the below query work?
%%sql
WITH firstnames AS (
SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
FROM people )
SELECT MAX(LENGTH(firstname)), firstname
FROM firstnames;
RuntimeError: (psycopg2.errors.GroupingError) column "firstnames.firstname" must appear in the GROUP BY clause or be used in an aggregate function LINE 4: SELECT MAX(LENGTH(firstname)), firstname ^ [SQL: WITH firstnames AS ( SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname FROM people ) SELECT MAX(LENGTH(firstname)), firstname FROM firstnames;] (Background on this error at: https://sqlalche.me/e/20/f405) If you need help solving this issue, send us a message: https://ploomber.io/community
Solution:
%%sql
WITH firstnames AS (
SELECT REGEXP_REPLACE(name, '(.*) (.*)', '\1') as firstname
FROM people )
SELECT MAX(LENGTH(firstname)), firstname
FROM firstnames
GROUP BY firstname
ORDER BY max DESC;
max | firstname |
---|---|
21 | Princess Charlotte of |
20 | Ismaelpeter Casillas |
19 | Vincent Del Rosario |
19 | Rebecca Steel Roven |
18 | Danno Kristoper C. |
17 | Branford Marsalis |
17 | Arlene Newman-Van |
17 | Blind Mississippi |
17 | The Piedmont Bird |
16 | Cayetana Guillén |
%reload_ext sql
!psql -h localhost -c 'DROP DATABASE IF EXISTS stops_lecture'
!psql -h localhost -c 'CREATE DATABASE stops_lecture'
!psql -h localhost -d stops_lecture -f data/stops_lecture.sql
DROP DATABASE CREATE DATABASE SET SET SET SET SET set_config ------------ (1 row) SET SET SET SET SET SET CREATE TABLE ALTER TABLE CREATE TABLE ALTER TABLE COPY 84779 COPY 211 CREATE INDEX CREATE INDEX
%sql postgresql://127.0.0.1:5432/stops_lecture
%%sql
SELECT COUNT(*)
FROM stops;
count |
---|
84779 |
%%sql
SELECT DISTINCT COUNT(*)
FROM Zips;
count |
---|
211 |
%%sql
SELECT COUNT(*)
FROM Stops
INNER JOIN Zips
ON Stops.location = Zips.location;
count |
---|
15743 |
%reload_ext sql
%sql --close postgresql://127.0.0.1:5432/stops_lecture
%sql postgresql://127.0.0.1:5432/imdb_lecture
Make a temporary other table
%%sql
DROP TABLE IF EXISTS actor_title;
CREATE TABLE actor_title AS (
SELECT t1.primary_title AS title, t1.title_id, a1.name
FROM titles t1, people a1, crew c1
WHERE t1.title_id = c1.title_id AND a1.person_id = c1.person_id
);
%%sql
SELECT *
FROM actor_title
TABLESAMPLE BERNOULLI (5);
title | title_id | name |
---|---|---|
O Rei das Berlengas | tt0078148 | Zita Duarte |
Joe Versus the Volcano | tt0099892 | Richard Halsey |
Police Story 3: Super Cop | tt0104558 | Willie Chan |
The Power of One | tt0105159 | Dean Semler |
7 jin gong | tt0110923 | Michelle Yeoh |
Vault of Horror I | tt0111594 | Kyle MacLachlan |
AFI Life Achievement Award: A Tribute to Steven Spielberg | tt0114330 | Robert Shrum |
Toy Story | tt0114709 | Tom Hanks |
Cosmic Voyage | tt0115952 | Michael Miner |
That Thing You Do! | tt0117887 | Tom Everett Scott |
Choice A
%%sql
SELECT title_id, name, title,
AVG(LENGTH(name)) OVER (PARTITION BY title)
AS avg_name_length
FROM actor_title
WHERE title LIKE 'The %'
ORDER BY title;
title_id | name | title | avg_name_length |
---|---|---|---|
tt2174102 | Antwone Fisher | The 14th Annual Producers Guild of America Awards | 11.6666666666666667 |
tt2174102 | Adrien Brody | The 14th Annual Producers Guild of America Awards | 11.6666666666666667 |
tt2174102 | Tom Hanks | The 14th Annual Producers Guild of America Awards | 11.6666666666666667 |
tt2262378 | Tony Barbieri | The 64th Primetime Emmy Awards | 10.6666666666666667 |
tt2262378 | Aaron Paul | The 64th Primetime Emmy Awards | 10.6666666666666667 |
tt2262378 | Tom Hanks | The 64th Primetime Emmy Awards | 10.6666666666666667 |
tt0471382 | Brendan Fraser | The 8th Shanghai International Film Festival | 12.4000000000000000 |
tt0471382 | Karen Mok | The 8th Shanghai International Film Festival | 12.4000000000000000 |
tt0471382 | Jianxin Huang | The 8th Shanghai International Film Festival | 12.4000000000000000 |
tt0471382 | Tatsuya Fuji | The 8th Shanghai International Film Festival | 12.4000000000000000 |
Choice B
%%sql
SELECT title_id, name, title,
CAST(AVG(LENGTH(name)) OVER (PARTITION BY title)
AS INTEGER)
AS avg_name_length
FROM actor_title
WHERE title LIKE 'The %'
ORDER BY title;
title_id | name | title | avg_name_length |
---|---|---|---|
tt2174102 | Antwone Fisher | The 14th Annual Producers Guild of America Awards | 12 |
tt2174102 | Adrien Brody | The 14th Annual Producers Guild of America Awards | 12 |
tt2174102 | Tom Hanks | The 14th Annual Producers Guild of America Awards | 12 |
tt2262378 | Tony Barbieri | The 64th Primetime Emmy Awards | 11 |
tt2262378 | Aaron Paul | The 64th Primetime Emmy Awards | 11 |
tt2262378 | Tom Hanks | The 64th Primetime Emmy Awards | 11 |
tt0471382 | Brendan Fraser | The 8th Shanghai International Film Festival | 12 |
tt0471382 | Karen Mok | The 8th Shanghai International Film Festival | 12 |
tt0471382 | Jianxin Huang | The 8th Shanghai International Film Festival | 12 |
tt0471382 | Tatsuya Fuji | The 8th Shanghai International Film Festival | 12 |
Choice C
%%sql
SELECT title_id, name, title,
LENGTH(name),
RANK() OVER (PARTITION BY title ORDER BY LENGTH(name) DESC)
AS name_rank
FROM actor_title
WHERE title LIKE 'A %'
ORDER BY title, name_rank;
title_id | name | title | length | name_rank |
---|---|---|---|---|
tt2611968 | Claude Julius | A Girl's Affair | 13 | 1 |
tt2611968 | Bernard Susse | A Girl's Affair | 13 | 1 |
tt2611968 | Chris Evans | A Girl's Affair | 11 | 3 |
tt2980210 | Arcadiy Golubovich | A Hologram for the King | 18 | 1 |
tt2980210 | Ben Whishaw | A Hologram for the King | 11 | 2 |
tt2980210 | Tim O'Hair | A Hologram for the King | 10 | 3 |
tt2980210 | Omar Elba | A Hologram for the King | 9 | 4 |
tt2980210 | Tom Hanks | A Hologram for the King | 9 | 4 |
tt7405458 | Cameron Britton | A Man Called Ove | 15 | 1 |
tt7405458 | Marc Forster | A Man Called Ove | 12 | 2 |