import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)
sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline
%reload_ext sql
%sql postgresql://localhost:5432/postgres
%sql SELECT setseed(0.42) -- to fix randomness
setseed |
---|
Just like a Python package, we need to import tablefunc
in order to use the normal_rand
function.
%sql CREATE EXTENSION IF NOT EXISTS tablefunc;
Assume our datapoints are truly Normal. Simulate them in observations
.
%sql DROP TABLE IF EXISTS observations CASCADE;
%sql CREATE TABLE observations AS \
SELECT normal_rand AS x \
FROM normal_rand(1000, 50, 5);
results = %sql SELECT x FROM observations;
sns.displot(results.dict(), fill=True, kde=True, bins=20,
height=4, aspect=2)
<seaborn.axisgrid.FacetGrid at 0x7f2e62b58d50>
Construct a view called normal_outliers
using the 2 sigma metric.
# construct the view
%%sql
CREATE OR REPLACE VIEW normal_outliers AS
WITH bounds AS (
SELECT avg(x) - 2*stddev(x) AS lo,
avg(x) + 2*stddev(x) AS hi
FROM observations
)
SELECT x AS outlier
FROM observations o, bounds b
WHERE x NOT BETWEEN b.lo AND b.hi;
# query the view
%sql SELECT * FROM normal_outliers;
outlier |
---|
59.82230258659907 |
39.688576522693424 |
61.3054707730887 |
61.29016170171167 |
37.638986178841535 |
38.80156045940347 |
36.444988608291084 |
34.799624865301666 |
39.71593476243275 |
61.272855781811394 |
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=1.5)
plt.xlim(30, 70)
(30.0, 70.0)
A.
## plots outliers and original
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=2)
plt.xlim(30, 70)
(30.0, 70.0)
B.
# correct
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=2)
plt.xlim(30, 70)
(30.0, 70.0)
C.
results = %sql SELECT x, 'original' AS label \
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers) \
UNION ALL \
SELECT x, 'cleaned' AS label \
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=2)
plt.xlim(30, 70)
(30.0, 70.0)
D.
results = %sql SELECT x, 'cleaned' AS label \
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=2)
plt.xlim(30, 70)
(30.0, 70.0)
%sql UPDATE observations SET x = x*10 \
WHERE x = (SELECT MAX(x) FROM observations);
%sql SELECT min(x), max(x) FROM observations
min | max |
---|---|
33.81466007123295 | 681.9343532189914 |
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=np.linspace(32, 70, 20),
height=4, aspect=1.5, rug=True)
plt.xlim(30, 700)
(30.0, 700.0)
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=np.linspace(32, 70, 20),
height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
(30.0, 70.0)
## reset before moving on
%sql UPDATE observations SET x = x/10 \
WHERE x = (SELECT MAX(x) FROM OBSERVATIONS);
import numpy as np
arr = np.array([92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, -40, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr
array([ 92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, -40, 101, 86, 85, 15, 89, 89, 28, -5, 41])
np.mean(arr), np.median(arr)
(101.5, 68.0)
np.percentile(arr, 95, method='nearest')
101
np.percentile(arr, 5, method='nearest')
-5
arr_winsorized = np.array([92, 19, 101, 58, 101, 91, 26, 78, 10, 13, -5, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr_winsorized
array([ 92, 19, 101, 58, 101, 91, 26, 78, 10, 13, -5, 101, 86, 85, 15, 89, 89, 28, -5, 41])
np.mean(arr_winsorized), np.median(arr_winsorized)
(55.65, 68.0)
%%sql
CREATE OR REPLACE VIEW p1p99 AS
SELECT percentile_cont(.01) WITHIN GROUP (ORDER BY x) AS p1,
percentile_cont(.99) WITHIN GROUP (ORDER BY x) AS p99
FROM observations;
SELECT * FROM p1p99;
p1 | p99 |
---|---|
39.11510510063385 | 62.38853671085507 |
%%sql
CREATE OR REPLACE VIEW trimmed_observations AS
SELECT o.x, 'trimmed' AS label
FROM observations o, p1p99 p
WHERE o.x BETWEEN p.p1 AND p.p99
UNION ALL
SELECT o.x, 'original' AS label
FROM observations o;
CREATE OR REPLACE VIEW trimmed_outliers AS
SELECT o.*
FROM observations o, p1p99 p
WHERE o.x NOT BETWEEN p.p1 AND p.p99;
results = %sql SELECT * from trimmed_observations
sns.displot(results.dict(), x="x", kind='hist', hue='label', kde=True, bins=20,
height=4, aspect=1.5)
<seaborn.axisgrid.FacetGrid at 0x7f2e50e25390>