Lecture 14: Outliers¶
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)
sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline
Gaussian Outliers¶
In [2]:
%reload_ext sql
%sql postgresql://localhost:5432/postgres
In [3]:
%sql SELECT setseed(0.42) -- to fix randomness
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[3]:
setseed |
---|
Just like a Python package, we need to import tablefunc
in order to use the normal_rand
function.
In [4]:
%sql CREATE EXTENSION IF NOT EXISTS tablefunc;
Running query in 'postgresql://localhost:5432/postgres'
Out[4]:
Assume our datapoints are truly Normal. Simulate them in observations
.
- 1000 datapoints, $\mu = 50, \sigma = 5$.
In [5]:
%sql DROP TABLE IF EXISTS observations CASCADE;
%sql CREATE TABLE observations AS \
SELECT normal_rand AS x \
FROM normal_rand(1000, 50, 5);
results = %sql SELECT x FROM observations;
Running query in 'postgresql://localhost:5432/postgres'
Running query in 'postgresql://localhost:5432/postgres'
1000 rows affected.
Running query in 'postgresql://localhost:5432/postgres'
1000 rows affected.
In [6]:
sns.displot(results.dict(), fill=True, kde=True, bins=20,
height=4, aspect=2)
Out[6]:
<seaborn.axisgrid.FacetGrid at 0x7f17d6cbc6d0>
Construct a view called normal_outliers
using the 2 sigma metric.
In [7]:
# construct the view
In [8]:
%%sql
CREATE OR REPLACE VIEW normal_outliers AS
WITH bounds AS (
SELECT avg(x) - 2*stddev(x) AS lo,
avg(x) + 2*stddev(x) AS hi
FROM observations
)
SELECT x AS outlier
FROM observations o, bounds b
WHERE x NOT BETWEEN b.lo AND b.hi;
Running query in 'postgresql://localhost:5432/postgres'
Out[8]:
In [9]:
# query the view
%sql SELECT * FROM normal_outliers;
Running query in 'postgresql://localhost:5432/postgres'
54 rows affected.
Out[9]:
outlier |
---|
39.91898867462347 |
36.13940018676634 |
60.51259654525464 |
62.594491408385366 |
61.48076083878877 |
61.91014579022145 |
64.90347505480153 |
39.625287333768966 |
62.78550731086386 |
60.692317512763616 |
What does this code do?¶
In [10]:
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=1.5)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1946 rows affected.
Out[10]:
(30.0, 70.0)
Choices [hidden]¶
A.
In [11]:
## plots outliers and original
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=2)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1054 rows affected.
Out[11]:
(30.0, 70.0)
B.
In [12]:
# correct
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=2)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1946 rows affected.
Out[12]:
(30.0, 70.0)
C.
In [13]:
results = %sql SELECT x, 'original' AS label \
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers) \
UNION ALL \
SELECT x, 'cleaned' AS label \
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=2)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1892 rows affected.
Out[13]:
(30.0, 70.0)
D.
In [14]:
results = %sql SELECT x, 'cleaned' AS label \
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=20,
height=4, aspect=2)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
946 rows affected.
Out[14]:
(30.0, 70.0)
What if we suddenly had extremes?¶
In [15]:
%sql UPDATE observations SET x = x*10 \
WHERE x = (SELECT MAX(x) FROM observations);
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[15]:
In [16]:
%sql SELECT min(x), max(x) FROM observations
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[16]:
min | max |
---|---|
33.77264290468849 | 649.0347505480154 |
In [17]:
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=np.linspace(32, 70, 20),
height=4, aspect=1.5, rug=True)
plt.xlim(30, 700)
Running query in 'postgresql://localhost:5432/postgres'
1999 rows affected.
Out[17]:
(30.0, 700.0)
In [18]:
results = %sql SELECT x, 'original' AS label \
FROM observations \
UNION ALL \
SELECT x, 'cleaned' AS label\
FROM observations \
WHERE x NOT IN \
(SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=np.linspace(32, 70, 20),
height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1999 rows affected.
Out[18]:
(30.0, 70.0)
In [19]:
## reset before moving on
%sql UPDATE observations SET x = x/10 \
WHERE x = (SELECT MAX(x) FROM OBSERVATIONS);
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[19]:
Winsorization¶
In [20]:
import numpy as np
In [21]:
arr = np.array([92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, -40, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr
Out[21]:
array([ 92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, -40, 101, 86, 85, 15, 89, 89, 28, -5, 41])
In [22]:
np.mean(arr), np.median(arr)
Out[22]:
(101.5, 68.0)
In [23]:
np.percentile(arr, 95, method='nearest')
Out[23]:
101
In [24]:
np.percentile(arr, 5, method='nearest')
Out[24]:
-5
In [25]:
arr_winsorized = np.array([92, 19, 101, 58, 101, 91, 26, 78, 10, 13, -5, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr_winsorized
Out[25]:
array([ 92, 19, 101, 58, 101, 91, 26, 78, 10, 13, -5, 101, 86, 85, 15, 89, 89, 28, -5, 41])
In [26]:
np.mean(arr_winsorized), np.median(arr_winsorized)
Out[26]:
(55.65, 68.0)
Trimming vs. Winsorizatoin¶
In [27]:
%%sql
CREATE OR REPLACE VIEW p1p99 AS
SELECT percentile_cont(.01) WITHIN GROUP (ORDER BY x) AS p1,
percentile_cont(.99) WITHIN GROUP (ORDER BY x) AS p99
FROM observations;
SELECT * FROM p1p99;
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[27]:
p1 | p99 |
---|---|
38.57167993709219 | 61.85860550205813 |
In [28]:
%%sql
CREATE OR REPLACE VIEW trimmed_observations AS
SELECT o.x, 'trimmed' AS label
FROM observations o, p1p99 p
WHERE o.x BETWEEN p.p1 AND p.p99
UNION ALL
SELECT o.x, 'original' AS label
FROM observations o;
CREATE OR REPLACE VIEW trimmed_outliers AS
SELECT o.*
FROM observations o, p1p99 p
WHERE o.x NOT BETWEEN p.p1 AND p.p99;
Running query in 'postgresql://localhost:5432/postgres'
Out[28]:
In [29]:
results = %sql SELECT * from trimmed_observations
sns.displot(results.dict(), x="x", kind='hist', hue='label', kde=True, bins=20,
height=4, aspect=1.5)
Running query in 'postgresql://localhost:5432/postgres'
1980 rows affected.
Out[29]:
<seaborn.axisgrid.FacetGrid at 0x7f185c64d4d0>
Does corruption affect?¶
In [30]:
# corrupt one value
%sql UPDATE observations SET x = x*10 \
WHERE x = (SELECT MAX(x) FROM observations);
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[30]:
Turns out, we still have as many outliers, which we trim.
Because we are trimming based on the order of the data and not on the distribution statistics (mean/SD)
In [31]:
results = %sql SELECT * from trimmed_observations
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=np.linspace(32, 70, 20),
height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1980 rows affected.
Out[31]:
(30.0, 70.0)
Winsorization¶
In [32]:
## reset before moving on
# %sql UPDATE observations SET x = x*10 \
# WHERE x = (SELECT MAX(x) FROM OBSERVATIONS);
In [33]:
%%sql
CREATE OR REPLACE VIEW winsorized_observations AS
SELECT CASE WHEN o.x BETWEEN p.p1 AND p.p99 THEN o.x
WHEN o.x < p.p1 THEN p.p1
WHEN o.x > p.p99 THEN p.p99
END AS x,
'winsorized' AS label
FROM observations o, p1p99 p
UNION ALL
SELECT o.x, 'original' AS label
FROM observations o;
Running query in 'postgresql://localhost:5432/postgres'
Out[33]:
In [34]:
results = %sql SELECT * from winsorized_observations
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=np.linspace(32, 70, 20),
height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
2000 rows affected.
Out[34]:
(30.0, 70.0)
In [35]:
%%sql
SELECT 'orig' AS distribution, min(x),
percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
max(x), avg(x), stddev(x), count(x)
FROM observations
UNION ALL
SELECT 'winsorized', min(x),
percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
max(x), avg(x), stddev(x), count(x)
FROM winsorized_observations WHERE label = 'winsorized'
UNION ALL
SELECT 'trimmed', min(x),
percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
max(x), avg(x), stddev(x), count(x)
FROM trimmed_observations WHERE label = 'trimmed';
Running query in 'postgresql://localhost:5432/postgres'
3 rows affected.
Out[35]:
distribution | min | p25 | median | p75 | max | avg | stddev | count |
---|---|---|---|---|---|---|---|---|
orig | 33.77264290468849 | 46.54189424401416 | 49.858174101838706 | 53.358019083478794 | 649.0347505480154 | 50.55321958353861 | 19.566801533697422 | 1000 |
winsorized | 38.57167993709219 | 46.54189424401416 | 49.858174101838706 | 53.358019083478794 | 61.85860550205813 | 49.97367467515923 | 4.858971275103253 | 1000 |
trimmed | 38.57185271323869 | 46.63073906852975 | 49.858174101838706 | 53.26122620657416 | 61.85808489308678 | 49.968746755885455 | 4.617482190703675 | 980 |
Robustness, Mean, Median, Hampel X84¶
In [36]:
%%sql
-- percentile_disc returns an actual data value near the percentile
-- percentile_cont returns an interpolated value at the percentile
CREATE OR REPLACE VIEW median AS
(SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY x) as median
FROM observations);
Running query in 'postgresql://localhost:5432/postgres'
Out[36]:
In [37]:
%%sql
CREATE OR REPLACE VIEW mad AS
WITH
absdevs AS
(SELECT abs(x - median) as d
FROM observations, median)
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY d) as mad
FROM absdevs;
SELECT median, mad
FROM median, mad;
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[37]:
median | mad |
---|---|
49.858174101838706 | 3.3797636753810707 |
In [38]:
%%sql
CREATE OR REPLACE VIEW hampelx84x2_observations AS
SELECT o.x,
'hampelx84x2' AS label
FROM observations o, median, mad
WHERE o.x BETWEEN (median-2*1.4826*mad) AND (median+2*1.4826*mad)
UNION ALL
SELECT o.x, 'orig' AS label
FROM observations o;
CREATE OR REPLACE VIEW Hampel84x2_outliers AS
SELECT x
FROM observations o, median, mad
WHERE x NOT BETWEEN (median - 2*1.4826*mad) AND (median + 2*1.4826*mad);
Running query in 'postgresql://localhost:5432/postgres'
Out[38]:
In [39]:
results = %sql SELECT * FROM hampelx84x2_observations
sns.displot(results.dict(), x="x", kind='hist',
hue='label', kde=True, bins=np.linspace(32, 70, 20),
height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1948 rows affected.
Out[39]:
(30.0, 70.0)