Lecture 14: Outliers¶

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)

sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline

Gaussian Outliers¶

In [2]:
%reload_ext sql
%sql postgresql://localhost:5432/postgres
In [3]:
%sql SELECT setseed(0.42) -- to fix randomness
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[3]:
setseed

Just like a Python package, we need to import tablefunc in order to use the normal_rand function.

In [4]:
%sql CREATE EXTENSION IF NOT EXISTS tablefunc;
Running query in 'postgresql://localhost:5432/postgres'
Out[4]:

Assume our datapoints are truly Normal. Simulate them in observations.

  • 1000 datapoints, $\mu = 50, \sigma = 5$.
In [5]:
%sql DROP TABLE IF EXISTS observations CASCADE;
%sql CREATE TABLE observations AS \
     SELECT normal_rand AS x \
     FROM normal_rand(1000, 50, 5);

results = %sql SELECT x FROM observations;
Running query in 'postgresql://localhost:5432/postgres'
Running query in 'postgresql://localhost:5432/postgres'
1000 rows affected.
Running query in 'postgresql://localhost:5432/postgres'
1000 rows affected.
In [6]:
sns.displot(results.dict(), fill=True, kde=True, bins=20,
            height=4, aspect=2)
Out[6]:
<seaborn.axisgrid.FacetGrid at 0x7f17d6cbc6d0>
No description has been provided for this image

Construct a view called normal_outliers using the 2 sigma metric.

In [7]:
# construct the view
In [8]:
%%sql
CREATE OR REPLACE VIEW normal_outliers AS
WITH bounds AS (
   SELECT avg(x) - 2*stddev(x) AS lo,
   avg(x) + 2*stddev(x) AS hi
   FROM observations
)
SELECT x AS outlier
FROM observations o, bounds b 
WHERE x NOT BETWEEN b.lo AND b.hi;
Running query in 'postgresql://localhost:5432/postgres'
Out[8]:
In [9]:
# query the view
%sql SELECT * FROM normal_outliers;
Running query in 'postgresql://localhost:5432/postgres'
54 rows affected.
Out[9]:
outlier
39.91898867462347
36.13940018676634
60.51259654525464
62.594491408385366
61.48076083878877
61.91014579022145
64.90347505480153
39.625287333768966
62.78550731086386
60.692317512763616
Truncated to displaylimit of 10.

What does this code do?¶

In [10]:
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
            height=4, aspect=1.5)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1946 rows affected.
Out[10]:
(30.0, 70.0)
No description has been provided for this image










Choices [hidden]¶

A.

In [11]:
## plots outliers and original
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1054 rows affected.
Out[11]:
(30.0, 70.0)
No description has been provided for this image

B.

In [12]:
# correct
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1946 rows affected.
Out[12]:
(30.0, 70.0)
No description has been provided for this image

C.

In [13]:
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers) \
               UNION ALL \
               SELECT x, 'cleaned' AS label \
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1892 rows affected.
Out[13]:
(30.0, 70.0)
No description has been provided for this image

D.

In [14]:
results = %sql SELECT x, 'cleaned' AS label \
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
946 rows affected.
Out[14]:
(30.0, 70.0)
No description has been provided for this image

What if we suddenly had extremes?¶

In [15]:
%sql UPDATE observations SET x = x*10 \
      WHERE x = (SELECT MAX(x) FROM observations);
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[15]:
In [16]:
%sql SELECT min(x), max(x) FROM observations
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[16]:
min max
33.77264290468849 649.0347505480154
In [17]:
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 700)
Running query in 'postgresql://localhost:5432/postgres'
1999 rows affected.
Out[17]:
(30.0, 700.0)
No description has been provided for this image
In [18]:
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1999 rows affected.
Out[18]:
(30.0, 70.0)
No description has been provided for this image
In [19]:
## reset before moving on
%sql UPDATE observations SET x = x/10 \
      WHERE x = (SELECT MAX(x) FROM OBSERVATIONS);
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[19]:

Winsorization¶

In [20]:
import numpy as np
In [21]:
arr = np.array([92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, -40, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr
Out[21]:
array([  92,   19,  101,   58, 1053,   91,   26,   78,   10,   13,  -40,
        101,   86,   85,   15,   89,   89,   28,   -5,   41])
In [22]:
np.mean(arr), np.median(arr)
Out[22]:
(101.5, 68.0)
In [23]:
np.percentile(arr, 95, method='nearest')
Out[23]:
101
In [24]:
np.percentile(arr, 5, method='nearest')
Out[24]:
-5
In [25]:
arr_winsorized = np.array([92, 19, 101, 58, 101, 91, 26, 78, 10, 13, -5, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr_winsorized
Out[25]:
array([ 92,  19, 101,  58, 101,  91,  26,  78,  10,  13,  -5, 101,  86,
        85,  15,  89,  89,  28,  -5,  41])
In [26]:
np.mean(arr_winsorized), np.median(arr_winsorized)
Out[26]:
(55.65, 68.0)

Trimming vs. Winsorizatoin¶

In [27]:
%%sql
CREATE OR REPLACE VIEW p1p99 AS
SELECT percentile_cont(.01) WITHIN GROUP (ORDER BY x) AS p1,
           percentile_cont(.99) WITHIN GROUP (ORDER BY x) AS p99
      FROM observations;
SELECT * FROM p1p99;
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[27]:
p1 p99
38.57167993709219 61.85860550205813
In [28]:
%%sql
CREATE OR REPLACE VIEW trimmed_observations AS
SELECT o.x, 'trimmed' AS label
  FROM observations o, p1p99 p
 WHERE o.x BETWEEN p.p1 AND p.p99
UNION ALL
SELECT o.x, 'original' AS label
  FROM observations o;

CREATE OR REPLACE VIEW trimmed_outliers AS
SELECT o.*
  FROM observations o, p1p99 p
 WHERE o.x NOT BETWEEN p.p1 AND p.p99;
Running query in 'postgresql://localhost:5432/postgres'
Out[28]:
In [29]:
results = %sql SELECT * from trimmed_observations
sns.displot(results.dict(), x="x", kind='hist', hue='label', kde=True, bins=20,
           height=4, aspect=1.5)
Running query in 'postgresql://localhost:5432/postgres'
1980 rows affected.
Out[29]:
<seaborn.axisgrid.FacetGrid at 0x7f185c64d4d0>
No description has been provided for this image

Does corruption affect?¶

In [30]:
# corrupt one value
%sql UPDATE observations SET x = x*10 \
      WHERE x = (SELECT MAX(x) FROM observations);
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[30]:

Turns out, we still have as many outliers, which we trim.

Because we are trimming based on the order of the data and not on the distribution statistics (mean/SD)

In [31]:
results = %sql SELECT * from trimmed_observations
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1980 rows affected.
Out[31]:
(30.0, 70.0)
No description has been provided for this image

Winsorization¶

In [32]:
## reset before moving on
# %sql UPDATE observations SET x = x*10 \
#       WHERE x = (SELECT MAX(x) FROM OBSERVATIONS);
In [33]:
%%sql
CREATE OR REPLACE VIEW winsorized_observations AS
SELECT CASE WHEN o.x BETWEEN p.p1 AND p.p99 THEN o.x
            WHEN o.x < p.p1 THEN p.p1
            WHEN o.x > p.p99 THEN p.p99
        END AS x,
      'winsorized' AS label
  FROM observations o, p1p99 p
UNION ALL
SELECT o.x, 'original' AS label
  FROM observations o;
Running query in 'postgresql://localhost:5432/postgres'
Out[33]:
In [34]:
results = %sql SELECT * from winsorized_observations
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
2000 rows affected.
Out[34]:
(30.0, 70.0)
No description has been provided for this image
In [35]:
%%sql 
SELECT 'orig' AS distribution, min(x),
       percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
       percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
       percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
       max(x), avg(x), stddev(x), count(x) 
       FROM observations
UNION ALL
SELECT 'winsorized', min(x),
       percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
       percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
       percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
       max(x), avg(x), stddev(x), count(x) 
       FROM winsorized_observations WHERE label = 'winsorized'
UNION ALL 
SELECT 'trimmed', min(x),
       percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
       percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
       percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
       max(x), avg(x), stddev(x), count(x) 
       FROM trimmed_observations WHERE label = 'trimmed';
Running query in 'postgresql://localhost:5432/postgres'
3 rows affected.
Out[35]:
distribution min p25 median p75 max avg stddev count
orig 33.77264290468849 46.54189424401416 49.858174101838706 53.358019083478794 649.0347505480154 50.55321958353861 19.566801533697422 1000
winsorized 38.57167993709219 46.54189424401416 49.858174101838706 53.358019083478794 61.85860550205813 49.97367467515923 4.858971275103253 1000
trimmed 38.57185271323869 46.63073906852975 49.858174101838706 53.26122620657416 61.85808489308678 49.968746755885455 4.617482190703675 980

Robustness, Mean, Median, Hampel X84¶

In [36]:
%%sql
-- percentile_disc returns an actual data value near the percentile
-- percentile_cont returns an interpolated value at the percentile
CREATE OR REPLACE VIEW median AS
(SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY x) as median
  FROM observations);
Running query in 'postgresql://localhost:5432/postgres'
Out[36]:
In [37]:
%%sql
CREATE OR REPLACE VIEW mad AS
WITH
absdevs AS
(SELECT abs(x - median) as d
   FROM observations, median)
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY d) as mad
  FROM absdevs;
    
SELECT median, mad
  FROM median, mad;
Running query in 'postgresql://localhost:5432/postgres'
1 rows affected.
Out[37]:
median mad
49.858174101838706 3.3797636753810707
In [38]:
%%sql
CREATE OR REPLACE VIEW hampelx84x2_observations AS
SELECT o.x,
      'hampelx84x2' AS label
  FROM observations o, median, mad
 WHERE o.x BETWEEN (median-2*1.4826*mad) AND (median+2*1.4826*mad)
UNION ALL
SELECT o.x, 'orig' AS label
  FROM observations o;

CREATE OR REPLACE VIEW Hampel84x2_outliers AS
SELECT x
  FROM observations o, median, mad
 WHERE x NOT BETWEEN (median - 2*1.4826*mad) AND (median + 2*1.4826*mad);
Running query in 'postgresql://localhost:5432/postgres'
Out[38]:
In [39]:
results = %sql SELECT * FROM hampelx84x2_observations
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)
Running query in 'postgresql://localhost:5432/postgres'
1948 rows affected.
Out[39]:
(30.0, 70.0)
No description has been provided for this image