import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)

sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline

%reload_ext sql
%sql postgresql://localhost:5432/postgres

%sql SELECT setseed(0.42) -- to fix randomness

%sql CREATE EXTENSION IF NOT EXISTS tablefunc;

%sql DROP TABLE IF EXISTS observations CASCADE;
%sql CREATE TABLE observations AS \
     SELECT normal_rand AS x \
     FROM normal_rand(1000, 50, 5);

results = %sql SELECT x FROM observations;

sns.displot(results.dict(), fill=True, kde=True, bins=20,
            height=4, aspect=2)

<seaborn.axisgrid.FacetGrid at 0x7f17d6cbc6d0>

# construct the view

%%sql
CREATE OR REPLACE VIEW normal_outliers AS
WITH bounds AS (
   SELECT avg(x) - 2*stddev(x) AS lo,
   avg(x) + 2*stddev(x) AS hi
   FROM observations
)
SELECT x AS outlier
FROM observations o, bounds b 
WHERE x NOT BETWEEN b.lo AND b.hi;

# query the view
%sql SELECT * FROM normal_outliers;

results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
            height=4, aspect=1.5)
plt.xlim(30, 70)

(30.0, 70.0)

## plots outliers and original
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)

(30.0, 70.0)

# correct
results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)

(30.0, 70.0)

results = %sql SELECT x, 'original' AS label \
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers) \
               UNION ALL \
               SELECT x, 'cleaned' AS label \
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)

(30.0, 70.0)

results = %sql SELECT x, 'cleaned' AS label \
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=20,
                        height=4, aspect=2)
plt.xlim(30, 70)

(30.0, 70.0)

%sql UPDATE observations SET x = x*10 \
      WHERE x = (SELECT MAX(x) FROM observations);

%sql SELECT min(x), max(x) FROM observations

results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 700)

(30.0, 700.0)

results = %sql SELECT x, 'original' AS label \
               FROM observations \
               UNION ALL \
               SELECT x, 'cleaned' AS label\
               FROM observations \
               WHERE x NOT IN \
                   (SELECT * FROM normal_outliers)
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)

(30.0, 70.0)

## reset before moving on
%sql UPDATE observations SET x = x/10 \
      WHERE x = (SELECT MAX(x) FROM OBSERVATIONS);

import numpy as np

arr = np.array([92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, -40, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr

array([  92,   19,  101,   58, 1053,   91,   26,   78,   10,   13,  -40,
        101,   86,   85,   15,   89,   89,   28,   -5,   41])

np.mean(arr), np.median(arr)

(101.5, 68.0)

np.percentile(arr, 95, method='nearest')

101

np.percentile(arr, 5, method='nearest')

-5

arr_winsorized = np.array([92, 19, 101, 58, 101, 91, 26, 78, 10, 13, -5, 101, 86, 85, 15, 89, 89, 28, -5, 41])
arr_winsorized

array([ 92,  19, 101,  58, 101,  91,  26,  78,  10,  13,  -5, 101,  86,
        85,  15,  89,  89,  28,  -5,  41])

np.mean(arr_winsorized), np.median(arr_winsorized)

(55.65, 68.0)

%%sql
CREATE OR REPLACE VIEW p1p99 AS
SELECT percentile_cont(.01) WITHIN GROUP (ORDER BY x) AS p1,
           percentile_cont(.99) WITHIN GROUP (ORDER BY x) AS p99
      FROM observations;
SELECT * FROM p1p99;

%%sql
CREATE OR REPLACE VIEW trimmed_observations AS
SELECT o.x, 'trimmed' AS label
  FROM observations o, p1p99 p
 WHERE o.x BETWEEN p.p1 AND p.p99
UNION ALL
SELECT o.x, 'original' AS label
  FROM observations o;

CREATE OR REPLACE VIEW trimmed_outliers AS
SELECT o.*
  FROM observations o, p1p99 p
 WHERE o.x NOT BETWEEN p.p1 AND p.p99;

results = %sql SELECT * from trimmed_observations
sns.displot(results.dict(), x="x", kind='hist', hue='label', kde=True, bins=20,
           height=4, aspect=1.5)

<seaborn.axisgrid.FacetGrid at 0x7f185c64d4d0>

# corrupt one value
%sql UPDATE observations SET x = x*10 \
      WHERE x = (SELECT MAX(x) FROM observations);

results = %sql SELECT * from trimmed_observations
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)

(30.0, 70.0)

## reset before moving on
# %sql UPDATE observations SET x = x*10 \
#       WHERE x = (SELECT MAX(x) FROM OBSERVATIONS);

%%sql
CREATE OR REPLACE VIEW winsorized_observations AS
SELECT CASE WHEN o.x BETWEEN p.p1 AND p.p99 THEN o.x
            WHEN o.x < p.p1 THEN p.p1
            WHEN o.x > p.p99 THEN p.p99
        END AS x,
      'winsorized' AS label
  FROM observations o, p1p99 p
UNION ALL
SELECT o.x, 'original' AS label
  FROM observations o;

results = %sql SELECT * from winsorized_observations
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)

(30.0, 70.0)

%%sql 
SELECT 'orig' AS distribution, min(x),
       percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
       percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
       percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
       max(x), avg(x), stddev(x), count(x) 
       FROM observations
UNION ALL
SELECT 'winsorized', min(x),
       percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
       percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
       percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
       max(x), avg(x), stddev(x), count(x) 
       FROM winsorized_observations WHERE label = 'winsorized'
UNION ALL 
SELECT 'trimmed', min(x),
       percentile_disc(.25) WITHIN GROUP (ORDER BY x) as p25,
       percentile_disc(.50) WITHIN GROUP (ORDER BY x) as median,
       percentile_disc(.75) WITHIN GROUP (ORDER BY x) as p75,
       max(x), avg(x), stddev(x), count(x) 
       FROM trimmed_observations WHERE label = 'trimmed';

%%sql
-- percentile_disc returns an actual data value near the percentile
-- percentile_cont returns an interpolated value at the percentile
CREATE OR REPLACE VIEW median AS
(SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY x) as median
  FROM observations);

%%sql
CREATE OR REPLACE VIEW mad AS
WITH
absdevs AS
(SELECT abs(x - median) as d
   FROM observations, median)
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY d) as mad
  FROM absdevs;
    
SELECT median, mad
  FROM median, mad;

%%sql
CREATE OR REPLACE VIEW hampelx84x2_observations AS
SELECT o.x,
      'hampelx84x2' AS label
  FROM observations o, median, mad
 WHERE o.x BETWEEN (median-2*1.4826*mad) AND (median+2*1.4826*mad)
UNION ALL
SELECT o.x, 'orig' AS label
  FROM observations o;

CREATE OR REPLACE VIEW Hampel84x2_outliers AS
SELECT x
  FROM observations o, median, mad
 WHERE x NOT BETWEEN (median - 2*1.4826*mad) AND (median + 2*1.4826*mad);

results = %sql SELECT * FROM hampelx84x2_observations
sns.displot(results.dict(), x="x", kind='hist',
            hue='label', kde=True, bins=np.linspace(32, 70, 20),
            height=4, aspect=1.5, rug=True)
plt.xlim(30, 70)

(30.0, 70.0)

distribution	min	p25	median	p75	max	avg	stddev	count
orig	33.77264290468849	46.54189424401416	49.858174101838706	53.358019083478794	649.0347505480154	50.55321958353861	19.566801533697422	1000
winsorized	38.57167993709219	46.54189424401416	49.858174101838706	53.358019083478794	61.85860550205813	49.97367467515923	4.858971275103253	1000
trimmed	38.57185271323869	46.63073906852975	49.858174101838706	53.26122620657416	61.85808489308678	49.968746755885455	4.617482190703675	980

Lecture 14: Outliers¶

Gaussian Outliers¶

What does this code do?¶

Choices [hidden]¶

What if we suddenly had extremes?¶

Winsorization¶

Trimming vs. Winsorizatoin¶

Does corruption affect?¶

Winsorization¶

Robustness, Mean, Median, Hampel X84¶

outlier
39.91898867462347
36.13940018676634
60.51259654525464
62.594491408385366
61.48076083878877
61.91014579022145
64.90347505480153
39.625287333768966
62.78550731086386
60.692317512763616