Outlier detection comparison

Note: to run the experiment and use import pycy_usporf module

  • Open SPORF/Python/rerf and download the following files into your local computer

    • pycy_usporf.py
  • Open SPORF/Python/src and download the following files into your local computer

    • cy_usporf.pyx
    • setup.py
  • Open terminal, cd to the files location and run

    • $ python setup.py build_ext --inplace
      
[1]:
%matplotlib inline
[2]:
"""
==================================================
Benchmark performance of USPORF in 3D toy dataset
==================================================

This example compares the performance of isolarion forest (IF)
and USPORF on outlier detection with different 3D datasets.

Algorithms:
- Isolation Forest, `sklearn.ensemble.IsolationForest`
- USPORF using distance matrix, `UnsupervisedRandomForest`
    - C++ backend, cannot extract trees structure
- Usporf using path length, `pycy_usporf.UForest`
    - Cython backend, can extract trees structure

Benchmarks
- Accuracy_score: top left of each data figure
- Computational time: bottom right of each data figure
- AUC score: far right colum in ROC figures

"""
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import time
from sklearn.datasets import make_moons, make_blobs
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.ensemble import IsolationForest
from rerf.urerf import UnsupervisedRandomForest
import pycy_usporf

##  Note: to run an example
# Download `pycy_usporf.py` and `cy_usporf.pyx`
# into your local drive and generate `.so` module using `setup.py`
[3]:
# Data parameter
d_noise = 5  # number of Gaussian dimensional noise
n_samples = 500
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers
line_sd = 0.06  # sd of the line in dataset 1 and 2
cluster_sd = 0.2  # sd of the cluster in dataset 4 and 5
noise_sd = 2  # sd of the cluster in dimensional noise

print("Experimental setup\n===================")
print("informative dimension = 5,\nnoise dimension = %d," %(d_noise))
print("sample size = %d,\noutlier fraction = %.2f" %(
    n_samples, outliers_fraction))
Experimental setup
===================
informative dimension = 5,
noise dimension = 5,
sample size = 500,
outlier fraction = 0.15
[4]:
# Define datasets
## 1: Linear
def fun_linear(samples=1, sd=0.0):
    t_lin = np.transpose(np.linspace(-1, 1, samples))
    X_lin = np.c_[
        0.4 * t_lin + sd * np.random.randn(samples),
        0.6 * t_lin + sd * np.random.randn(samples),
        t_lin + sd * np.random.randn(samples),
    ]
    return X_lin


X_lin = fun_linear(samples=n_inliers, sd=line_sd)

## 2: Helix
def fun_helix(samples=1, sd=0.0):
    t_hex = np.transpose(np.linspace(2 * np.pi, 9 * np.pi, samples))
    xline = t_hex * np.cos(t_hex)  # before rescale
    xline = xline / (max(xline) - min(xline)) * 2 + sd * np.random.randn(samples)
    yline = t_hex * np.sin(t_hex)  # before rescale
    yline = yline / (max(yline) - min(yline)) * 2 + sd * np.random.randn(samples)
    zline = (t_hex - (max(t_hex) + min(t_hex)) / 2) / (
        max(t_hex) - min(t_hex)
    ) * 2 + sd * np.random.randn(samples)
    X_hex = np.c_[xline, yline, zline]
    return X_hex


X_hex = fun_helix(samples=n_inliers, sd=line_sd)

## 3: Sphere, equally distribution
def fibonacci_sphere(samples=1, randomize=True):
    rnd = 1.0
    if randomize:
        rnd = np.random.random() * samples
    points = []
    offset = 2.0 / samples
    increment = np.pi * (3.0 - np.sqrt(5.0))
    for i in range(samples):
        y = ((i * offset) - 1) + (offset / 2)
        r = np.sqrt(1 - pow(y, 2))
        phi = ((i + rnd) % samples) * increment
        x = np.cos(phi) * r
        z = np.sin(phi) * r
        points.append([x, y, z])
    return points


X_sph = np.array(fibonacci_sphere(samples=n_inliers))

## 4: Gaussian Mixture
def gaussian_blobs(samples=3, sd=0.0):
    blobs_params = dict(random_state=0, n_samples=samples, n_features=3)
    X_gau = make_blobs(
        centers=[[-0.7, -0.7, -0.7], [0, 0, 0], [0.7, 0.7, 0.7]],
        cluster_std=[sd, sd, sd],
        **blobs_params
    )[0]
    return X_gau


X_gau = gaussian_blobs(samples=n_inliers, sd=cluster_sd)

## 5: Misaligned Gaussian Mixture
def misaligned_blobs(samples=3, sd=0.0):
    blobs_params = dict(random_state=0, n_samples=samples, n_features=3)
    X_misaligned = make_blobs(
        centers=[[-0.7, -0.7, -0.7], [0.7, 0.7, -0.7], [-0.7, 0.7, 0.7]],
        cluster_std=[sd, sd, sd],
        **blobs_params
    )[0]
    return X_misaligned


X_misaligned = misaligned_blobs(samples=n_inliers, sd=cluster_sd)

## 6: Whole dataset
datasets3D = [X_lin, X_hex, X_sph, X_gau, X_misaligned]

# define to data label: y_true
y_true = np.concatenate([np.ones(n_inliers), -np.ones(n_outliers)], axis=0)
# label 1 as inliers, -1 as outliers
[5]:
# Define algorithms
anomaly_algorithms = [
    ("IF", IsolationForest(contamination=outliers_fraction,
                           random_state=42)),
    ("Usporf_distance", UnsupervisedRandomForest(n_estimators=100, random_state=0)),
    ("Usporf_depth_cy", pycy_usporf.UForest(n_estimators=100)),
]
[6]:
# related functions

def mat_to_score(mat, power =2):
    s = (mat**power).sum(axis=1)/mat.shape[0]
    return s

def decision_function(s_train, s_test, less_inlier = True): # array
    if s_test is None:
        s_test=s_train
    if less_inlier == True:
        s_train = s_train
        s_test = s_test
    if less_inlier == False:
        s_train = -s_train
        s_test = -s_test
    offset_ = np.percentile(s_train, outliers_fraction*100)
    decision_function = s_test - offset_
    return decision_function

def OutPredict(decision_function):
    is_inlier = np.ones(decision_function.shape[0], dtype=int)
    is_inlier[decision_function <= 0] = -1
    return is_inlier
[7]:
# Plot
plt.figure(figsize=((len(anomaly_algorithms)) * 2.5 + 1, len(datasets3D) * 2 + 1))
plt.subplots_adjust(
    left=0.02, right=0.98, bottom=0.001, top=0.98, wspace=0.05, hspace=0.01
)
plot_num = 1
rng = np.random.RandomState(42)
for i_dataset3D, X in enumerate(datasets3D):
    # add uniform distribution outliers
    X = np.concatenate([X, rng.uniform(low=-1.5, high=1.5, size=(n_outliers, 3))], axis=0)
    # add Gaussian dimensional noise, set the center at origin
    X = np.concatenate([X, 14.*(np.random.RandomState(42).rand(n_samples, d_noise))], axis=1)

    # list of AUC and ROC
    list_AUC = []
    list_fpr = []
    list_tpr = []
    list_thresh = []


    algo_index = 0
    for name, algorithm in anomaly_algorithms:
        t0 = time.time()
        algorithm.fit(X)

        # predict outlier
        if name == "Usporf_distance":
            d_sim = algorithm.transform()
            s = mat_to_score(mat=d_sim)
            probas_ = decision_function(s_train=s, s_test=s, less_inlier= True)
            y_pred = OutPredict(decision_function=probas_)
        elif name == "Usporf_depth" or name == "Usporf_depth_cy":
            s = algorithm.compute_paths(X)
            probas_ = decision_function(s_train=s, s_test=s, less_inlier= False)
            y_pred = OutPredict(decision_function=probas_)
        else:
            probas_ = algorithm.fit(X).decision_function(X)
            y_pred = algorithm.predict(X)

        AUC = roc_auc_score(y_true, probas_)
        fpr, tpr, thresholds = roc_curve(y_true, probas_)
        thresh_index = np.where(abs(thresholds) == min(abs(thresholds)))[0][0]
        # store ROC curve
        list_AUC.append(AUC)
        list_fpr.append(fpr)
        list_tpr.append(tpr)
        list_thresh.append(thresh_index)


        # compute the accuracy
        acc = accuracy_score(y_true, y_pred)
        t1 = time.time()

        # add data plot
        ax = plt.subplot(
            len(datasets3D), len(anomaly_algorithms)+1, plot_num, projection="3d"
        )
        ax.axis("on")
        if i_dataset3D == 0:
            plt.title(
                str(algo_index + 1) + ") " + name, size=10, color="black", weight="bold"
            )  # use function's name for a title
        colors = np.array(["#377eb8", "#ff7f00"])
        # color plot ('blue' = outlier, 'orange'=inlier)
        ax.scatter3D(X[:, 0], X[:, 1], X[:, 2], s=5, color=colors[((y_pred + 1) // 2)])
        ax.text2D(
            0.01,
            0.85,
            ("acc %.3f" % acc).lstrip("0"),
            transform=plt.gca().transAxes,
            size=15,
            horizontalalignment="left",
        )
        ax.text2D(
            0.99,
            0.01,
            ("%.2fs" % (t1 - t0)).lstrip("0"),
            transform=plt.gca().transAxes,
            size=15,
            horizontalalignment="right",
        )
        ax.set_xticks([])
        ax.set_yticks([])
        ax.zaxis.set_ticklabels([])
        algo_index += 1
        plot_num += 1

#------------------------------------------------------
# plot the ROC curves and show AUC scores

    plt.subplot(len(datasets3D), len(anomaly_algorithms) + 1, plot_num) # +1 for ROC column

    if i_dataset3D == 0:
        plt.title("ROC", size=15, color="black", weight="bold")

        # lebel the decision_function's thresholds
        plt.scatter([], [], marker="x", color="black", label="thresholds")

    for algo_index in range(len(anomaly_algorithms)):

        if i_dataset3D == 0:
            plt.plot(list_fpr[algo_index], list_tpr[algo_index],
                label="algo " + str(algo_index + 1)+ ")"
                + (" AUC %.2f" % list_AUC[algo_index]).lstrip("0"))
        else:
            plt.plot(list_fpr[algo_index], list_tpr[algo_index],
                label= str(algo_index + 1)+ ")"
                + (" %.2f" % list_AUC[algo_index]).lstrip("0"))

        plt.scatter(
        list_fpr[algo_index][list_thresh[algo_index]],
        list_tpr[algo_index][list_thresh[algo_index]],
        s=40, marker="x", color = 'black')

    plt.plot(np.array([0, 1]), np.array([0, 1]), linestyle="--", color="black")
    plt.legend()
    plt.tick_params(labelleft = False, labelbottom = False, direction  = "in")
    plot_num += 1

plt.show()
../../_images/demos_URerF_3d_usporf_outlier_detection_7_0.png