Python实现异常检测算法

应用高斯分布开发异常检测算法。这个比较简单,高斯分布也叫做正态分布,高中就学过。如果我们的数据符合高斯分布或者比较接近高斯分布,就可以使用这个算法:先通过训练集估计每个特征的均值 μ 和方差 σ²,得到高斯概率密度函数 p(x);再利用交叉验证集选择合适的阈值 ε;当测试数据的概率密度 p(x) 小于 ε 时则判定为异常。

#!/usr/bin/python
# coding=utf-8
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio

# Load the exercise data set (MATLAB .mat file).
data = sio.loadmat('./data/ex8data1.mat')
X = data['X']  # training set
Xval, Yval = data['Xval'], data['yval']  # cross-validation set and its labels

# Index with a list so each feature keeps its column axis; this form of
# indexing returns copies, so changing X1 / X2 later does not modify X.
X1, X2 = X[:, [0]], X[:, [1]]

# Scatter plot of the two training features against each other.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X1, X2)
plt.show()


# Compute the mean (mu) and the variance (sigma squared) of each feature.
def aveandvar(x):
    """Return the per-feature mean and population variance of ``x``.

    Statistics are taken over the first axis: every element of ``x`` is one
    sample and, for 2-D input, every column is one feature.

    Args:
        x: Array-like of samples, e.g. shape ``(m,)`` or ``(m, n)``.

    Returns:
        Tuple ``(ave, var)`` of arrays with one entry per feature. The
        variance is normalised by the number of samples ``m`` (not
        ``m - 1``). 1-D input yields arrays of shape ``(1,)``.

    Raises:
        ValueError: If ``x`` contains no samples.
    """
    samples = np.asarray(x)
    if len(samples) == 0:
        raise ValueError("aveandvar() requires at least one sample")

    # atleast_1d keeps the result indexable (ave[0]) even for 1-D input.
    ave = np.atleast_1d(samples.mean(axis=0))
    var = np.atleast_1d(samples.var(axis=0))
    return ave, var


# Probability density p(x) given the samples, the mean and the variance.
def gaussian_distribution(x, u, s):
    """Evaluate the univariate Gaussian density for every sample in ``x``.

    Args:
        x: Iterable of samples; each element is a scalar or a row of
            feature values.
        u: Mean of the distribution (scalar or one value per feature).
        s: Variance of the distribution, i.e. sigma squared (scalar or one
            value per feature).

    Returns:
        ``numpy.ndarray`` with one density entry per element of ``x``.
    """
    def density(sample):
        deviation = sample - u
        scale = 1 / np.sqrt(2 * np.pi * s)
        return scale * np.exp(-(deviation * deviation) / (2 * s))

    return np.array([density(sample) for sample in x])

# Choose the anomaly threshold.
def select_threshold(pval, yval):
    """Pick the density threshold that maximises F1 on a labelled set.

    Scans 10000 evenly spaced candidate thresholds between the smallest and
    the largest density in ``pval``. A sample is predicted anomalous when
    its density is strictly below the candidate threshold.

    Args:
        pval: Array-like of densities for the validation samples.
        yval: Array-like of ground-truth labels (1 = anomaly, 0 = normal),
            broadcastable against ``pval``.

    Returns:
        Tuple ``(best_epsilon, best_f1)``. Both are ``0`` when no candidate
        threshold produces a true positive, when ``pval`` is empty, or when
        all densities are identical.
    """
    pval = np.asarray(pval)
    yval = np.asarray(yval)
    best_epsilon = 0
    best_f1 = 0

    if pval.size == 0:
        return best_epsilon, best_f1

    lowest, highest = pval.min(), pval.max()
    step = (highest - lowest) / 10000
    if not step > 0:
        # All densities are equal (or NaN): there is no range to scan and
        # np.arange cannot build a grid from a zero step.
        return best_epsilon, best_f1

    is_anomaly = yval == 1
    is_normal = yval == 0

    for epsilon in np.arange(lowest, highest, step):
        preds = pval < epsilon

        tp = np.sum(np.logical_and(preds, is_anomaly)).astype(float)
        if tp == 0:
            # Precision/recall would be 0 or 0/0 here; such a candidate can
            # never beat best_f1, so skip it instead of dividing by zero.
            continue
        fp = np.sum(np.logical_and(preds, is_normal)).astype(float)
        fn = np.sum(np.logical_and(~preds, is_anomaly)).astype(float)

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = (2 * precision * recall) / (precision + recall)

        # Strict comparison keeps the smallest threshold among ties.
        if f1 > best_f1:
            best_f1 = f1
            best_epsilon = epsilon

    return best_epsilon, best_f1


# Plot the fitted Gaussian curve of each feature. The samples are sorted
# first (in place) so that the line plot runs left to right.
X1.sort(axis=0)
X2.sort(axis=0)

u, s = aveandvar(X1)
px1 = gaussian_distribution(X1, u, s)
u, s = aveandvar(X2)
px2 = gaussian_distribution(X2, u, s)

# Top panel: first feature.
plt.subplot(2, 1, 1)
plt.title('X1')
plt.plot(X1, px1)
# Bottom panel: second feature.
plt.subplot(2, 1, 2)
plt.title('X2')
plt.plot(X2, px2)
plt.show()

# Fit the per-feature Gaussian parameters on the training set only.
u, s = aveandvar(X)
px = gaussian_distribution(X, u, s)

# Score the cross-validation set under the *training-set* parameters, so
# that the densities line up with the labels in Yval.
tpx = gaussian_distribution(Xval, u, s)

# The density of a sample is the product of its per-feature densities; the
# threshold is chosen on (and later compared against) that joint density.
epsilon, f1 = select_threshold(tpx.prod(axis=1), Yval.ravel())
print(epsilon, f1)

# Mark the training samples whose joint density falls below the threshold.
outliers = np.where(px.prod(axis=1) < epsilon)
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X[:, 0], X[:, 1])
ax.scatter(X[outliers[0], 0], X[outliers[0], 1], s=50, color='r', marker='o')
plt.show()

# Cross-check against the library implementation.
from scipy import stats
# scipy.stats.norm takes the standard deviation as `scale`, while `s` holds
# variances, hence the square root.
px = stats.norm(loc=u, scale=np.sqrt(s)).pdf(X)
outliers = np.where(px.prod(axis=1) < epsilon)
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X[:, 0], X[:, 1])
ax.scatter(X[outliers[0], 0], X[outliers[0], 1], s=50, color='r', marker='o')
plt.show()

这是我们的训练集合,明显有六个是异常数据

image

画出两个特征的高斯函数,比较像高斯分布

image

通过我自己写的高斯密度函数计算,有些过拟合,多拟合到了两个点,不知道为什么。

image

调用scipy的高斯函数库计算后完美的检测到了异常数据

image