
Commit 25e9362

[circle full] novelty detection example
1 parent 44ece37 commit 25e9362

2 files changed: +87 -3 lines changed
examples/neighbors/plot_lof_novelty_detection.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
"""
=================================================
Novelty detection with Local Outlier Factor (LOF)
=================================================

The Local Outlier Factor (LOF) algorithm is an unsupervised anomaly detection
method which computes the local density deviation of a given data point with
respect to its neighbors. It considers as outliers the samples that have a
substantially lower density than their neighbors. This example shows how to
use LOF for novelty detection. Note that when LOF is used for novelty
detection you MUST NOT use predict, decision_function and score_samples on
the training set, as this would lead to wrong results. You must only use
these methods on new unseen data (which are not in the training set). See the
:ref:`User Guide <outlier_detection>` for details on the difference between
outlier detection and novelty detection, and on how to use LOF for outlier
detection.

The number of neighbors considered (parameter n_neighbors) is typically
chosen 1) greater than the minimum number of objects a cluster has to
contain, so that other objects can be local outliers relative to this
cluster, and 2) smaller than the maximum number of close-by objects that can
potentially be local outliers. In practice, such information is generally not
available, and taking n_neighbors=20 appears to work well in general.
"""

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

np.random.seed(42)

xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
# Generate normal (not abnormal) training observations
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate new normal (not abnormal) observations
X = 0.3 * np.random.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# fit the model for novelty detection (novelty=True)
clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
clf.fit(X_train)
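# LOF scores for the training points themselves are available through the
# fitted attribute negative_outlier_factor_ rather than score_samples; lower
# (more negative) values indicate more abnormal samples, e.g.:
# train_scores = clf.negative_outlier_factor_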
# DO NOT use predict, decision_function and score_samples on X_train, as this
# would give wrong results; use them only on new unseen data (not used in
# X_train), e.g. X_test, X_outliers or the meshgrid
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size
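# predict returns +1 for inliers and -1 for outliers, so n_error_test counts
# regular test points wrongly flagged as abnormal and n_error_outliers counts
# abnormal points wrongly accepted as regular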

# plot the learned frontier and the points
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
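# the decision function is shifted so that 0 is the detection threshold:
# negative values are predicted outliers and the zero level set is the
# learned frontier plotted below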

plt.title("Novelty Detection with LOF")
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')

s = 40
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s,
                 edgecolors='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s,
                edgecolors='k')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([a.collections[0], b1, b2, c],
           ["learned frontier", "training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper left",
           prop=matplotlib.font_manager.FontProperties(size=11))
plt.xlabel(
    "errors novel regular: %d/40 ; errors novel abnormal: %d/20"
    % (n_error_test, n_error_outliers))
plt.show()
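
The key constraint above is that predict is only ever called on data that was
not used to fit the model. A minimal sketch contrasting the two LOF modes,
assuming the LocalOutlierFactor API used above (variable names are
illustrative):

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    rng = np.random.RandomState(0)
    X_fit = 0.3 * rng.randn(100, 2)             # regular training data
    X_new = np.array([[0.0, 0.0], [4.0, 4.0]])  # unseen data

    # outlier detection mode (default): label the training data itself;
    # predict, decision_function and score_samples are not meant to be
    # used in this mode
    lof = LocalOutlierFactor(n_neighbors=20)
    labels_fit = lof.fit_predict(X_fit)         # +1 inlier, -1 outlier

    # novelty detection mode: fit on regular data, predict only on new data
    lof_novelty = LocalOutlierFactor(n_neighbors=20, novelty=True)
    lof_novelty.fit(X_fit)
    labels_new = lof_novelty.predict(X_new)     # e.g. array([ 1, -1])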

examples/neighbors/plot_lof_outlier_detection.py

Lines changed: 4 additions & 3 deletions
@@ -3,7 +3,7 @@
 Outlier detection with Local Outlier Factor (LOF)
 =================================================

-The Local Outlier Factor (LOF) algorithm is an unsupervised outlier detection
+The Local Outlier Factor (LOF) algorithm is an unsupervised anomaly detection
 method which computes the local density deviation of a given data point with
 respect to its neighbors. It considers as outlier samples that have a
 substantially lower density than their neighbors. This example shows how to
@@ -22,12 +22,13 @@
 In practice, such informations are generally not available, and taking
 n_neighbors=20 appears to work well in general.
 """
-print(__doc__)

 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.neighbors import LocalOutlierFactor

+print(__doc__)
+
 np.random.seed(42)

 # Generate train data
@@ -52,7 +53,7 @@
 # function on a grid. Note that when using novelty=True, you MUST not use
 # predict, decision_function and score_samples on the training set X as this
 # would lead to wrong results. You must only use these methods on new unseen
-# data (not used in the training set)
+# data (which are not in the training set)

 # refit the model with novelty=True
 clf = LocalOutlierFactor(n_neighbors=20, novelty=True)
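
The refit is needed because the two modes serve different purposes: the
default mode scores the training data itself, while novelty=True scores
unseen points such as a plotting grid. A minimal sketch of the pattern this
example follows (the data and grid here are illustrative stand-ins):

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    X = np.r_[0.3 * np.random.randn(100, 2) + 2,
              0.3 * np.random.randn(100, 2) - 2]
    xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))

    # default mode: labels and LOF scores for the training data itself
    clf = LocalOutlierFactor(n_neighbors=20)
    y_pred = clf.fit_predict(X)
    X_scores = clf.negative_outlier_factor_

    # novelty mode: refit, then evaluate the decision function only on the
    # (unseen) grid points, never on X itself
    clf = LocalOutlierFactor(n_neighbors=20, novelty=True)
    clf.fit(X)
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)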
