"""
=================================================
Novelty detection with Local Outlier Factor (LOF)
=================================================

The Local Outlier Factor (LOF) algorithm is an unsupervised anomaly detection
method which computes the local density deviation of a given data point with
respect to its neighbors. It considers as outliers the samples that have a
substantially lower density than their neighbors. This example shows how to
use LOF for novelty detection. Note that when LOF is used for novelty
detection you MUST not use predict, decision_function and score_samples on the
training set as this would lead to wrong results. You must only use these
methods on new unseen data (which are not in the training set). See the
:ref:`User Guide <outlier_detection>` for details on the difference between
outlier detection and novelty detection and how to use LOF for outlier
detection.

The number of neighbors considered (parameter n_neighbors) is typically
chosen 1) greater than the minimum number of objects a cluster has to contain,
so that other objects can be local outliers relative to this cluster, and 2)
smaller than the maximum number of close-by objects that can potentially be
local outliers.
In practice, such information is generally not available, and taking
n_neighbors=20 appears to work well in general.
"""

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

# Fix the RNG seed so the generated data (and the plot) are reproducible.
np.random.seed(42)

# Grid over [-5, 5] x [-5, 5] used later to draw the learned frontier.
xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
# Generate normal (not abnormal) training observations: two Gaussian
# clusters centered at (2, 2) and (-2, -2), 200 points total.
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate new normal (not abnormal) observations from the same clusters.
X = 0.3 * np.random.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations, uniform over the plot area.
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# Fit the model for novelty detection (novelty=True); contamination sets
# the decision threshold on the scores.
clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
clf.fit(X_train)
# DO NOT use predict, decision_function and score_samples on X_train as this
# would give wrong results but only on new unseen data (not used in X_train),
# e.g. X_test, X_outliers or the meshgrid
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
# Prediction errors: regular test points flagged as outliers (-1), and
# abnormal points flagged as inliers (+1).
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# plot the learned frontier, the points, and the nearest vectors to the plane
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Novelty Detection with LOF")
# Filled contours for the outlier region (negative scores) ...
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
# ... the zero-level frontier itself ...
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
# ... and the inlier region (positive scores).
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')

s = 40
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s,
                 edgecolors='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s,
                edgecolors='k')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([a.collections[0], b1, b2, c],
           ["learned frontier", "training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper left",
           prop=matplotlib.font_manager.FontProperties(size=11))
plt.xlabel(
    "errors novel regular: %d/40 ; errors novel abnormal: %d/40"
    % (n_error_test, n_error_outliers))
plt.show()