Commit dd9dcf5

Pushing the docs to dev/ for branch: main, commit 9c3c2c5a318c5b98ee4fd6454c01d04079a44a7f
1 parent 894785a commit dd9dcf5

File tree: 1,265 files changed (+5233, -4864 lines)

Lines changed: 103 additions & 49 deletions

@@ -1,67 +1,121 @@
 """
-==========================================
+=======================
 IsolationForest example
-==========================================
+=======================

 An example using :class:`~sklearn.ensemble.IsolationForest` for anomaly
 detection.

-The IsolationForest 'isolates' observations by randomly selecting a feature
-and then randomly selecting a split value between the maximum and minimum
-values of the selected feature.
+The :ref:`isolation_forest` is an ensemble of "Isolation Trees" that "isolate"
+observations by recursive random partitioning, which can be represented by a
+tree structure. The number of splittings required to isolate a sample is lower
+for outliers and higher for inliers.

-Since recursive partitioning can be represented by a tree structure, the
-number of splittings required to isolate a sample is equivalent to the path
-length from the root node to the terminating node.
-
-This path length, averaged over a forest of such random trees, is a measure
-of normality and our decision function.
-
-Random partitioning produces noticeable shorter paths for anomalies.
-Hence, when a forest of random trees collectively produce shorter path lengths
-for particular samples, they are highly likely to be anomalies.
+In the present example we demo two ways to visualize the decision boundary of an
+Isolation Forest trained on a toy dataset.

 """

+# %%
+# Data generation
+# ---------------
+#
+# We generate two clusters (each one containing `n_samples`) by randomly
+# sampling the standard normal distribution as returned by
+# :func:`numpy.random.randn`. One of them is spherical and the other one is
+# slightly deformed.
+#
+# For consistency with the :class:`~sklearn.ensemble.IsolationForest` notation,
+# the inliers (i.e. the gaussian clusters) are assigned a ground truth label `1`
+# whereas the outliers (created with :func:`numpy.random.uniform`) are assigned
+# the label `-1`.
+
 import numpy as np
+from sklearn.model_selection import train_test_split
+
+n_samples, n_outliers = 120, 40
+rng = np.random.RandomState(0)
+covariance = np.array([[0.5, -0.1], [0.7, 0.4]])
+cluster_1 = 0.4 * rng.randn(n_samples, 2) @ covariance + np.array([2, 2])  # general
+cluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])  # spherical
+outliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))
+
+X = np.concatenate([cluster_1, cluster_2, outliers])
+y = np.concatenate(
+    [np.ones((2 * n_samples), dtype=int), -np.ones((n_outliers), dtype=int)]
+)
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
+
+# %%
+# We can visualize the resulting clusters:
+
 import matplotlib.pyplot as plt
-from sklearn.ensemble import IsolationForest

-rng = np.random.RandomState(42)
+scatter = plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
+handles, labels = scatter.legend_elements()
+plt.axis("square")
+plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
+plt.title("Gaussian inliers with \nuniformly distributed outliers")
+plt.show()

-# Generate train data
-X = 0.3 * rng.randn(100, 2)
-X_train = np.r_[X + 2, X - 2]
-# Generate some regular novel observations
-X = 0.3 * rng.randn(20, 2)
-X_test = np.r_[X + 2, X - 2]
-# Generate some abnormal novel observations
-X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
+# %%
+# Training of the model
+# ---------------------

-# fit the model
-clf = IsolationForest(max_samples=100, random_state=rng)
+from sklearn.ensemble import IsolationForest
+
+clf = IsolationForest(max_samples=100, random_state=0)
 clf.fit(X_train)
-y_pred_train = clf.predict(X_train)
-y_pred_test = clf.predict(X_test)
-y_pred_outliers = clf.predict(X_outliers)
-
-# plot the line, the samples, and the nearest vectors to the plane
-xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
-Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
-Z = Z.reshape(xx.shape)
-
-plt.title("IsolationForest")
-plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
-
-b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=20, edgecolor="k")
-b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="green", s=20, edgecolor="k")
-c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="red", s=20, edgecolor="k")
-plt.axis("tight")
-plt.xlim((-5, 5))
-plt.ylim((-5, 5))
-plt.legend(
-    [b1, b2, c],
-    ["training observations", "new regular observations", "new abnormal observations"],
-    loc="upper left",
+
+# %%
+# Plot discrete decision boundary
+# -------------------------------
+#
+# We use the class :class:`~sklearn.inspection.DecisionBoundaryDisplay` to
+# visualize a discrete decision boundary. The background color represents
+# whether a sample in that given area is predicted to be an outlier
+# or not. The scatter plot displays the true labels.
+
+import matplotlib.pyplot as plt
+from sklearn.inspection import DecisionBoundaryDisplay
+
+disp = DecisionBoundaryDisplay.from_estimator(
+    clf,
+    X,
+    response_method="predict",
+    alpha=0.5,
+)
+disp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
+disp.ax_.set_title("Binary decision boundary \nof IsolationForest")
+plt.axis("square")
+plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
+plt.show()
+
+# %%
+# Plot path length decision boundary
+# ----------------------------------
+#
+# By setting the `response_method="decision_function"`, the background of the
+# :class:`~sklearn.inspection.DecisionBoundaryDisplay` represents the measure of
+# normality of an observation. Such score is given by the path length averaged
+# over a forest of random trees, which itself is given by the depth of the leaf
+# (or equivalently the number of splits) required to isolate a given sample.
+#
+# When a forest of random trees collectively produce short path lengths for
+# isolating some particular samples, they are highly likely to be anomalies and
+# the measure of normality is close to `0`. Similarly, large paths correspond to
+# values close to `1` and are more likely to be inliers.
+
+disp = DecisionBoundaryDisplay.from_estimator(
+    clf,
+    X,
+    response_method="decision_function",
+    alpha=0.5,
 )
+disp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
+disp.ax_.set_title("Path length decision boundary \nof IsolationForest")
+plt.axis("square")
+plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
+plt.colorbar(disp.ax_.collections[1])
 plt.show()
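
As context for the reworked example above: the "measure of normality" that the new narrative describes is exposed on the fitted estimator through `score_samples`, `decision_function` and `offset_`. The following standalone sketch is not part of this commit; the toy data and the `X_demo`/`clf_demo` names are invented purely for illustration of how those quantities relate in scikit-learn's IsolationForest.

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
# Made-up data: a dense Gaussian blob plus a few uniformly scattered points.
X_demo = np.concatenate([0.3 * rng.randn(100, 2), rng.uniform(-4, 4, size=(10, 2))])

clf_demo = IsolationForest(max_samples=64, random_state=0).fit(X_demo)

# score_samples returns the (negated) averaged path-length score:
# the lower the value, the more abnormal the sample.
scores = clf_demo.score_samples(X_demo)

# decision_function is the same score shifted by the fitted offset_,
# so that negative values mean "outlier" ...
decision = clf_demo.decision_function(X_demo)
assert np.allclose(decision, scores - clf_demo.offset_)

# ... which is exactly the threshold predict() applies to produce -1/1 labels.
assert np.array_equal(clf_demo.predict(X_demo), np.where(decision < 0, -1, 1))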

dev/_downloads/4cf0456267ced0f869a458ef4776d4c5/plot_release_highlights_1_1_0.py

Lines changed: 3 additions & 3 deletions
@@ -78,7 +78,7 @@
         ("num", numeric_transformer, numeric_features),
         (
             "cat",
-            OneHotEncoder(handle_unknown="ignore", sparse=False),
+            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
             categorical_features,
         ),
     ],
@@ -113,7 +113,7 @@
 X = np.array(
     [["dog"] * 5 + ["cat"] * 20 + ["rabbit"] * 10 + ["snake"] * 3], dtype=object
 ).T
-enc = OneHotEncoder(min_frequency=6, sparse=False).fit(X)
+enc = OneHotEncoder(min_frequency=6, sparse_output=False).fit(X)
 enc.infrequent_categories_

 # %%
@@ -211,7 +211,7 @@

 X, _ = make_blobs(n_samples=1000, centers=2, random_state=0)

-km = KMeans(n_clusters=5, random_state=0).fit(X)
+km = KMeans(n_clusters=5, random_state=0, n_init="auto").fit(X)
 bisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)

 fig, ax = plt.subplots(1, 2, figsize=(10, 5))
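
The two parameter changes in this file track scikit-learn deprecations: OneHotEncoder's `sparse` keyword was renamed to `sparse_output`, and KMeans accepts `n_init="auto"` to silence the warning about its changing default. A minimal sketch of both follows; it is not part of this commit, the data and variable names are made up, and the version-specific behaviour is assumed from the 1.2-era deprecation notes.

import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

X_cat = np.array([["dog"], ["cat"], ["cat"], ["snake"]], dtype=object)

# sparse_output=False requests a dense ndarray instead of a SciPy sparse matrix;
# it replaces the `sparse` keyword used before this commit.
enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False).fit(X_cat)
print(enc.transform([["dog"], ["fish"]]))  # unknown "fish" encodes as an all-zero row

# n_init="auto" lets KMeans choose the number of centroid seedings itself,
# avoiding the FutureWarning about the changing default.
X_num = np.random.RandomState(0).randn(50, 2)
km = KMeans(n_clusters=2, n_init="auto", random_state=0).fit(X_num)
print(km.cluster_centers_.shape)  # (2, 2)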

dev/_downloads/68fdea23e50d165632d4bd4e36453cd5/plot_release_highlights_1_1_0.ipynb

Lines changed: 3 additions & 3 deletions
@@ -51,7 +51,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.compose import ColumnTransformer\nfrom sklearn.preprocessing import OneHotEncoder, StandardScaler\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.linear_model import LogisticRegression\n\nX, y = fetch_openml(\n \"titanic\", version=1, as_frame=True, return_X_y=True, parser=\"pandas\"\n)\nnumeric_features = [\"age\", \"fare\"]\nnumeric_transformer = make_pipeline(SimpleImputer(strategy=\"median\"), StandardScaler())\ncategorical_features = [\"embarked\", \"pclass\"]\n\npreprocessor = ColumnTransformer(\n [\n (\"num\", numeric_transformer, numeric_features),\n (\n \"cat\",\n OneHotEncoder(handle_unknown=\"ignore\", sparse=False),\n categorical_features,\n ),\n ],\n verbose_feature_names_out=False,\n)\nlog_reg = make_pipeline(preprocessor, SelectKBest(k=7), LogisticRegression())\nlog_reg.fit(X, y)"
+"from sklearn.compose import ColumnTransformer\nfrom sklearn.preprocessing import OneHotEncoder, StandardScaler\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.linear_model import LogisticRegression\n\nX, y = fetch_openml(\n \"titanic\", version=1, as_frame=True, return_X_y=True, parser=\"pandas\"\n)\nnumeric_features = [\"age\", \"fare\"]\nnumeric_transformer = make_pipeline(SimpleImputer(strategy=\"median\"), StandardScaler())\ncategorical_features = [\"embarked\", \"pclass\"]\n\npreprocessor = ColumnTransformer(\n [\n (\"num\", numeric_transformer, numeric_features),\n (\n \"cat\",\n OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False),\n categorical_features,\n ),\n ],\n verbose_feature_names_out=False,\n)\nlog_reg = make_pipeline(preprocessor, SelectKBest(k=7), LogisticRegression())\nlog_reg.fit(X, y)"
 ]
 },
 {
@@ -87,7 +87,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.preprocessing import OneHotEncoder\nimport numpy as np\n\nX = np.array(\n [[\"dog\"] * 5 + [\"cat\"] * 20 + [\"rabbit\"] * 10 + [\"snake\"] * 3], dtype=object\n).T\nenc = OneHotEncoder(min_frequency=6, sparse=False).fit(X)\nenc.infrequent_categories_"
+"from sklearn.preprocessing import OneHotEncoder\nimport numpy as np\n\nX = np.array(\n [[\"dog\"] * 5 + [\"cat\"] * 20 + [\"rabbit\"] * 10 + [\"snake\"] * 3], dtype=object\n).T\nenc = OneHotEncoder(min_frequency=6, sparse_output=False).fit(X)\nenc.infrequent_categories_"
 ]
 },
 {
@@ -148,7 +148,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.datasets import make_blobs\nfrom sklearn.cluster import KMeans, BisectingKMeans\nimport matplotlib.pyplot as plt\n\nX, _ = make_blobs(n_samples=1000, centers=2, random_state=0)\n\nkm = KMeans(n_clusters=5, random_state=0).fit(X)\nbisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)\n\nfig, ax = plt.subplots(1, 2, figsize=(10, 5))\nax[0].scatter(X[:, 0], X[:, 1], s=10, c=km.labels_)\nax[0].scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=20, c=\"r\")\nax[0].set_title(\"KMeans\")\n\nax[1].scatter(X[:, 0], X[:, 1], s=10, c=bisect_km.labels_)\nax[1].scatter(\n bisect_km.cluster_centers_[:, 0], bisect_km.cluster_centers_[:, 1], s=20, c=\"r\"\n)\n_ = ax[1].set_title(\"BisectingKMeans\")"
+"from sklearn.datasets import make_blobs\nfrom sklearn.cluster import KMeans, BisectingKMeans\nimport matplotlib.pyplot as plt\n\nX, _ = make_blobs(n_samples=1000, centers=2, random_state=0)\n\nkm = KMeans(n_clusters=5, random_state=0, n_init=\"auto\").fit(X)\nbisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)\n\nfig, ax = plt.subplots(1, 2, figsize=(10, 5))\nax[0].scatter(X[:, 0], X[:, 1], s=10, c=km.labels_)\nax[0].scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=20, c=\"r\")\nax[0].set_title(\"KMeans\")\n\nax[1].scatter(X[:, 0], X[:, 1], s=10, c=bisect_km.labels_)\nax[1].scatter(\n bisect_km.cluster_centers_[:, 0], bisect_km.cluster_centers_[:, 1], s=20, c=\"r\"\n)\n_ = ax[1].set_title(\"BisectingKMeans\")"
 ]
 }
 ],

dev/_downloads/f39c19ddd9f1c49a604c054eff707568/plot_isolation_forest.ipynb

Lines changed: 81 additions & 2 deletions
@@ -15,7 +15,86 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# IsolationForest example\n\nAn example using :class:`~sklearn.ensemble.IsolationForest` for anomaly\ndetection.\n\nThe IsolationForest 'isolates' observations by randomly selecting a feature\nand then randomly selecting a split value between the maximum and minimum\nvalues of the selected feature.\n\nSince recursive partitioning can be represented by a tree structure, the\nnumber of splittings required to isolate a sample is equivalent to the path\nlength from the root node to the terminating node.\n\nThis path length, averaged over a forest of such random trees, is a measure\nof normality and our decision function.\n\nRandom partitioning produces noticeable shorter paths for anomalies.\nHence, when a forest of random trees collectively produce shorter path lengths\nfor particular samples, they are highly likely to be anomalies.\n"
+"\n# IsolationForest example\n\nAn example using :class:`~sklearn.ensemble.IsolationForest` for anomaly\ndetection.\n\nThe `isolation_forest` is an ensemble of \"Isolation Trees\" that \"isolate\"\nobservations by recursive random partitioning, which can be represented by a\ntree structure. The number of splittings required to isolate a sample is lower\nfor outliers and higher for inliers.\n\nIn the present example we demo two ways to visualize the decision boundary of an\nIsolation Forest trained on a toy dataset.\n"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Data generation\n\nWe generate two clusters (each one containing `n_samples`) by randomly\nsampling the standard normal distribution as returned by\n:func:`numpy.random.randn`. One of them is spherical and the other one is\nslightly deformed.\n\nFor consistency with the :class:`~sklearn.ensemble.IsolationForest` notation,\nthe inliers (i.e. the gaussian clusters) are assigned a ground truth label `1`\nwhereas the outliers (created with :func:`numpy.random.uniform`) are assigned\nthe label `-1`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import numpy as np\nfrom sklearn.model_selection import train_test_split\n\nn_samples, n_outliers = 120, 40\nrng = np.random.RandomState(0)\ncovariance = np.array([[0.5, -0.1], [0.7, 0.4]])\ncluster_1 = 0.4 * rng.randn(n_samples, 2) @ covariance + np.array([2, 2])  # general\ncluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])  # spherical\noutliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))\n\nX = np.concatenate([cluster_1, cluster_2, outliers])\ny = np.concatenate(\n [np.ones((2 * n_samples), dtype=int), -np.ones((n_outliers), dtype=int)]\n)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"We can visualize the resulting clusters:\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\n\nscatter = plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor=\"k\")\nhandles, labels = scatter.legend_elements()\nplt.axis(\"square\")\nplt.legend(handles=handles, labels=[\"outliers\", \"inliers\"], title=\"true class\")\nplt.title(\"Gaussian inliers with \\nuniformly distributed outliers\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Training of the model\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.ensemble import IsolationForest\n\nclf = IsolationForest(max_samples=100, random_state=0)\nclf.fit(X_train)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Plot discrete decision boundary\n\nWe use the class :class:`~sklearn.inspection.DecisionBoundaryDisplay` to\nvisualize a discrete decision boundary. The background color represents\nwhether a sample in that given area is predicted to be an outlier\nor not. The scatter plot displays the true labels.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\nfrom sklearn.inspection import DecisionBoundaryDisplay\n\ndisp = DecisionBoundaryDisplay.from_estimator(\n clf,\n X,\n response_method=\"predict\",\n alpha=0.5,\n)\ndisp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor=\"k\")\ndisp.ax_.set_title(\"Binary decision boundary \\nof IsolationForest\")\nplt.axis(\"square\")\nplt.legend(handles=handles, labels=[\"outliers\", \"inliers\"], title=\"true class\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Plot path length decision boundary\n\nBy setting the `response_method=\"decision_function\"`, the background of the\n:class:`~sklearn.inspection.DecisionBoundaryDisplay` represents the measure of\nnormality of an observation. Such score is given by the path length averaged\nover a forest of random trees, which itself is given by the depth of the leaf\n(or equivalently the number of splits) required to isolate a given sample.\n\nWhen a forest of random trees collectively produce short path lengths for\nisolating some particular samples, they are highly likely to be anomalies and\nthe measure of normality is close to `0`. Similarly, large paths correspond to\nvalues close to `1` and are more likely to be inliers.\n\n"
 ]
 },
 {
@@ -26,7 +105,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.ensemble import IsolationForest\n\nrng = np.random.RandomState(42)\n\n# Generate train data\nX = 0.3 * rng.randn(100, 2)\nX_train = np.r_[X + 2, X - 2]\n# Generate some regular novel observations\nX = 0.3 * rng.randn(20, 2)\nX_test = np.r_[X + 2, X - 2]\n# Generate some abnormal novel observations\nX_outliers = rng.uniform(low=-4, high=4, size=(20, 2))\n\n# fit the model\nclf = IsolationForest(max_samples=100, random_state=rng)\nclf.fit(X_train)\ny_pred_train = clf.predict(X_train)\ny_pred_test = clf.predict(X_test)\ny_pred_outliers = clf.predict(X_outliers)\n\n# plot the line, the samples, and the nearest vectors to the plane\nxx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))\nZ = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\nZ = Z.reshape(xx.shape)\n\nplt.title(\"IsolationForest\")\nplt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)\n\nb1 = plt.scatter(X_train[:, 0], X_train[:, 1], c=\"white\", s=20, edgecolor=\"k\")\nb2 = plt.scatter(X_test[:, 0], X_test[:, 1], c=\"green\", s=20, edgecolor=\"k\")\nc = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c=\"red\", s=20, edgecolor=\"k\")\nplt.axis(\"tight\")\nplt.xlim((-5, 5))\nplt.ylim((-5, 5))\nplt.legend(\n [b1, b2, c],\n [\"training observations\", \"new regular observations\", \"new abnormal observations\"],\n loc=\"upper left\",\n)\nplt.show()"
+"disp = DecisionBoundaryDisplay.from_estimator(\n clf,\n X,\n response_method=\"decision_function\",\n alpha=0.5,\n)\ndisp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor=\"k\")\ndisp.ax_.set_title(\"Path length decision boundary \\nof IsolationForest\")\nplt.axis(\"square\")\nplt.legend(handles=handles, labels=[\"outliers\", \"inliers\"], title=\"true class\")\nplt.colorbar(disp.ax_.collections[1])\nplt.show()"
 ]
 }
 ],

dev/_downloads/scikit-learn-docs.zip

80.4 KB
Binary file not shown.
