Commit dd9dcf5

Pushing the docs to dev/ for branch: main, commit 9c3c2c5a318c5b98ee4fd6454c01d04079a44a7f
1 parent 894785a commit dd9dcf5

File tree: 1,265 files changed (+5233, -4864 lines)

Lines changed: 103 additions & 49 deletions

@@ -1,67 +1,121 @@
 """
-==========================================
+=======================
 IsolationForest example
-==========================================
+=======================

 An example using :class:`~sklearn.ensemble.IsolationForest` for anomaly
 detection.

-The IsolationForest 'isolates' observations by randomly selecting a feature
-and then randomly selecting a split value between the maximum and minimum
-values of the selected feature.
+The :ref:`isolation_forest` is an ensemble of "Isolation Trees" that "isolate"
+observations by recursive random partitioning, which can be represented by a
+tree structure. The number of splittings required to isolate a sample is lower
+for outliers and higher for inliers.

-Since recursive partitioning can be represented by a tree structure, the
-number of splittings required to isolate a sample is equivalent to the path
-length from the root node to the terminating node.
-
-This path length, averaged over a forest of such random trees, is a measure
-of normality and our decision function.
-
-Random partitioning produces noticeable shorter paths for anomalies.
-Hence, when a forest of random trees collectively produce shorter path lengths
-for particular samples, they are highly likely to be anomalies.
+In the present example we demo two ways to visualize the decision boundary of an
+Isolation Forest trained on a toy dataset.

 """

+# %%
+# Data generation
+# ---------------
+#
+# We generate two clusters (each one containing `n_samples`) by randomly
+# sampling the standard normal distribution as returned by
+# :func:`numpy.random.randn`. One of them is spherical and the other one is
+# slightly deformed.
+#
+# For consistency with the :class:`~sklearn.ensemble.IsolationForest` notation,
+# the inliers (i.e. the gaussian clusters) are assigned a ground truth label `1`
+# whereas the outliers (created with :func:`numpy.random.uniform`) are assigned
+# the label `-1`.
+
 import numpy as np
+from sklearn.model_selection import train_test_split
+
+n_samples, n_outliers = 120, 40
+rng = np.random.RandomState(0)
+covariance = np.array([[0.5, -0.1], [0.7, 0.4]])
+cluster_1 = 0.4 * rng.randn(n_samples, 2) @ covariance + np.array([2, 2])  # general
+cluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])  # spherical
+outliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))
+
+X = np.concatenate([cluster_1, cluster_2, outliers])
+y = np.concatenate(
+    [np.ones((2 * n_samples), dtype=int), -np.ones((n_outliers), dtype=int)]
+)
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
+
+# %%
+# We can visualize the resulting clusters:
+
 import matplotlib.pyplot as plt
-from sklearn.ensemble import IsolationForest

-rng = np.random.RandomState(42)
+scatter = plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
+handles, labels = scatter.legend_elements()
+plt.axis("square")
+plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
+plt.title("Gaussian inliers with \nuniformly distributed outliers")
+plt.show()

-# Generate train data
-X = 0.3 * rng.randn(100, 2)
-X_train = np.r_[X + 2, X - 2]
-# Generate some regular novel observations
-X = 0.3 * rng.randn(20, 2)
-X_test = np.r_[X + 2, X - 2]
-# Generate some abnormal novel observations
-X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
+# %%
+# Training of the model
+# ---------------------

-# fit the model
-clf = IsolationForest(max_samples=100, random_state=rng)
+from sklearn.ensemble import IsolationForest
+
+clf = IsolationForest(max_samples=100, random_state=0)
 clf.fit(X_train)
-y_pred_train = clf.predict(X_train)
-y_pred_test = clf.predict(X_test)
-y_pred_outliers = clf.predict(X_outliers)
-
-# plot the line, the samples, and the nearest vectors to the plane
-xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
-Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
-Z = Z.reshape(xx.shape)
-
-plt.title("IsolationForest")
-plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
-
-b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=20, edgecolor="k")
-b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="green", s=20, edgecolor="k")
-c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="red", s=20, edgecolor="k")
-plt.axis("tight")
-plt.xlim((-5, 5))
-plt.ylim((-5, 5))
-plt.legend(
-    [b1, b2, c],
-    ["training observations", "new regular observations", "new abnormal observations"],
-    loc="upper left",
+
+# %%
+# Plot discrete decision boundary
+# -------------------------------
+#
+# We use the class :class:`~sklearn.inspection.DecisionBoundaryDisplay` to
+# visualize a discrete decision boundary. The background color represents
+# whether a sample in that given area is predicted to be an outlier
+# or not. The scatter plot displays the true labels.
+
+import matplotlib.pyplot as plt
+from sklearn.inspection import DecisionBoundaryDisplay
+
+disp = DecisionBoundaryDisplay.from_estimator(
+    clf,
+    X,
+    response_method="predict",
+    alpha=0.5,
+)
+disp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
+disp.ax_.set_title("Binary decision boundary \nof IsolationForest")
+plt.axis("square")
+plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
+plt.show()
+
+# %%
+# Plot path length decision boundary
+# ----------------------------------
+#
+# By setting the `response_method="decision_function"`, the background of the
+# :class:`~sklearn.inspection.DecisionBoundaryDisplay` represents the measure of
+# normality of an observation. Such score is given by the path length averaged
+# over a forest of random trees, which itself is given by the depth of the leaf
+# (or equivalently the number of splits) required to isolate a given sample.
+#
+# When a forest of random trees collectively produce short path lengths for
+# isolating some particular samples, they are highly likely to be anomalies and
+# the measure of normality is close to `0`. Similarly, large paths correspond to
+# values close to `1` and are more likely to be inliers.
+
+disp = DecisionBoundaryDisplay.from_estimator(
+    clf,
+    X,
+    response_method="decision_function",
+    alpha=0.5,
 )
+disp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
+disp.ax_.set_title("Path length decision boundary \nof IsolationForest")
+plt.axis("square")
+plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
+plt.colorbar(disp.ax_.collections[1])
 plt.show()
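
As context for the reworked example above: the "measure of normality" that the new narrative describes is exposed on the fitted estimator through `score_samples`, `decision_function` and `offset_`. The following standalone sketch is not part of this commit; the toy data and the `X_demo`/`clf_demo` names are invented purely for illustration of how those quantities relate in scikit-learn's IsolationForest.

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
# Made-up data: a dense Gaussian blob plus a few uniformly scattered points.
X_demo = np.concatenate([0.3 * rng.randn(100, 2), rng.uniform(-4, 4, size=(10, 2))])

clf_demo = IsolationForest(max_samples=64, random_state=0).fit(X_demo)

# score_samples returns the (negated) averaged path-length score:
# the lower the value, the more abnormal the sample.
scores = clf_demo.score_samples(X_demo)

# decision_function is the same score shifted by the fitted offset_,
# so that negative values mean "outlier" ...
decision = clf_demo.decision_function(X_demo)
assert np.allclose(decision, scores - clf_demo.offset_)

# ... which is exactly the threshold predict() applies to produce -1/1 labels.
assert np.array_equal(clf_demo.predict(X_demo), np.where(decision < 0, -1, 1))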

dev/_downloads/4cf0456267ced0f869a458ef4776d4c5/plot_release_highlights_1_1_0.py

Lines changed: 3 additions & 3 deletions
@@ -78,7 +78,7 @@
         ("num", numeric_transformer, numeric_features),
         (
             "cat",
-            OneHotEncoder(handle_unknown="ignore", sparse=False),
+            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
             categorical_features,
         ),
     ],
@@ -113,7 +113,7 @@
 X = np.array(
     [["dog"] * 5 + ["cat"] * 20 + ["rabbit"] * 10 + ["snake"] * 3], dtype=object
 ).T
-enc = OneHotEncoder(min_frequency=6, sparse=False).fit(X)
+enc = OneHotEncoder(min_frequency=6, sparse_output=False).fit(X)
 enc.infrequent_categories_

 # %%
@@ -211,7 +211,7 @@

 X, _ = make_blobs(n_samples=1000, centers=2, random_state=0)

-km = KMeans(n_clusters=5, random_state=0).fit(X)
+km = KMeans(n_clusters=5, random_state=0, n_init="auto").fit(X)
 bisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)

 fig, ax = plt.subplots(1, 2, figsize=(10, 5))
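
The two parameter changes in this file track scikit-learn deprecations: OneHotEncoder's `sparse` keyword was renamed to `sparse_output`, and KMeans accepts `n_init="auto"` to silence the warning about its changing default. A minimal sketch of both follows; it is not part of this commit, the data and variable names are made up, and the version-specific behaviour is assumed from the 1.2-era deprecation notes.

import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

X_cat = np.array([["dog"], ["cat"], ["cat"], ["snake"]], dtype=object)

# sparse_output=False requests a dense ndarray instead of a SciPy sparse matrix;
# it replaces the `sparse` keyword used before this commit.
enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False).fit(X_cat)
print(enc.transform([["dog"], ["fish"]]))  # unknown "fish" encodes as an all-zero row

# n_init="auto" lets KMeans choose the number of centroid seedings itself,
# avoiding the FutureWarning about the changing default.
X_num = np.random.RandomState(0).randn(50, 2)
km = KMeans(n_clusters=2, n_init="auto", random_state=0).fit(X_num)
print(km.cluster_centers_.shape)  # (2, 2)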

dev/_downloads/68fdea23e50d165632d4bd4e36453cd5/plot_release_highlights_1_1_0.ipynb

Lines changed: 3 additions & 3 deletions
@@ -51,7 +51,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.compose import ColumnTransformer\nfrom sklearn.preprocessing import OneHotEncoder, StandardScaler\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.linear_model import LogisticRegression\n\nX, y = fetch_openml(\n \"titanic\", version=1, as_frame=True, return_X_y=True, parser=\"pandas\"\n)\nnumeric_features = [\"age\", \"fare\"]\nnumeric_transformer = make_pipeline(SimpleImputer(strategy=\"median\"), StandardScaler())\ncategorical_features = [\"embarked\", \"pclass\"]\n\npreprocessor = ColumnTransformer(\n [\n (\"num\", numeric_transformer, numeric_features),\n (\n \"cat\",\n OneHotEncoder(handle_unknown=\"ignore\", sparse=False),\n categorical_features,\n ),\n ],\n verbose_feature_names_out=False,\n)\nlog_reg = make_pipeline(preprocessor, SelectKBest(k=7), LogisticRegression())\nlog_reg.fit(X, y)"
+"from sklearn.compose import ColumnTransformer\nfrom sklearn.preprocessing import OneHotEncoder, StandardScaler\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.linear_model import LogisticRegression\n\nX, y = fetch_openml(\n \"titanic\", version=1, as_frame=True, return_X_y=True, parser=\"pandas\"\n)\nnumeric_features = [\"age\", \"fare\"]\nnumeric_transformer = make_pipeline(SimpleImputer(strategy=\"median\"), StandardScaler())\ncategorical_features = [\"embarked\", \"pclass\"]\n\npreprocessor = ColumnTransformer(\n [\n (\"num\", numeric_transformer, numeric_features),\n (\n \"cat\",\n OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False),\n categorical_features,\n ),\n ],\n verbose_feature_names_out=False,\n)\nlog_reg = make_pipeline(preprocessor, SelectKBest(k=7), LogisticRegression())\nlog_reg.fit(X, y)"
 ]
 },
 {
@@ -87,7 +87,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.preprocessing import OneHotEncoder\nimport numpy as np\n\nX = np.array(\n [[\"dog\"] * 5 + [\"cat\"] * 20 + [\"rabbit\"] * 10 + [\"snake\"] * 3], dtype=object\n).T\nenc = OneHotEncoder(min_frequency=6, sparse=False).fit(X)\nenc.infrequent_categories_"
+"from sklearn.preprocessing import OneHotEncoder\nimport numpy as np\n\nX = np.array(\n [[\"dog\"] * 5 + [\"cat\"] * 20 + [\"rabbit\"] * 10 + [\"snake\"] * 3], dtype=object\n).T\nenc = OneHotEncoder(min_frequency=6, sparse_output=False).fit(X)\nenc.infrequent_categories_"
 ]
 },
 {
@@ -148,7 +148,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.datasets import make_blobs\nfrom sklearn.cluster import KMeans, BisectingKMeans\nimport matplotlib.pyplot as plt\n\nX, _ = make_blobs(n_samples=1000, centers=2, random_state=0)\n\nkm = KMeans(n_clusters=5, random_state=0).fit(X)\nbisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)\n\nfig, ax = plt.subplots(1, 2, figsize=(10, 5))\nax[0].scatter(X[:, 0], X[:, 1], s=10, c=km.labels_)\nax[0].scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=20, c=\"r\")\nax[0].set_title(\"KMeans\")\n\nax[1].scatter(X[:, 0], X[:, 1], s=10, c=bisect_km.labels_)\nax[1].scatter(\n bisect_km.cluster_centers_[:, 0], bisect_km.cluster_centers_[:, 1], s=20, c=\"r\"\n)\n_ = ax[1].set_title(\"BisectingKMeans\")"
+"from sklearn.datasets import make_blobs\nfrom sklearn.cluster import KMeans, BisectingKMeans\nimport matplotlib.pyplot as plt\n\nX, _ = make_blobs(n_samples=1000, centers=2, random_state=0)\n\nkm = KMeans(n_clusters=5, random_state=0, n_init=\"auto\").fit(X)\nbisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)\n\nfig, ax = plt.subplots(1, 2, figsize=(10, 5))\nax[0].scatter(X[:, 0], X[:, 1], s=10, c=km.labels_)\nax[0].scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=20, c=\"r\")\nax[0].set_title(\"KMeans\")\n\nax[1].scatter(X[:, 0], X[:, 1], s=10, c=bisect_km.labels_)\nax[1].scatter(\n bisect_km.cluster_centers_[:, 0], bisect_km.cluster_centers_[:, 1], s=20, c=\"r\"\n)\n_ = ax[1].set_title(\"BisectingKMeans\")"
 ]
 }
 ],

dev/_downloads/f39c19ddd9f1c49a604c054eff707568/plot_isolation_forest.ipynb

Lines changed: 81 additions & 2 deletions
@@ -15,7 +15,86 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# IsolationForest example\n\nAn example using :class:`~sklearn.ensemble.IsolationForest` for anomaly\ndetection.\n\nThe IsolationForest 'isolates' observations by randomly selecting a feature\nand then randomly selecting a split value between the maximum and minimum\nvalues of the selected feature.\n\nSince recursive partitioning can be represented by a tree structure, the\nnumber of splittings required to isolate a sample is equivalent to the path\nlength from the root node to the terminating node.\n\nThis path length, averaged over a forest of such random trees, is a measure\nof normality and our decision function.\n\nRandom partitioning produces noticeable shorter paths for anomalies.\nHence, when a forest of random trees collectively produce shorter path lengths\nfor particular samples, they are highly likely to be anomalies.\n"
+"\n# IsolationForest example\n\nAn example using :class:`~sklearn.ensemble.IsolationForest` for anomaly\ndetection.\n\nThe `isolation_forest` is an ensemble of \"Isolation Trees\" that \"isolate\"\nobservations by recursive random partitioning, which can be represented by a\ntree structure. The number of splittings required to isolate a sample is lower\nfor outliers and higher for inliers.\n\nIn the present example we demo two ways to visualize the decision boundary of an\nIsolation Forest trained on a toy dataset.\n"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Data generation\n\nWe generate two clusters (each one containing `n_samples`) by randomly\nsampling the standard normal distribution as returned by\n:func:`numpy.random.randn`. One of them is spherical and the other one is\nslightly deformed.\n\nFor consistency with the :class:`~sklearn.ensemble.IsolationForest` notation,\nthe inliers (i.e. the gaussian clusters) are assigned a ground truth label `1`\nwhereas the outliers (created with :func:`numpy.random.uniform`) are assigned\nthe label `-1`.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import numpy as np\nfrom sklearn.model_selection import train_test_split\n\nn_samples, n_outliers = 120, 40\nrng = np.random.RandomState(0)\ncovariance = np.array([[0.5, -0.1], [0.7, 0.4]])\ncluster_1 = 0.4 * rng.randn(n_samples, 2) @ covariance + np.array([2, 2])  # general\ncluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])  # spherical\noutliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))\n\nX = np.concatenate([cluster_1, cluster_2, outliers])\ny = np.concatenate(\n [np.ones((2 * n_samples), dtype=int), -np.ones((n_outliers), dtype=int)]\n)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"We can visualize the resulting clusters:\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\n\nscatter = plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor=\"k\")\nhandles, labels = scatter.legend_elements()\nplt.axis(\"square\")\nplt.legend(handles=handles, labels=[\"outliers\", \"inliers\"], title=\"true class\")\nplt.title(\"Gaussian inliers with \\nuniformly distributed outliers\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Training of the model\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.ensemble import IsolationForest\n\nclf = IsolationForest(max_samples=100, random_state=0)\nclf.fit(X_train)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Plot discrete decision boundary\n\nWe use the class :class:`~sklearn.inspection.DecisionBoundaryDisplay` to\nvisualize a discrete decision boundary. The background color represents\nwhether a sample in that given area is predicted to be an outlier\nor not. The scatter plot displays the true labels.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\nfrom sklearn.inspection import DecisionBoundaryDisplay\n\ndisp = DecisionBoundaryDisplay.from_estimator(\n clf,\n X,\n response_method=\"predict\",\n alpha=0.5,\n)\ndisp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor=\"k\")\ndisp.ax_.set_title(\"Binary decision boundary \\nof IsolationForest\")\nplt.axis(\"square\")\nplt.legend(handles=handles, labels=[\"outliers\", \"inliers\"], title=\"true class\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Plot path length decision boundary\n\nBy setting the `response_method=\"decision_function\"`, the background of the\n:class:`~sklearn.inspection.DecisionBoundaryDisplay` represents the measure of\nnormality of an observation. Such score is given by the path length averaged\nover a forest of random trees, which itself is given by the depth of the leaf\n(or equivalently the number of splits) required to isolate a given sample.\n\nWhen a forest of random trees collectively produce short path lengths for\nisolating some particular samples, they are highly likely to be anomalies and\nthe measure of normality is close to `0`. Similarly, large paths correspond to\nvalues close to `1` and are more likely to be inliers.\n\n"
 ]
 },
 {
@@ -26,7 +105,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.ensemble import IsolationForest\n\nrng = np.random.RandomState(42)\n\n# Generate train data\nX = 0.3 * rng.randn(100, 2)\nX_train = np.r_[X + 2, X - 2]\n# Generate some regular novel observations\nX = 0.3 * rng.randn(20, 2)\nX_test = np.r_[X + 2, X - 2]\n# Generate some abnormal novel observations\nX_outliers = rng.uniform(low=-4, high=4, size=(20, 2))\n\n# fit the model\nclf = IsolationForest(max_samples=100, random_state=rng)\nclf.fit(X_train)\ny_pred_train = clf.predict(X_train)\ny_pred_test = clf.predict(X_test)\ny_pred_outliers = clf.predict(X_outliers)\n\n# plot the line, the samples, and the nearest vectors to the plane\nxx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))\nZ = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\nZ = Z.reshape(xx.shape)\n\nplt.title(\"IsolationForest\")\nplt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)\n\nb1 = plt.scatter(X_train[:, 0], X_train[:, 1], c=\"white\", s=20, edgecolor=\"k\")\nb2 = plt.scatter(X_test[:, 0], X_test[:, 1], c=\"green\", s=20, edgecolor=\"k\")\nc = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c=\"red\", s=20, edgecolor=\"k\")\nplt.axis(\"tight\")\nplt.xlim((-5, 5))\nplt.ylim((-5, 5))\nplt.legend(\n [b1, b2, c],\n [\"training observations\", \"new regular observations\", \"new abnormal observations\"],\n loc=\"upper left\",\n)\nplt.show()"
+"disp = DecisionBoundaryDisplay.from_estimator(\n clf,\n X,\n response_method=\"decision_function\",\n alpha=0.5,\n)\ndisp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor=\"k\")\ndisp.ax_.set_title(\"Path length decision boundary \\nof IsolationForest\")\nplt.axis(\"square\")\nplt.legend(handles=handles, labels=[\"outliers\", \"inliers\"], title=\"true class\")\nplt.colorbar(disp.ax_.collections[1])\nplt.show()"
 ]
 }
 ],

dev/_downloads/scikit-learn-docs.zip

80.4 KB
Binary file not shown.
