
Commit 099426e

Pushing the docs to dev/ for branch: main, commit 38a06e4be504f3971d109d6741b8b4c7192d7323
1 parent 84b9e07 commit 099426e

1,292 files changed: +6053 -5789 lines


dev/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 5963acf6320151f44a6521a3488d2eb6
+config: 59053f3c78059526e40748d46d97a8a3
 tags: 645f666f9bcd5a90fca523b33c5a78b7

dev/_downloads/47f024d726d245e034c7690b4664721f/plot_classification.ipynb

Lines changed: 52 additions & 2 deletions
@@ -4,7 +4,32 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# Nearest Neighbors Classification\n\nSample usage of Nearest Neighbors classification.\nIt will plot the decision boundaries for each class.\n"
+        "\n# Nearest Neighbors Classification\n\nThis example shows how to use :class:`~sklearn.neighbors.KNeighborsClassifier`.\nWe train such a classifier on the iris dataset and observe the difference of the\ndecision boundary obtained with regards to the parameter `weights`.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Load the data\n\nIn this example, we use the iris dataset. We split the data into a train and test\ndataset.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.datasets import load_iris\nfrom sklearn.model_selection import train_test_split\n\niris = load_iris(as_frame=True)\nX = iris.data[[\"sepal length (cm)\", \"sepal width (cm)\"]]\ny = iris.target\nX_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## K-nearest neighbors classifier\n\nWe want to use a k-nearest neighbors classifier considering a neighborhood of 11 data\npoints. Since our k-nearest neighbors model uses euclidean distance to find the\nnearest neighbors, it is therefore important to scale the data beforehand. Refer to\nthe example entitled\n`sphx_glr_auto_examples_preprocessing_plot_scaling_importance.py` for more\ndetailed information.\n\nThus, we use a :class:`~sklearn.pipeline.Pipeline` to chain a scaler before to use\nour classifier.\n\n"
       ]
     },
     {
@@ -15,7 +40,32 @@
       },
       "outputs": [],
       "source": [
-        "import matplotlib.pyplot as plt\nimport seaborn as sns\nfrom matplotlib.colors import ListedColormap\n\nfrom sklearn import datasets, neighbors\nfrom sklearn.inspection import DecisionBoundaryDisplay\n\nn_neighbors = 15\n\n# import some data to play with\niris = datasets.load_iris()\n\n# we only take the first two features. We could avoid this ugly\n# slicing by using a two-dim dataset\nX = iris.data[:, :2]\ny = iris.target\n\n# Create color maps\ncmap_light = ListedColormap([\"orange\", \"cyan\", \"cornflowerblue\"])\ncmap_bold = [\"darkorange\", \"c\", \"darkblue\"]\n\nfor weights in [\"uniform\", \"distance\"]:\n    # we create an instance of Neighbours Classifier and fit the data.\n    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)\n    clf.fit(X, y)\n\n    _, ax = plt.subplots()\n    DecisionBoundaryDisplay.from_estimator(\n        clf,\n        X,\n        cmap=cmap_light,\n        ax=ax,\n        response_method=\"predict\",\n        plot_method=\"pcolormesh\",\n        xlabel=iris.feature_names[0],\n        ylabel=iris.feature_names[1],\n        shading=\"auto\",\n    )\n\n    # Plot also the training points\n    sns.scatterplot(\n        x=X[:, 0],\n        y=X[:, 1],\n        hue=iris.target_names[y],\n        palette=cmap_bold,\n        alpha=1.0,\n        edgecolor=\"black\",\n    )\n    plt.title(\n        \"3-Class classification (k = %i, weights = '%s')\" % (n_neighbors, weights)\n    )\n\nplt.show()"
+        "from sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\n\nclf = Pipeline(\n    steps=[(\"scaler\", StandardScaler()), (\"knn\", KNeighborsClassifier(n_neighbors=11))]\n)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Decision boundary\n\nNow, we fit two classifiers with different values of the parameter\n`weights`. We plot the decision boundary of each classifier as well as the original\ndataset to observe the difference.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n\nfrom sklearn.inspection import DecisionBoundaryDisplay\n\n_, axs = plt.subplots(ncols=2, figsize=(12, 5))\n\nfor ax, weights in zip(axs, (\"uniform\", \"distance\")):\n    clf.set_params(knn__weights=weights).fit(X_train, y_train)\n    disp = DecisionBoundaryDisplay.from_estimator(\n        clf,\n        X_test,\n        response_method=\"predict\",\n        plot_method=\"pcolormesh\",\n        xlabel=iris.feature_names[0],\n        ylabel=iris.feature_names[1],\n        shading=\"auto\",\n        alpha=0.5,\n        ax=ax,\n    )\n    scatter = disp.ax_.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors=\"k\")\n    disp.ax_.legend(\n        scatter.legend_elements()[0],\n        iris.target_names,\n        loc=\"lower left\",\n        title=\"Classes\",\n    )\n    _ = disp.ax_.set_title(\n        f\"3-Class classification\\n(k={clf[-1].n_neighbors}, weights={weights!r})\"\n    )\n\nplt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Conclusion\n\nWe observe that the parameter `weights` has an impact on the decision boundary. When\n`weights=\"unifom\"` all nearest neighbors will have the same impact on the decision.\nWhereas when `weights=\"distance\"` the weight given to each neighbor is proportional\nto the inverse of the distance from that neighbor to the query point.\n\nIn some cases, taking the distance into account might improve the model.\n\n"
       ]
     }
   ],
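
Note that the reworked notebook switches the `weights` parameter of the nested "knn" step through `clf.set_params(knn__weights=...)` instead of rebuilding the estimator each time. For readers unfamiliar with that pattern, here is a minimal sketch, not part of this commit, of scikit-learn's `step__parameter` convention for addressing parameters inside a Pipeline; the step names "scaler" and "knn" simply mirror the example above:

# Illustrative sketch, not from the commit: same pipeline shape as the example.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)
clf = Pipeline(
    steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=11))]
)

# "knn__weights" targets the `weights` parameter of the step named "knn";
# set_params returns the pipeline itself, so it can be chained with fit.
for weights in ("uniform", "distance"):
    clf.set_params(knn__weights=weights).fit(X, y)
    print(weights, clf[-1].weights)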

dev/_downloads/8d0cc737ca20800f70d8aa80d8b8fb7d/plot_classification.py

Lines changed: 69 additions & 38 deletions
@@ -3,61 +3,92 @@
 Nearest Neighbors Classification
 ================================
 
-Sample usage of Nearest Neighbors classification.
-It will plot the decision boundaries for each class.
-
+This example shows how to use :class:`~sklearn.neighbors.KNeighborsClassifier`.
+We train such a classifier on the iris dataset and observe the difference of the
+decision boundary obtained with regards to the parameter `weights`.
 """
 
-import matplotlib.pyplot as plt
-import seaborn as sns
-from matplotlib.colors import ListedColormap
+# %%
+# Load the data
+# -------------
+#
+# In this example, we use the iris dataset. We split the data into a train and test
+# dataset.
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
 
-from sklearn import datasets, neighbors
-from sklearn.inspection import DecisionBoundaryDisplay
+iris = load_iris(as_frame=True)
+X = iris.data[["sepal length (cm)", "sepal width (cm)"]]
+y = iris.target
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
 
-n_neighbors = 15
+# %%
+# K-nearest neighbors classifier
+# ------------------------------
+#
+# We want to use a k-nearest neighbors classifier considering a neighborhood of 11 data
+# points. Since our k-nearest neighbors model uses euclidean distance to find the
+# nearest neighbors, it is therefore important to scale the data beforehand. Refer to
+# the example entitled
+# :ref:`sphx_glr_auto_examples_preprocessing_plot_scaling_importance.py` for more
+# detailed information.
+#
+# Thus, we use a :class:`~sklearn.pipeline.Pipeline` to chain a scaler before to use
+# our classifier.
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 
-# import some data to play with
-iris = datasets.load_iris()
+clf = Pipeline(
+    steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=11))]
+)
 
-# we only take the first two features. We could avoid this ugly
-# slicing by using a two-dim dataset
-X = iris.data[:, :2]
-y = iris.target
+# %%
+# Decision boundary
+# -----------------
+#
+# Now, we fit two classifiers with different values of the parameter
+# `weights`. We plot the decision boundary of each classifier as well as the original
+# dataset to observe the difference.
+import matplotlib.pyplot as plt
 
-# Create color maps
-cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
-cmap_bold = ["darkorange", "c", "darkblue"]
+from sklearn.inspection import DecisionBoundaryDisplay
 
-for weights in ["uniform", "distance"]:
-    # we create an instance of Neighbours Classifier and fit the data.
-    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
-    clf.fit(X, y)
+_, axs = plt.subplots(ncols=2, figsize=(12, 5))
 
-    _, ax = plt.subplots()
-    DecisionBoundaryDisplay.from_estimator(
+for ax, weights in zip(axs, ("uniform", "distance")):
+    clf.set_params(knn__weights=weights).fit(X_train, y_train)
+    disp = DecisionBoundaryDisplay.from_estimator(
         clf,
-        X,
-        cmap=cmap_light,
-        ax=ax,
+        X_test,
         response_method="predict",
         plot_method="pcolormesh",
         xlabel=iris.feature_names[0],
         ylabel=iris.feature_names[1],
         shading="auto",
+        alpha=0.5,
+        ax=ax,
     )
-
-    # Plot also the training points
-    sns.scatterplot(
-        x=X[:, 0],
-        y=X[:, 1],
-        hue=iris.target_names[y],
-        palette=cmap_bold,
-        alpha=1.0,
-        edgecolor="black",
+    scatter = disp.ax_.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors="k")
+    disp.ax_.legend(
+        scatter.legend_elements()[0],
+        iris.target_names,
+        loc="lower left",
+        title="Classes",
    )
-    plt.title(
-        "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights)
+    _ = disp.ax_.set_title(
+        f"3-Class classification\n(k={clf[-1].n_neighbors}, weights={weights!r})"
    )
 
 plt.show()
+
+# %%
+# Conclusion
+# ----------
+#
+# We observe that the parameter `weights` has an impact on the decision boundary. When
+# `weights="unifom"` all nearest neighbors will have the same impact on the decision.
+# Whereas when `weights="distance"` the weight given to each neighbor is proportional
+# to the inverse of the distance from that neighbor to the query point.
+#
+# In some cases, taking the distance into account might improve the model.
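
The conclusion notes that distance weighting might improve the model in some cases. Because the reworked example now holds out a test set, one way to check is to compare held-out accuracy for the two settings. A hedged sketch, not part of the commit, reusing the example's `clf`, `X_train`, `y_train`, `X_test`, and `y_test`:

# Sketch only, not from the commit: compare the two weighting schemes on the
# held-out split defined earlier in the example.
for weights in ("uniform", "distance"):
    clf.set_params(knn__weights=weights).fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)  # mean accuracy of the pipeline on the test set
    print(f"weights={weights!r}: test accuracy = {accuracy:.3f}")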

dev/_downloads/scikit-learn-docs.zip

295 Bytes
Binary file not shown.
