scikit-learn
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
461 Bytes b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
461 Bytes
diff --git a/‎dev/_downloads/22bab9f5303a6f1be42d14efb0f90b40/plot_label_propagation_digits.ipynb
Lines changed: 163 additions & 1 deletion b/‎dev/_downloads/22bab9f5303a6f1be42d14efb0f90b40/plot_label_propagation_digits.ipynb
Lines changed: 163 additions & 1 deletion
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
3.9 KB b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
3.9 KB
diff --git a/‎dev/_downloads/9c824e9beef1b72c9f1ad3f39de0bf57/plot_label_propagation_structure.ipynb
Lines changed: 73 additions & 1 deletion b/‎dev/_downloads/9c824e9beef1b72c9f1ad3f39de0bf57/plot_label_propagation_structure.ipynb
Lines changed: 73 additions & 1 deletion
diff --git a/‎dev/_downloads/c6e2877780eeb2421a441896c8ec77b7/plot_label_propagation_structure.py
Lines changed: 25 additions & 15 deletions b/‎dev/_downloads/c6e2877780eeb2421a441896c8ec77b7/plot_label_propagation_structure.py
Lines changed: 25 additions & 15 deletions
@@ -26,7 +26,169 @@
       },
       "outputs": [],
       "source": [
-        "# Authors: Clay Woolam <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom scipy import stats\n\nfrom sklearn import datasets\nfrom sklearn.semi_supervised import LabelSpreading\n\nfrom sklearn.metrics import confusion_matrix, classification_report\n\ndigits = datasets.load_digits()\nrng = np.random.RandomState(2)\nindices = np.arange(len(digits.data))\nrng.shuffle(indices)\n\nX = digits.data[indices[:340]]\ny = digits.target[indices[:340]]\nimages = digits.images[indices[:340]]\n\nn_total_samples = len(y)\nn_labeled_points = 40\n\nindices = np.arange(n_total_samples)\n\nunlabeled_set = indices[n_labeled_points:]\n\n# #############################################################################\n# Shuffle everything around\ny_train = np.copy(y)\ny_train[unlabeled_set] = -1\n\n# #############################################################################\n# Learn with LabelSpreading\nlp_model = LabelSpreading(gamma=0.25, max_iter=20)\nlp_model.fit(X, y_train)\npredicted_labels = lp_model.transduction_[unlabeled_set]\ntrue_labels = y[unlabeled_set]\n\ncm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)\n\nprint(\n    \"Label Spreading model: %d labeled & %d unlabeled points (%d total)\"\n    % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)\n)\n\nprint(classification_report(true_labels, predicted_labels))\n\nprint(\"Confusion matrix\")\nprint(cm)\n\n# #############################################################################\n# Calculate uncertainty values for each transduced distribution\npred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)\n\n# #############################################################################\n# Pick the top 10 most uncertain labels\nuncertainty_index = np.argsort(pred_entropies)[-10:]\n\n# #############################################################################\n# Plot\nf = plt.figure(figsize=(7, 
5))\nfor index, image_index in enumerate(uncertainty_index):\n    image = images[image_index]\n\n    sub = f.add_subplot(2, 5, index + 1)\n    sub.imshow(image, cmap=plt.cm.gray_r)\n    plt.xticks([])\n    plt.yticks([])\n    sub.set_title(\n        \"predict: %i\\ntrue: %i\" % (lp_model.transduction_[image_index], y[image_index])\n    )\n\nf.suptitle(\"Learning with small amount of labeled data\")\nplt.show()"
+        "# Authors: Clay Woolam <[email protected]>\n# License: BSD"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Data generation\n\nWe use the digits dataset. We only use a subset of randomly selected samples.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn import datasets\nimport numpy as np\n\ndigits = datasets.load_digits()\nrng = np.random.RandomState(2)\nindices = np.arange(len(digits.data))\nrng.shuffle(indices)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We select 340 samples, of which only 40 will be associated with a known label.\nTherefore, we store the indices of the 300 other samples, whose labels we are\nnot supposed to know.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "X = digits.data[indices[:340]]\ny = digits.target[indices[:340]]\nimages = digits.images[indices[:340]]\n\nn_total_samples = len(y)\nn_labeled_points = 40\n\nindices = np.arange(n_total_samples)\n\nunlabeled_set = indices[n_labeled_points:]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Mask the labels of the unlabeled samples by setting them to -1\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "y_train = np.copy(y)\ny_train[unlabeled_set] = -1"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Semi-supervised learning\n\nWe fit a :class:`~sklearn.semi_supervised.LabelSpreading` and use it to predict\nthe unknown labels.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.semi_supervised import LabelSpreading\nfrom sklearn.metrics import classification_report\n\nlp_model = LabelSpreading(gamma=0.25, max_iter=20)\nlp_model.fit(X, y_train)\npredicted_labels = lp_model.transduction_[unlabeled_set]\ntrue_labels = y[unlabeled_set]\n\nprint(\n    \"Label Spreading model: %d labeled & %d unlabeled points (%d total)\"\n    % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)\n)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Classification report\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(classification_report(true_labels, predicted_labels))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Confusion matrix\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.metrics import ConfusionMatrixDisplay\n\nConfusionMatrixDisplay.from_predictions(\n    true_labels, predicted_labels, labels=lp_model.classes_\n)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Plot the most uncertain predictions\n\nHere, we will pick and show the 10 most uncertain predictions.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from scipy import stats\n\npred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Pick the top 10 most uncertain labels\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "uncertainty_index = np.argsort(pred_entropies)[-10:]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Plot\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n\nf = plt.figure(figsize=(7, 5))\nfor index, image_index in enumerate(uncertainty_index):\n    image = images[image_index]\n\n    sub = f.add_subplot(2, 5, index + 1)\n    sub.imshow(image, cmap=plt.cm.gray_r)\n    plt.xticks([])\n    plt.yticks([])\n    sub.set_title(\n        \"predict: %i\\ntrue: %i\" % (lp_model.transduction_[image_index], y[image_index])\n    )\n\nf.suptitle(\"Learning with small amount of labeled data\")\nplt.show()"
       ]
     }
   ],
 
@@ -26,7 +26,79 @@
       },
       "outputs": [],
       "source": [
-        "# Authors: Clay Woolam <[email protected]>\n#          Andreas Mueller <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.semi_supervised import LabelSpreading\nfrom sklearn.datasets import make_circles\n\n# generate ring with inner box\nn_samples = 200\nX, y = make_circles(n_samples=n_samples, shuffle=False)\nouter, inner = 0, 1\nlabels = np.full(n_samples, -1.0)\nlabels[0] = outer\nlabels[-1] = inner\n\n# #############################################################################\n# Learn with LabelSpreading\nlabel_spread = LabelSpreading(kernel=\"knn\", alpha=0.8)\nlabel_spread.fit(X, labels)\n\n# #############################################################################\n# Plot output labels\noutput_labels = label_spread.transduction_\nplt.figure(figsize=(8.5, 4))\nplt.subplot(1, 2, 1)\nplt.scatter(\n    X[labels == outer, 0],\n    X[labels == outer, 1],\n    color=\"navy\",\n    marker=\"s\",\n    lw=0,\n    label=\"outer labeled\",\n    s=10,\n)\nplt.scatter(\n    X[labels == inner, 0],\n    X[labels == inner, 1],\n    color=\"c\",\n    marker=\"s\",\n    lw=0,\n    label=\"inner labeled\",\n    s=10,\n)\nplt.scatter(\n    X[labels == -1, 0],\n    X[labels == -1, 1],\n    color=\"darkorange\",\n    marker=\".\",\n    label=\"unlabeled\",\n)\nplt.legend(scatterpoints=1, shadow=False, loc=\"upper right\")\nplt.title(\"Raw data (2 classes=outer and inner)\")\n\nplt.subplot(1, 2, 2)\noutput_label_array = np.asarray(output_labels)\nouter_numbers = np.where(output_label_array == outer)[0]\ninner_numbers = np.where(output_label_array == inner)[0]\nplt.scatter(\n    X[outer_numbers, 0],\n    X[outer_numbers, 1],\n    color=\"navy\",\n    marker=\"s\",\n    lw=0,\n    s=10,\n    label=\"outer learned\",\n)\nplt.scatter(\n    X[inner_numbers, 0],\n    X[inner_numbers, 1],\n    color=\"c\",\n    marker=\"s\",\n    lw=0,\n    s=10,\n    label=\"inner learned\",\n)\nplt.legend(scatterpoints=1, 
shadow=False, loc=\"upper right\")\nplt.title(\"Labels learned with Label Spreading (KNN)\")\n\nplt.subplots_adjust(left=0.07, bottom=0.07, right=0.93, top=0.92)\nplt.show()"
+        "# Authors: Clay Woolam <[email protected]>\n#          Andreas Mueller <[email protected]>\n# License: BSD"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We generate a dataset with two concentric circles. In addition, each sample\nof the dataset is associated with a label: 0 (belonging to the outer\ncircle), 1 (belonging to the inner circle), or -1 (unknown).\nHere, all labels but two are tagged as unknown.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\nfrom sklearn.datasets import make_circles\n\nn_samples = 200\nX, y = make_circles(n_samples=n_samples, shuffle=False)\nouter, inner = 0, 1\nlabels = np.full(n_samples, -1.0)\nlabels[0] = outer\nlabels[-1] = inner"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Plot raw data\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n\nplt.figure(figsize=(4, 4))\nplt.scatter(\n    X[labels == outer, 0],\n    X[labels == outer, 1],\n    color=\"navy\",\n    marker=\"s\",\n    lw=0,\n    label=\"outer labeled\",\n    s=10,\n)\nplt.scatter(\n    X[labels == inner, 0],\n    X[labels == inner, 1],\n    color=\"c\",\n    marker=\"s\",\n    lw=0,\n    label=\"inner labeled\",\n    s=10,\n)\nplt.scatter(\n    X[labels == -1, 0],\n    X[labels == -1, 1],\n    color=\"darkorange\",\n    marker=\".\",\n    label=\"unlabeled\",\n)\nplt.legend(scatterpoints=1, shadow=False, loc=\"upper right\")\nplt.title(\"Raw data (2 classes=outer and inner)\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The aim of :class:`~sklearn.semi_supervised.LabelSpreading` is to associate\na label with each sample whose label is initially unknown.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.semi_supervised import LabelSpreading\n\nlabel_spread = LabelSpreading(kernel=\"knn\", alpha=0.8)\nlabel_spread.fit(X, labels)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Now, we can check which labels have been associated with each sample\nwhen the label was unknown.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "output_labels = label_spread.transduction_\noutput_label_array = np.asarray(output_labels)\nouter_numbers = np.where(output_label_array == outer)[0]\ninner_numbers = np.where(output_label_array == inner)[0]\n\nplt.figure(figsize=(4, 4))\nplt.scatter(\n    X[outer_numbers, 0],\n    X[outer_numbers, 1],\n    color=\"navy\",\n    marker=\"s\",\n    lw=0,\n    s=10,\n    label=\"outer learned\",\n)\nplt.scatter(\n    X[inner_numbers, 0],\n    X[inner_numbers, 1],\n    color=\"c\",\n    marker=\"s\",\n    lw=0,\n    s=10,\n    label=\"inner learned\",\n)\nplt.legend(scatterpoints=1, shadow=False, loc=\"upper right\")\nplt.title(\"Labels learned with Label Spreading (KNN)\")\nplt.show()"
       ]
     }
   ],
 
@@ -15,29 +15,27 @@
 #          Andreas Mueller <[email protected]>
 # License: BSD
 
+# %%
+# We generate a dataset with two concentric circles. In addition, each sample
+# of the dataset is associated with a label: 0 (belonging to the outer
+# circle), 1 (belonging to the inner circle), or -1 (unknown).
+# Here, all labels but two are tagged as unknown.
+
 import numpy as np
-import matplotlib.pyplot as plt
-from sklearn.semi_supervised import LabelSpreading
 from sklearn.datasets import make_circles
 
-# generate ring with inner box
 n_samples = 200
 X, y = make_circles(n_samples=n_samples, shuffle=False)
 outer, inner = 0, 1
 labels = np.full(n_samples, -1.0)
 labels[0] = outer
 labels[-1] = inner
 
-# #############################################################################
-# Learn with LabelSpreading
-label_spread = LabelSpreading(kernel="knn", alpha=0.8)
-label_spread.fit(X, labels)
+# %%
+# Plot raw data
+import matplotlib.pyplot as plt
 
-# #############################################################################
-# Plot output labels
-output_labels = label_spread.transduction_
-plt.figure(figsize=(8.5, 4))
-plt.subplot(1, 2, 1)
+plt.figure(figsize=(4, 4))
 plt.scatter(
     X[labels == outer, 0],
     X[labels == outer, 1],
@@ -66,10 +64,24 @@
 plt.legend(scatterpoints=1, shadow=False, loc="upper right")
 plt.title("Raw data (2 classes=outer and inner)")
 
-plt.subplot(1, 2, 2)
+# %%
+#
+# The aim of :class:`~sklearn.semi_supervised.LabelSpreading` is to associate
+# a label with each sample whose label is initially unknown.
+from sklearn.semi_supervised import LabelSpreading
+
+label_spread = LabelSpreading(kernel="knn", alpha=0.8)
+label_spread.fit(X, labels)
+
+# %%
+# Now, we can check which labels have been associated with each sample
+# when the label was unknown.
+output_labels = label_spread.transduction_
 output_label_array = np.asarray(output_labels)
 outer_numbers = np.where(output_label_array == outer)[0]
 inner_numbers = np.where(output_label_array == inner)[0]
+
+plt.figure(figsize=(4, 4))
 plt.scatter(
     X[outer_numbers, 0],
     X[outer_numbers, 1],
@@ -90,6 +102,4 @@
 )
 plt.legend(scatterpoints=1, shadow=False, loc="upper right")
 plt.title("Labels learned with Label Spreading (KNN)")
-
-plt.subplots_adjust(left=0.07, bottom=0.07, right=0.93, top=0.92)
 plt.show()