
Commit 7995077

Pushing the docs to dev/ for branch: master, commit 1b0ec1bf4fd9aeec50828ddfd6700e69a41fb5a2
1 parent 9a80466 commit 7995077

897 files changed (+2628 / -2586 lines)


dev/_downloads/plot_label_propagation_digits_active_learning.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 },
 {
 "source": [
-"\n# Label Propagation digits active learning\n\n\nDemonstrates an active learning technique to learn handwritten digits\nusing label propagation.\n\nWe start by training a label propagation model with only 10 labeled points,\nthen we select the top five most uncertain points to label. Next, we train\nwith 15 labeled points (original 10 + 5 new ones). We repeat this process\nfour times to have a model trained with 30 labeled examples.\n\nA plot will appear showing the top 5 most uncertain digits for each iteration\nof training. These may or may not contain mistakes, but we will train the next\nmodel with their true labels.\n\n"
+"\n# Label Propagation digits active learning\n\n\nDemonstrates an active learning technique to learn handwritten digits\nusing label propagation.\n\nWe start by training a label propagation model with only 10 labeled points,\nthen we select the top five most uncertain points to label. Next, we train\nwith 15 labeled points (original 10 + 5 new ones). We repeat this process\nfour times to have a model trained with 30 labeled examples. Note you can\nincrease this to label more than 30 by changing `max_iterations`. Labeling\nmore than 30 can be useful to get a sense for the speed of convergence of\nthis active learning technique.\n\nA plot will appear showing the top 5 most uncertain digits for each iteration\nof training. These may or may not contain mistakes, but we will train the next\nmodel with their true labels.\n\n"
 ],
 "cell_type": "markdown",
 "metadata": {}
@@ -24,7 +24,7 @@
 "execution_count": null,
 "cell_type": "code",
 "source": [
-"print(__doc__)\n\n# Authors: Clay Woolam <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\nfrom sklearn import datasets\nfrom sklearn.semi_supervised import label_propagation\nfrom sklearn.metrics import classification_report, confusion_matrix\n\ndigits = datasets.load_digits()\nrng = np.random.RandomState(0)\nindices = np.arange(len(digits.data))\nrng.shuffle(indices)\n\nX = digits.data[indices[:330]]\ny = digits.target[indices[:330]]\nimages = digits.images[indices[:330]]\n\nn_total_samples = len(y)\nn_labeled_points = 10\n\nunlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]\nf = plt.figure()\n\nfor i in range(5):\n    y_train = np.copy(y)\n    y_train[unlabeled_indices] = -1\n\n    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)\n    lp_model.fit(X, y_train)\n\n    predicted_labels = lp_model.transduction_[unlabeled_indices]\n    true_labels = y[unlabeled_indices]\n\n    cm = confusion_matrix(true_labels, predicted_labels,\n                          labels=lp_model.classes_)\n\n    print('Iteration %i %s' % (i, 70 * '_'))\n    print(\"Label Spreading model: %d labeled & %d unlabeled (%d total)\"\n          % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples))\n\n    print(classification_report(true_labels, predicted_labels))\n\n    print(\"Confusion matrix\")\n    print(cm)\n\n    # compute the entropies of transduced label distributions\n    pred_entropies = stats.distributions.entropy(\n        lp_model.label_distributions_.T)\n\n    # select five digit examples that the classifier is most uncertain about\n    uncertainty_index = uncertainty_index = np.argsort(pred_entropies)[-5:]\n\n    # keep track of indices that we get labels for\n    delete_indices = np.array([])\n\n    f.text(.05, (1 - (i + 1) * .183),\n           \"model %d\\n\\nfit with\\n%d labels\" % ((i + 1), i * 5 + 10), size=10)\n    for index, image_index in enumerate(uncertainty_index):\n        image = images[image_index]\n\n        sub = f.add_subplot(5, 5, index + 1 + (5 * i))\n        sub.imshow(image, cmap=plt.cm.gray_r)\n        sub.set_title('predict: %i\\ntrue: %i' % (\n            lp_model.transduction_[image_index], y[image_index]), size=10)\n        sub.axis('off')\n\n        # labeling 5 points, remote from labeled set\n        delete_index, = np.where(unlabeled_indices == image_index)\n        delete_indices = np.concatenate((delete_indices, delete_index))\n\n    unlabeled_indices = np.delete(unlabeled_indices, delete_indices)\n    n_labeled_points += 5\n\nf.suptitle(\"Active learning with Label Propagation.\\nRows show 5 most \"\n           \"uncertain labels to learn with the next model.\")\nplt.subplots_adjust(0.12, 0.03, 0.9, 0.8, 0.2, 0.45)\nplt.show()"
+"print(__doc__)\n\n# Authors: Clay Woolam <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\nfrom sklearn import datasets\nfrom sklearn.semi_supervised import label_propagation\nfrom sklearn.metrics import classification_report, confusion_matrix\n\ndigits = datasets.load_digits()\nrng = np.random.RandomState(0)\nindices = np.arange(len(digits.data))\nrng.shuffle(indices)\n\nX = digits.data[indices[:330]]\ny = digits.target[indices[:330]]\nimages = digits.images[indices[:330]]\n\nn_total_samples = len(y)\nn_labeled_points = 10\nmax_iterations = 5\n\nunlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]\nf = plt.figure()\n\nfor i in range(max_iterations):\n    if len(unlabeled_indices) == 0:\n        print(\"No unlabeled items left to label.\")\n        break\n    y_train = np.copy(y)\n    y_train[unlabeled_indices] = -1\n\n    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)\n    lp_model.fit(X, y_train)\n\n    predicted_labels = lp_model.transduction_[unlabeled_indices]\n    true_labels = y[unlabeled_indices]\n\n    cm = confusion_matrix(true_labels, predicted_labels,\n                          labels=lp_model.classes_)\n\n    print(\"Iteration %i %s\" % (i, 70 * \"_\"))\n    print(\"Label Spreading model: %d labeled & %d unlabeled (%d total)\"\n          % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples))\n\n    print(classification_report(true_labels, predicted_labels))\n\n    print(\"Confusion matrix\")\n    print(cm)\n\n    # compute the entropies of transduced label distributions\n    pred_entropies = stats.distributions.entropy(\n        lp_model.label_distributions_.T)\n\n    # select up to 5 digit examples that the classifier is most uncertain about\n    uncertainty_index = np.argsort(pred_entropies)[::-1]\n    uncertainty_index = uncertainty_index[\n        np.in1d(uncertainty_index, unlabeled_indices)][:5]\n\n    # keep track of indices that we get labels for\n    delete_indices = np.array([])\n\n    # for more than 5 iterations, visualize the gain only on the first 5\n    if i < 5:\n        f.text(.05, (1 - (i + 1) * .183),\n               \"model %d\\n\\nfit with\\n%d labels\" %\n               ((i + 1), i * 5 + 10), size=10)\n    for index, image_index in enumerate(uncertainty_index):\n        image = images[image_index]\n\n        # for more than 5 iterations, visualize the gain only on the first 5\n        if i < 5:\n            sub = f.add_subplot(5, 5, index + 1 + (5 * i))\n            sub.imshow(image, cmap=plt.cm.gray_r)\n            sub.set_title(\"predict: %i\\ntrue: %i\" % (\n                lp_model.transduction_[image_index], y[image_index]), size=10)\n            sub.axis('off')\n\n        # labeling 5 points, remote from labeled set\n        delete_index, = np.where(unlabeled_indices == image_index)\n        delete_indices = np.concatenate((delete_indices, delete_index))\n\n    unlabeled_indices = np.delete(unlabeled_indices, delete_indices)\n    n_labeled_points += len(uncertainty_index)\n\nf.suptitle(\"Active learning with Label Propagation.\\nRows show 5 most \"\n           \"uncertain labels to learn with the next model.\")\nplt.subplots_adjust(0.12, 0.03, 0.9, 0.8, 0.2, 0.45)\nplt.show()"
 ],
 "outputs": [],
 "metadata": {

dev/_downloads/plot_label_propagation_digits_active_learning.py

Lines changed: 27 additions & 13 deletions
@@ -9,7 +9,10 @@
 We start by training a label propagation model with only 10 labeled points,
 then we select the top five most uncertain points to label. Next, we train
 with 15 labeled points (original 10 + 5 new ones). We repeat this process
-four times to have a model trained with 30 labeled examples.
+four times to have a model trained with 30 labeled examples. Note you can
+increase this to label more than 30 by changing `max_iterations`. Labeling
+more than 30 can be useful to get a sense for the speed of convergence of
+this active learning technique.
 
 A plot will appear showing the top 5 most uncertain digits for each iteration
 of training. These may or may not contain mistakes, but we will train the next
@@ -39,11 +42,15 @@
 
 n_total_samples = len(y)
 n_labeled_points = 10
+max_iterations = 5
 
 unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
 f = plt.figure()
 
-for i in range(5):
+for i in range(max_iterations):
+    if len(unlabeled_indices) == 0:
+        print("No unlabeled items left to label.")
+        break
     y_train = np.copy(y)
     y_train[unlabeled_indices] = -1
 
@@ -56,7 +63,7 @@
     cm = confusion_matrix(true_labels, predicted_labels,
                           labels=lp_model.classes_)
 
-    print('Iteration %i %s' % (i, 70 * '_'))
+    print("Iteration %i %s" % (i, 70 * "_"))
     print("Label Spreading model: %d labeled & %d unlabeled (%d total)"
           % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples))
 
@@ -69,29 +76,36 @@
     pred_entropies = stats.distributions.entropy(
         lp_model.label_distributions_.T)
 
-    # select five digit examples that the classifier is most uncertain about
-    uncertainty_index = uncertainty_index = np.argsort(pred_entropies)[-5:]
+    # select up to 5 digit examples that the classifier is most uncertain about
+    uncertainty_index = np.argsort(pred_entropies)[::-1]
+    uncertainty_index = uncertainty_index[
+        np.in1d(uncertainty_index, unlabeled_indices)][:5]
 
     # keep track of indices that we get labels for
     delete_indices = np.array([])
 
-    f.text(.05, (1 - (i + 1) * .183),
-           "model %d\n\nfit with\n%d labels" % ((i + 1), i * 5 + 10), size=10)
+    # for more than 5 iterations, visualize the gain only on the first 5
+    if i < 5:
+        f.text(.05, (1 - (i + 1) * .183),
+               "model %d\n\nfit with\n%d labels" %
+               ((i + 1), i * 5 + 10), size=10)
     for index, image_index in enumerate(uncertainty_index):
         image = images[image_index]
 
-        sub = f.add_subplot(5, 5, index + 1 + (5 * i))
-        sub.imshow(image, cmap=plt.cm.gray_r)
-        sub.set_title('predict: %i\ntrue: %i' % (
-            lp_model.transduction_[image_index], y[image_index]), size=10)
-        sub.axis('off')
+        # for more than 5 iterations, visualize the gain only on the first 5
+        if i < 5:
+            sub = f.add_subplot(5, 5, index + 1 + (5 * i))
+            sub.imshow(image, cmap=plt.cm.gray_r)
+            sub.set_title("predict: %i\ntrue: %i" % (
+                lp_model.transduction_[image_index], y[image_index]), size=10)
+            sub.axis('off')
 
         # labeling 5 points, remote from labeled set
         delete_index, = np.where(unlabeled_indices == image_index)
         delete_indices = np.concatenate((delete_indices, delete_index))
 
     unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
-    n_labeled_points += 5
+    n_labeled_points += len(uncertainty_index)
 
 f.suptitle("Active learning with Label Propagation.\nRows show 5 most "
            "uncertain labels to learn with the next model.")

dev/_downloads/scikit-learn-docs.pdf

10.2 KB
Binary file not shown.
