
Commit cc391e7

committed
Pushing the docs to dev/ for branch: master, commit 2f6af711b098b553271e69b58d0711629825848a
1 parent 1a85c88 commit cc391e7

File tree: 1,261 files changed (+6,801 / −4,164 lines)

Some content is hidden

Large commits have some content hidden by default.
Binary file not shown.

dev/_downloads/08cd69ec4e6b0089b41d13ef3cbc000b/plot_label_propagation_versus_svm_iris.ipynb

Lines changed: 0 additions & 54 deletions
This file was deleted.
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Decision boundary of semi-supervised classifiers versus SVM on the Iris dataset\n\nA comparison for the decision boundaries generated on the iris dataset\nby Label Spreading, Self-training and SVM.\n\nThis example demonstrates that Label Spreading and Self-training can learn\ngood boundaries even when small amounts of labeled data are available.\n\nNote that Self-training with 100% of the data is omitted as it is functionally\nidentical to training the SVC on 100% of the data.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(__doc__)\n\n# Authors: Clay Woolam <[email protected]>\n# Oliver Rausch <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\nfrom sklearn.svm import SVC\nfrom sklearn.semi_supervised import LabelSpreading\nfrom sklearn.semi_supervised import SelfTrainingClassifier\n\n\niris = datasets.load_iris()\n\nX = iris.data[:, :2]\ny = iris.target\n\n# step size in the mesh\nh = .02\n\nrng = np.random.RandomState(0)\ny_rand = rng.rand(y.shape[0])\ny_30 = np.copy(y)\ny_30[y_rand < 0.3] = -1 # set random samples to be unlabeled\ny_50 = np.copy(y)\ny_50[y_rand < 0.5] = -1\n# we create an instance of SVM and fit out data. We do not scale our\n# data since we want to plot the support vectors\nls30 = (LabelSpreading().fit(X, y_30), y_30, 'Label Spreading 30% data')\nls50 = (LabelSpreading().fit(X, y_50), y_50, 'Label Spreading 50% data')\nls100 = (LabelSpreading().fit(X, y), y, 'Label Spreading 100% data')\n\n# the base classifier for self-training is identical to the SVC\nbase_classifier = SVC(kernel='rbf', gamma=.5, probability=True)\nst30 = (SelfTrainingClassifier(base_classifier).fit(X, y_30),\n y_30, 'Self-training 30% data')\nst50 = (SelfTrainingClassifier(base_classifier).fit(X, y_50),\n y_50, 'Self-training 50% data')\n\nrbf_svc = (SVC(kernel='rbf', gamma=.5).fit(X, y), y, 'SVC with rbf kernel')\n\n# create a mesh to plot in\nx_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\ny_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\nxx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n np.arange(y_min, y_max, h))\n\ncolor_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)}\n\nclassifiers = (ls30, st30, ls50, st50, ls100, rbf_svc)\nfor i, (clf, y_train, title) in enumerate(classifiers):\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n plt.subplot(3, 2, i + 1)\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)\n plt.axis('off')\n\n # Plot also the training points\n colors = [color_map[y] for y in y_train]\n plt.scatter(X[:, 0], X[:, 1], c=colors, edgecolors='black')\n\n plt.title(title)\n\nplt.suptitle(\"Unlabeled points are colored white\", y=0.1)\nplt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
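The notebook above relies on scikit-learn's convention that unlabeled samples carry the label -1. A minimal sketch of that convention, not part of the committed files, assuming scikit-learn >= 0.24 (the release that added SelfTrainingClassifier):

# Sketch only (not part of this commit): the -1 convention for unlabeled samples.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier

X, y = load_iris(return_X_y=True)
rng = np.random.RandomState(0)
y_partial = np.copy(y)
y_partial[rng.rand(y.shape[0]) < 0.3] = -1  # hide roughly 30% of the labels

label_spreading = LabelSpreading().fit(X, y_partial)
self_training = SelfTrainingClassifier(
    SVC(kernel='rbf', gamma=0.5, probability=True)).fit(X, y_partial)

# both estimators train on the partially labeled vector and predict full labels
print(accuracy_score(y, label_spreading.predict(X)),
      accuracy_score(y, self_training.predict(X)))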
Binary file not shown.
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Effect of varying threshold for self-training\n\nThis example illustrates the effect of a varying threshold on self-training.\nThe `breast_cancer` dataset is loaded, and labels are deleted such that only 50\nout of 569 samples have labels. A `SelfTrainingClassifier` is fitted on this\ndataset, with varying thresholds.\n\nThe upper graph shows the number of labeled samples that the classifier has\navailable by the end of fit, and the accuracy of the classifier. The lower\ngraph shows the last iteration in which a sample was labeled. All values are\ncross-validated with 3 folds.\n\nAt low thresholds (in [0.4, 0.5]), the classifier learns from samples that were\nlabeled with a low confidence. These low-confidence samples are likely to have\nincorrect predicted labels, and as a result, fitting on these incorrect labels\nproduces a poor accuracy. Note that the classifier labels almost all of the\nsamples, and only takes one iteration.\n\nFor very high thresholds (in [0.9, 1)) we observe that the classifier does not\naugment its dataset (the number of self-labeled samples is 0). As a result, the\naccuracy achieved with a threshold of 0.9999 is the same as a normal supervised\nclassifier would achieve.\n\nThe optimal accuracy lies between these two extremes, at a threshold of\naround 0.7.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(__doc__)\n\n# Authors: Oliver Rausch <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\nfrom sklearn.svm import SVC\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.semi_supervised import SelfTrainingClassifier\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.utils import shuffle\n\nn_splits = 3\n\nX, y = datasets.load_breast_cancer(return_X_y=True)\nX, y = shuffle(X, y, random_state=42)\ny_true = y.copy()\ny[50:] = -1\ntotal_samples = y.shape[0]\n\nbase_classifier = SVC(probability=True, gamma=0.001, random_state=42)\n\nx_values = np.arange(0.4, 1.05, 0.05)\nx_values = np.append(x_values, 0.99999)\nscores = np.empty((x_values.shape[0], n_splits))\namount_labeled = np.empty((x_values.shape[0], n_splits))\namount_iterations = np.empty((x_values.shape[0], n_splits))\n\nfor (i, threshold) in enumerate(x_values):\n self_training_clf = SelfTrainingClassifier(base_classifier,\n threshold=threshold)\n\n # We need manual cross validation so that we don't treat -1 as a separate\n # class when computing accuracy\n skfolds = StratifiedKFold(n_splits=n_splits)\n for fold, (train_index, test_index) in enumerate(skfolds.split(X, y)):\n X_train = X[train_index]\n y_train = y[train_index]\n X_test = X[test_index]\n y_test = y[test_index]\n y_test_true = y_true[test_index]\n\n self_training_clf.fit(X_train, y_train)\n\n # The amount of labeled samples that at the end of fitting\n amount_labeled[i, fold] = total_samples - np.unique(\n self_training_clf.labeled_iter_, return_counts=True)[1][0]\n # The last iteration the classifier labeled a sample in\n amount_iterations[i, fold] = np.max(self_training_clf.labeled_iter_)\n\n y_pred = self_training_clf.predict(X_test)\n scores[i, fold] = accuracy_score(y_test_true, y_pred)\n\n\nax1 = plt.subplot(211)\nax1.errorbar(x_values, scores.mean(axis=1),\n yerr=scores.std(axis=1),\n capsize=2, color='b')\nax1.set_ylabel('Accuracy', color='b')\nax1.tick_params('y', colors='b')\n\nax2 = ax1.twinx()\nax2.errorbar(x_values, amount_labeled.mean(axis=1),\n yerr=amount_labeled.std(axis=1),\n capsize=2, color='g')\nax2.set_ylim(bottom=0)\nax2.set_ylabel('Amount of labeled samples', color='g')\nax2.tick_params('y', colors='g')\n\nax3 = plt.subplot(212, sharex=ax1)\nax3.errorbar(x_values, amount_iterations.mean(axis=1),\n yerr=amount_iterations.std(axis=1),\n capsize=2, color='b')\nax3.set_ylim(bottom=0)\nax3.set_ylabel('Amount of iterations')\nax3.set_xlabel('Threshold')\n\nplt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
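The threshold effect described in this notebook can be checked directly from the labeled_iter_ attribute of a fitted SelfTrainingClassifier. A rough sketch, not part of the committed files, assuming scikit-learn >= 0.24:

# Sketch only (not part of this commit): higher thresholds admit fewer pseudo-labels.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.utils import shuffle

X, y = load_breast_cancer(return_X_y=True)
X, y = shuffle(X, y, random_state=42)
y_semi = np.copy(y)
y_semi[50:] = -1  # keep only the first 50 labels, as in the example

for threshold in (0.5, 0.99999):
    clf = SelfTrainingClassifier(SVC(probability=True, gamma=0.001),
                                 threshold=threshold).fit(X, y_semi)
    # labeled_iter_ is 0 for originally labeled samples, > 0 for pseudo-labeled ones
    n_pseudo = np.sum(clf.labeled_iter_ > 0)
    print("threshold=%g: %d pseudo-labeled samples" % (threshold, n_pseudo))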
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
"""
================================================
Semi-supervised Classification on a Text Dataset
================================================

In this example, semi-supervised classifiers are trained on the 20 newsgroups
dataset (which will be automatically downloaded).

You can adjust the number of categories by giving their names to the dataset
loader or setting them to `None` to get all 20 of them.
"""
import os

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import f1_score

data = fetch_20newsgroups(subset='train', categories=None)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# Parameters
sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(**sdg_params)),
])
# SelfTraining Pipeline
st_pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
])
# LabelSpreading Pipeline
ls_pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    # LabelSpreading does not support sparse matrices
    ('todense', FunctionTransformer(lambda x: x.todense())),
    ('clf', LabelSpreading()),
])


def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:",
          sum(1 for x in y_train if x == -1))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Micro-averaged F1 score on test set: "
          "%0.3f" % f1_score(y_test, y_pred, average='micro'))
    print("-" * 10)
    print()


if __name__ == "__main__":
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

    # select a mask of 20% of the train dataset
    y_mask = np.random.rand(len(y_train)) < 0.2

    # X_20 and y_20 are the subset of the train dataset indicated by the mask
    X_20, y_20 = map(list, zip(*((x, y)
                                 for x, y, m in zip(X_train, y_train, y_mask) if m)))
    print("Supervised SGDClassifier on 20% of the training data:")
    eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)

    # set the non-masked subset to be unlabeled
    y_train[~y_mask] = -1
    print("SelfTrainingClassifier on 20% of the training data (rest "
          "is unlabeled):")
    eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)

    if 'CI' not in os.environ:
        # LabelSpreading takes too long to run in the online documentation
        print("LabelSpreading on 20% of the data (rest is unlabeled):")
        eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)
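The script above trains on all 20 newsgroups, which takes a while. A smaller sketch of the same self-training pipeline restricted to two categories, not part of the committed files; it assumes network access for fetch_20newsgroups and scikit-learn >= 0.24:

# Sketch only (not part of this commit): a quick two-category run of the
# self-training text pipeline shown above.
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier

data = fetch_20newsgroups(subset='train',
                          categories=['sci.space', 'rec.autos'])
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=0)

y_semi = np.copy(y_train)
keep = np.random.RandomState(0).rand(len(y_train)) < 0.2  # keep ~20% of labels
y_semi[~keep] = -1  # everything outside the mask becomes unlabeled

clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.8)),
    ('tfidf', TfidfTransformer()),
    ('clf', SelfTrainingClassifier(
        SGDClassifier(alpha=1e-5, penalty='l2', loss='log'))),
])
clf.fit(X_train, y_semi)
print("test accuracy:", accuracy_score(y_test, clf.predict(X_test)))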
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
"""
===============================================================================
Decision boundary of semi-supervised classifiers versus SVM on the Iris dataset
===============================================================================

A comparison for the decision boundaries generated on the iris dataset
by Label Spreading, Self-training and SVM.

This example demonstrates that Label Spreading and Self-training can learn
good boundaries even when small amounts of labeled data are available.

Note that Self-training with 100% of the data is omitted as it is functionally
identical to training the SVC on 100% of the data.

"""
print(__doc__)

# Authors: Clay Woolam <[email protected]>
#          Oliver Rausch <[email protected]>
# License: BSD

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import SelfTrainingClassifier


iris = datasets.load_iris()

X = iris.data[:, :2]
y = iris.target

# step size in the mesh
h = .02

rng = np.random.RandomState(0)
y_rand = rng.rand(y.shape[0])
y_30 = np.copy(y)
y_30[y_rand < 0.3] = -1  # set random samples to be unlabeled
y_50 = np.copy(y)
y_50[y_rand < 0.5] = -1
# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
ls30 = (LabelSpreading().fit(X, y_30), y_30, 'Label Spreading 30% data')
ls50 = (LabelSpreading().fit(X, y_50), y_50, 'Label Spreading 50% data')
ls100 = (LabelSpreading().fit(X, y), y, 'Label Spreading 100% data')

# the base classifier for self-training is identical to the SVC
base_classifier = SVC(kernel='rbf', gamma=.5, probability=True)
st30 = (SelfTrainingClassifier(base_classifier).fit(X, y_30),
        y_30, 'Self-training 30% data')
st50 = (SelfTrainingClassifier(base_classifier).fit(X, y_50),
        y_50, 'Self-training 50% data')

rbf_svc = (SVC(kernel='rbf', gamma=.5).fit(X, y), y, 'SVC with rbf kernel')

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

color_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)}

classifiers = (ls30, st30, ls50, st50, ls100, rbf_svc)
for i, (clf, y_train, title) in enumerate(classifiers):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(3, 2, i + 1)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.axis('off')

    # Plot also the training points
    colors = [color_map[y] for y in y_train]
    plt.scatter(X[:, 0], X[:, 1], c=colors, edgecolors='black')

    plt.title(title)

plt.suptitle("Unlabeled points are colored white", y=0.1)
plt.show()
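The plotting loop in this file uses the standard grid-prediction pattern: predict on a dense mesh, reshape, and pass the result to contourf. A stripped-down sketch of just that pattern, not part of the committed files, with a plain SVC standing in for any two-feature classifier:

# Sketch only (not part of this commit): predict on a mesh, reshape for contourf.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X = load_iris().data[:, :2]
y = load_iris().target
clf = SVC(kernel='rbf', gamma=0.5).fit(X, y)

h = 0.02  # mesh step size, as in the example above
xx, yy = np.meshgrid(np.arange(X[:, 0].min() - 1, X[:, 0].max() + 1, h),
                     np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # one prediction per grid point
plt.contourf(xx, yy, Z.reshape(xx.shape), cmap=plt.cm.Paired)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='black')
plt.show()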
