
Commit cc391e7

committed
Pushing the docs to dev/ for branch: master, commit 2f6af711b098b553271e69b58d0711629825848a
1 parent 1a85c88 commit cc391e7

File tree: 1,261 files changed (+6,801 / −4,164 lines)

Some content is hidden

Large commits have some content hidden by default.
Binary file not shown.

dev/_downloads/08cd69ec4e6b0089b41d13ef3cbc000b/plot_label_propagation_versus_svm_iris.ipynb

Lines changed: 0 additions & 54 deletions
This file was deleted.
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Decision boundary of semi-supervised classifiers versus SVM on the Iris dataset\n\nA comparison for the decision boundaries generated on the iris dataset\nby Label Spreading, Self-training and SVM.\n\nThis example demonstrates that Label Spreading and Self-training can learn\ngood boundaries even when small amounts of labeled data are available.\n\nNote that Self-training with 100% of the data is omitted as it is functionally\nidentical to training the SVC on 100% of the data.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(__doc__)\n\n# Authors: Clay Woolam <[email protected]>\n# Oliver Rausch <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\nfrom sklearn.svm import SVC\nfrom sklearn.semi_supervised import LabelSpreading\nfrom sklearn.semi_supervised import SelfTrainingClassifier\n\n\niris = datasets.load_iris()\n\nX = iris.data[:, :2]\ny = iris.target\n\n# step size in the mesh\nh = .02\n\nrng = np.random.RandomState(0)\ny_rand = rng.rand(y.shape[0])\ny_30 = np.copy(y)\ny_30[y_rand < 0.3] = -1 # set random samples to be unlabeled\ny_50 = np.copy(y)\ny_50[y_rand < 0.5] = -1\n# we create an instance of SVM and fit out data. We do not scale our\n# data since we want to plot the support vectors\nls30 = (LabelSpreading().fit(X, y_30), y_30, 'Label Spreading 30% data')\nls50 = (LabelSpreading().fit(X, y_50), y_50, 'Label Spreading 50% data')\nls100 = (LabelSpreading().fit(X, y), y, 'Label Spreading 100% data')\n\n# the base classifier for self-training is identical to the SVC\nbase_classifier = SVC(kernel='rbf', gamma=.5, probability=True)\nst30 = (SelfTrainingClassifier(base_classifier).fit(X, y_30),\n y_30, 'Self-training 30% data')\nst50 = (SelfTrainingClassifier(base_classifier).fit(X, y_50),\n y_50, 'Self-training 50% data')\n\nrbf_svc = (SVC(kernel='rbf', gamma=.5).fit(X, y), y, 'SVC with rbf kernel')\n\n# create a mesh to plot in\nx_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\ny_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\nxx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n np.arange(y_min, y_max, h))\n\ncolor_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)}\n\nclassifiers = (ls30, st30, ls50, st50, ls100, rbf_svc)\nfor i, (clf, y_train, title) in enumerate(classifiers):\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n plt.subplot(3, 2, i + 1)\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)\n plt.axis('off')\n\n # Plot also the training points\n colors = [color_map[y] for y in y_train]\n plt.scatter(X[:, 0], X[:, 1], c=colors, edgecolors='black')\n\n plt.title(title)\n\nplt.suptitle(\"Unlabeled points are colored white\", y=0.1)\nplt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
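The notebook above relies on scikit-learn's convention that unlabeled samples carry the label -1. A minimal sketch of that convention, not part of the committed files, assuming scikit-learn >= 0.24 (the release that added SelfTrainingClassifier):

# Sketch only (not part of this commit): the -1 convention for unlabeled samples.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier

X, y = load_iris(return_X_y=True)
rng = np.random.RandomState(0)
y_partial = np.copy(y)
y_partial[rng.rand(y.shape[0]) < 0.3] = -1  # hide roughly 30% of the labels

label_spreading = LabelSpreading().fit(X, y_partial)
self_training = SelfTrainingClassifier(
    SVC(kernel='rbf', gamma=0.5, probability=True)).fit(X, y_partial)

# both estimators train on the partially labeled vector and predict full labels
print(accuracy_score(y, label_spreading.predict(X)),
      accuracy_score(y, self_training.predict(X)))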
Binary file not shown.
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Effect of varying threshold for self-training\n\nThis example illustrates the effect of a varying threshold on self-training.\nThe `breast_cancer` dataset is loaded, and labels are deleted such that only 50\nout of 569 samples have labels. A `SelfTrainingClassifier` is fitted on this\ndataset, with varying thresholds.\n\nThe upper graph shows the number of labeled samples that the classifier has\navailable by the end of fit, and the accuracy of the classifier. The lower\ngraph shows the last iteration in which a sample was labeled. All values are\ncross-validated with 3 folds.\n\nAt low thresholds (in [0.4, 0.5]), the classifier learns from samples that were\nlabeled with a low confidence. These low-confidence samples are likely to have\nincorrect predicted labels, and as a result, fitting on these incorrect labels\nproduces a poor accuracy. Note that the classifier labels almost all of the\nsamples, and only takes one iteration.\n\nFor very high thresholds (in [0.9, 1)) we observe that the classifier does not\naugment its dataset (the number of self-labeled samples is 0). As a result, the\naccuracy achieved with a threshold of 0.9999 is the same as a normal supervised\nclassifier would achieve.\n\nThe optimal accuracy lies between these two extremes, at a threshold of\naround 0.7.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(__doc__)\n\n# Authors: Oliver Rausch <[email protected]>\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\nfrom sklearn.svm import SVC\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.semi_supervised import SelfTrainingClassifier\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.utils import shuffle\n\nn_splits = 3\n\nX, y = datasets.load_breast_cancer(return_X_y=True)\nX, y = shuffle(X, y, random_state=42)\ny_true = y.copy()\ny[50:] = -1\ntotal_samples = y.shape[0]\n\nbase_classifier = SVC(probability=True, gamma=0.001, random_state=42)\n\nx_values = np.arange(0.4, 1.05, 0.05)\nx_values = np.append(x_values, 0.99999)\nscores = np.empty((x_values.shape[0], n_splits))\namount_labeled = np.empty((x_values.shape[0], n_splits))\namount_iterations = np.empty((x_values.shape[0], n_splits))\n\nfor (i, threshold) in enumerate(x_values):\n self_training_clf = SelfTrainingClassifier(base_classifier,\n threshold=threshold)\n\n # We need manual cross validation so that we don't treat -1 as a separate\n # class when computing accuracy\n skfolds = StratifiedKFold(n_splits=n_splits)\n for fold, (train_index, test_index) in enumerate(skfolds.split(X, y)):\n X_train = X[train_index]\n y_train = y[train_index]\n X_test = X[test_index]\n y_test = y[test_index]\n y_test_true = y_true[test_index]\n\n self_training_clf.fit(X_train, y_train)\n\n # The amount of labeled samples that at the end of fitting\n amount_labeled[i, fold] = total_samples - np.unique(\n self_training_clf.labeled_iter_, return_counts=True)[1][0]\n # The last iteration the classifier labeled a sample in\n amount_iterations[i, fold] = np.max(self_training_clf.labeled_iter_)\n\n y_pred = self_training_clf.predict(X_test)\n scores[i, fold] = accuracy_score(y_test_true, y_pred)\n\n\nax1 = plt.subplot(211)\nax1.errorbar(x_values, scores.mean(axis=1),\n yerr=scores.std(axis=1),\n capsize=2, color='b')\nax1.set_ylabel('Accuracy', color='b')\nax1.tick_params('y', colors='b')\n\nax2 = ax1.twinx()\nax2.errorbar(x_values, amount_labeled.mean(axis=1),\n yerr=amount_labeled.std(axis=1),\n capsize=2, color='g')\nax2.set_ylim(bottom=0)\nax2.set_ylabel('Amount of labeled samples', color='g')\nax2.tick_params('y', colors='g')\n\nax3 = plt.subplot(212, sharex=ax1)\nax3.errorbar(x_values, amount_iterations.mean(axis=1),\n yerr=amount_iterations.std(axis=1),\n capsize=2, color='b')\nax3.set_ylim(bottom=0)\nax3.set_ylabel('Amount of iterations')\nax3.set_xlabel('Threshold')\n\nplt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
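The threshold effect described in this notebook can be checked directly from the labeled_iter_ attribute of a fitted SelfTrainingClassifier. A rough sketch, not part of the committed files, assuming scikit-learn >= 0.24:

# Sketch only (not part of this commit): higher thresholds admit fewer pseudo-labels.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.utils import shuffle

X, y = load_breast_cancer(return_X_y=True)
X, y = shuffle(X, y, random_state=42)
y_semi = np.copy(y)
y_semi[50:] = -1  # keep only the first 50 labels, as in the example

for threshold in (0.5, 0.99999):
    clf = SelfTrainingClassifier(SVC(probability=True, gamma=0.001),
                                 threshold=threshold).fit(X, y_semi)
    # labeled_iter_ is 0 for originally labeled samples, > 0 for pseudo-labeled ones
    n_pseudo = np.sum(clf.labeled_iter_ > 0)
    print("threshold=%g: %d pseudo-labeled samples" % (threshold, n_pseudo))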
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
"""
================================================
Semi-supervised Classification on a Text Dataset
================================================

In this example, semi-supervised classifiers are trained on the 20 newsgroups
dataset (which will be automatically downloaded).

You can adjust the number of categories by giving their names to the dataset
loader or setting them to `None` to get all 20 of them.
"""
import os

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import f1_score

data = fetch_20newsgroups(subset='train', categories=None)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# Parameters
sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(**sdg_params)),
])
# SelfTraining Pipeline
st_pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
])
# LabelSpreading Pipeline
ls_pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    # LabelSpreading does not support sparse matrices
    ('todense', FunctionTransformer(lambda x: x.todense())),
    ('clf', LabelSpreading()),
])


def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:",
          sum(1 for x in y_train if x == -1))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Micro-averaged F1 score on test set: "
          "%0.3f" % f1_score(y_test, y_pred, average='micro'))
    print("-" * 10)
    print()


if __name__ == "__main__":
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

    # select a mask of 20% of the train dataset
    y_mask = np.random.rand(len(y_train)) < 0.2

    # X_20 and y_20 are the subset of the train dataset indicated by the mask
    X_20, y_20 = map(list, zip(*((x, y)
                                 for x, y, m in zip(X_train, y_train, y_mask) if m)))
    print("Supervised SGDClassifier on 20% of the training data:")
    eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)

    # set the non-masked subset to be unlabeled
    y_train[~y_mask] = -1
    print("SelfTrainingClassifier on 20% of the training data (rest "
          "is unlabeled):")
    eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)

    if 'CI' not in os.environ:
        # LabelSpreading takes too long to run in the online documentation
        print("LabelSpreading on 20% of the data (rest is unlabeled):")
        eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)
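The script above trains on all 20 newsgroups, which takes a while. A smaller sketch of the same self-training pipeline restricted to two categories, not part of the committed files; it assumes network access for fetch_20newsgroups and scikit-learn >= 0.24:

# Sketch only (not part of this commit): a quick two-category run of the
# self-training text pipeline shown above.
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier

data = fetch_20newsgroups(subset='train',
                          categories=['sci.space', 'rec.autos'])
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=0)

y_semi = np.copy(y_train)
keep = np.random.RandomState(0).rand(len(y_train)) < 0.2  # keep ~20% of labels
y_semi[~keep] = -1  # everything outside the mask becomes unlabeled

clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.8)),
    ('tfidf', TfidfTransformer()),
    ('clf', SelfTrainingClassifier(
        SGDClassifier(alpha=1e-5, penalty='l2', loss='log'))),
])
clf.fit(X_train, y_semi)
print("test accuracy:", accuracy_score(y_test, clf.predict(X_test)))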
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
"""
===============================================================================
Decision boundary of semi-supervised classifiers versus SVM on the Iris dataset
===============================================================================

A comparison for the decision boundaries generated on the iris dataset
by Label Spreading, Self-training and SVM.

This example demonstrates that Label Spreading and Self-training can learn
good boundaries even when small amounts of labeled data are available.

Note that Self-training with 100% of the data is omitted as it is functionally
identical to training the SVC on 100% of the data.

"""
print(__doc__)

# Authors: Clay Woolam <[email protected]>
#          Oliver Rausch <[email protected]>
# License: BSD

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import SelfTrainingClassifier


iris = datasets.load_iris()

X = iris.data[:, :2]
y = iris.target

# step size in the mesh
h = .02

rng = np.random.RandomState(0)
y_rand = rng.rand(y.shape[0])
y_30 = np.copy(y)
y_30[y_rand < 0.3] = -1  # set random samples to be unlabeled
y_50 = np.copy(y)
y_50[y_rand < 0.5] = -1
# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
ls30 = (LabelSpreading().fit(X, y_30), y_30, 'Label Spreading 30% data')
ls50 = (LabelSpreading().fit(X, y_50), y_50, 'Label Spreading 50% data')
ls100 = (LabelSpreading().fit(X, y), y, 'Label Spreading 100% data')

# the base classifier for self-training is identical to the SVC
base_classifier = SVC(kernel='rbf', gamma=.5, probability=True)
st30 = (SelfTrainingClassifier(base_classifier).fit(X, y_30),
        y_30, 'Self-training 30% data')
st50 = (SelfTrainingClassifier(base_classifier).fit(X, y_50),
        y_50, 'Self-training 50% data')

rbf_svc = (SVC(kernel='rbf', gamma=.5).fit(X, y), y, 'SVC with rbf kernel')

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

color_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)}

classifiers = (ls30, st30, ls50, st50, ls100, rbf_svc)
for i, (clf, y_train, title) in enumerate(classifiers):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(3, 2, i + 1)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.axis('off')

    # Plot also the training points
    colors = [color_map[y] for y in y_train]
    plt.scatter(X[:, 0], X[:, 1], c=colors, edgecolors='black')

    plt.title(title)

plt.suptitle("Unlabeled points are colored white", y=0.1)
plt.show()
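The plotting loop in this file uses the standard grid-prediction pattern: predict on a dense mesh, reshape, and pass the result to contourf. A stripped-down sketch of just that pattern, not part of the committed files, with a plain SVC standing in for any two-feature classifier:

# Sketch only (not part of this commit): predict on a mesh, reshape for contourf.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X = load_iris().data[:, :2]
y = load_iris().target
clf = SVC(kernel='rbf', gamma=0.5).fit(X, y)

h = 0.02  # mesh step size, as in the example above
xx, yy = np.meshgrid(np.arange(X[:, 0].min() - 1, X[:, 0].max() + 1, h),
                     np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # one prediction per grid point
plt.contourf(xx, yy, Z.reshape(xx.shape), cmap=plt.cm.Paired)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='black')
plt.show()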
