
Commit ad3d029 (1 parent: 6c7ed15)

Pushing the docs to dev/ for branch: master, commit 8f9a027b384e3365131c6fa1575557921251f2d1

703 files changed: +1549, -1558 lines changed


dev/_downloads/388641d133587cc11aa26f2dbef4b950/plot_document_classification_20newsgroups.py

Lines changed: 1 addition & 1 deletion
@@ -248,7 +248,7 @@ def benchmark(clf):
 for clf, name in (
         (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
         (Perceptron(max_iter=50), "Perceptron"),
-        (PassiveAggressiveClassifier(max_iter=50, tol=1e-3),
+        (PassiveAggressiveClassifier(max_iter=50),
          "Passive-Aggressive"),
         (KNeighborsClassifier(n_neighbors=10), "kNN"),
         (RandomForestClassifier(), "Random forest")):
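The only change here is dropping the explicit tol=1e-3 from PassiveAggressiveClassifier. A minimal sketch to check that this matches the estimator's built-in default on the installed scikit-learn version (the 1e-3 value is an assumption about recent releases, not something stated in the diff):

    from sklearn.linear_model import PassiveAggressiveClassifier

    # Inspect the stopping tolerance used when tol is not passed explicitly;
    # on recent scikit-learn releases this is expected to print 0.001.
    clf = PassiveAggressiveClassifier(max_iter=50)
    print(clf.get_params()["tol"])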

dev/_downloads/3b31bf37034a6ece04667cd422e5ff79/plot_document_classification_20newsgroups.ipynb

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dev/_downloads/58e661bc0f16de8cbd6dd7f2c1237745/plot_svm_scale_c.py

Lines changed: 1 addition & 2 deletions
@@ -112,8 +112,7 @@
 clf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
                        tol=1e-3),
              np.logspace(-2.3, -1.3, 10), X_1, y_1),
-            (LinearSVC(penalty='l2', loss='squared_hinge', dual=True,
-                       tol=1e-4),
+            (LinearSVC(penalty='l2', loss='squared_hinge', dual=True),
              np.logspace(-4.5, -2, 10), X_2, y_2)]

 colors = ['navy', 'cyan', 'darkorange']
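The same pattern applies to the l2-penalized LinearSVC: the removed tol=1e-4 is assumed to equal the estimator's default, so the fitted models in this example should be unchanged. A quick sketch to verify that default on your version:

    from sklearn.svm import LinearSVC

    # With tol omitted, the constructor default is stored on the estimator;
    # on recent scikit-learn releases this is expected to print 0.0001.
    clf = LinearSVC(penalty='l2', loss='squared_hinge', dual=True)
    print(clf.tol)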

dev/_downloads/5ba9a28ce0b812d50bf8133494f3bf19/plot_svm_scale_c.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"print(__doc__)\n\n\n# Author: Andreas Mueller <[email protected]>\n# Jaques Grobler <[email protected]>\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.svm import LinearSVC\nfrom sklearn.model_selection import ShuffleSplit\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.utils import check_random_state\nfrom sklearn import datasets\n\nrnd = check_random_state(1)\n\n# set up dataset\nn_samples = 100\nn_features = 300\n\n# l1 data (only 5 informative features)\nX_1, y_1 = datasets.make_classification(n_samples=n_samples,\n n_features=n_features, n_informative=5,\n random_state=1)\n\n# l2 data: non sparse, but less features\ny_2 = np.sign(.5 - rnd.rand(n_samples))\nX_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]\nX_2 += 5 * rnd.randn(n_samples, n_features // 5)\n\nclf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,\n tol=1e-3),\n np.logspace(-2.3, -1.3, 10), X_1, y_1),\n (LinearSVC(penalty='l2', loss='squared_hinge', dual=True,\n tol=1e-4),\n np.logspace(-4.5, -2, 10), X_2, y_2)]\n\ncolors = ['navy', 'cyan', 'darkorange']\nlw = 2\n\nfor clf, cs, X, y in clf_sets:\n # set up the plot for each regressor\n fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))\n\n for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):\n param_grid = dict(C=cs)\n # To get nice curve, we need a large number of iterations to\n # reduce the variance\n grid = GridSearchCV(clf, refit=False, param_grid=param_grid,\n cv=ShuffleSplit(train_size=train_size,\n test_size=.3,\n n_splits=250, random_state=1))\n grid.fit(X, y)\n scores = grid.cv_results_['mean_test_score']\n\n scales = [(1, 'No scaling'),\n ((n_samples * train_size), '1/n_samples'),\n ]\n\n for ax, (scaler, name) in zip(axes, scales):\n ax.set_xlabel('C')\n ax.set_ylabel('CV Score')\n grid_cs = cs * float(scaler) # scale the C's\n ax.semilogx(grid_cs, scores, label=\"fraction %.2f\" %\n train_size, color=colors[k], lw=lw)\n ax.set_title('scaling=%s, penalty=%s, loss=%s' %\n (name, clf.penalty, clf.loss))\n\n plt.legend(loc=\"best\")\nplt.show()"
+
"print(__doc__)\n\n\n# Author: Andreas Mueller <[email protected]>\n# Jaques Grobler <[email protected]>\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.svm import LinearSVC\nfrom sklearn.model_selection import ShuffleSplit\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.utils import check_random_state\nfrom sklearn import datasets\n\nrnd = check_random_state(1)\n\n# set up dataset\nn_samples = 100\nn_features = 300\n\n# l1 data (only 5 informative features)\nX_1, y_1 = datasets.make_classification(n_samples=n_samples,\n n_features=n_features, n_informative=5,\n random_state=1)\n\n# l2 data: non sparse, but less features\ny_2 = np.sign(.5 - rnd.rand(n_samples))\nX_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]\nX_2 += 5 * rnd.randn(n_samples, n_features // 5)\n\nclf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,\n tol=1e-3),\n np.logspace(-2.3, -1.3, 10), X_1, y_1),\n (LinearSVC(penalty='l2', loss='squared_hinge', dual=True),\n np.logspace(-4.5, -2, 10), X_2, y_2)]\n\ncolors = ['navy', 'cyan', 'darkorange']\nlw = 2\n\nfor clf, cs, X, y in clf_sets:\n # set up the plot for each regressor\n fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))\n\n for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):\n param_grid = dict(C=cs)\n # To get nice curve, we need a large number of iterations to\n # reduce the variance\n grid = GridSearchCV(clf, refit=False, param_grid=param_grid,\n cv=ShuffleSplit(train_size=train_size,\n test_size=.3,\n n_splits=250, random_state=1))\n grid.fit(X, y)\n scores = grid.cv_results_['mean_test_score']\n\n scales = [(1, 'No scaling'),\n ((n_samples * train_size), '1/n_samples'),\n ]\n\n for ax, (scaler, name) in zip(axes, scales):\n ax.set_xlabel('C')\n ax.set_ylabel('CV Score')\n grid_cs = cs * float(scaler) # scale the C's\n ax.semilogx(grid_cs, scores, label=\"fraction %.2f\" %\n train_size, color=colors[k], lw=lw)\n ax.set_title('scaling=%s, penalty=%s, loss=%s' %\n (name, clf.penalty, clf.loss))\n\n plt.legend(loc=\"best\")\nplt.show()"
 ]
 }
 ],

dev/_downloads/64866fb2a9398ff657578febcb91d430/plot_mnist_filters.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"import matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.neural_network import MLPClassifier\n\nprint(__doc__)\n\n# Load data from https://www.openml.org/d/554\nX, y = fetch_openml('mnist_784', version=1, return_X_y=True)\nX = X / 255.\n\n# rescale the data, use the traditional train/test split\nX_train, X_test = X[:60000], X[60000:]\ny_train, y_test = y[:60000], y[60000:]\n\n# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,\n# solver='sgd', verbose=10, tol=1e-4, random_state=1)\nmlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,\n solver='sgd', verbose=10, tol=1e-4, random_state=1,\n learning_rate_init=.1)\n\nmlp.fit(X_train, y_train)\nprint(\"Training set score: %f\" % mlp.score(X_train, y_train))\nprint(\"Test set score: %f\" % mlp.score(X_test, y_test))\n\nfig, axes = plt.subplots(4, 4)\n# use global min / max to ensure all weights are shown on the same scale\nvmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()\nfor coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):\n ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,\n vmax=.5 * vmax)\n ax.set_xticks(())\n ax.set_yticks(())\n\nplt.show()"
+
"import matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.neural_network import MLPClassifier\n\nprint(__doc__)\n\n# Load data from https://www.openml.org/d/554\nX, y = fetch_openml('mnist_784', version=1, return_X_y=True)\nX = X / 255.\n\n# rescale the data, use the traditional train/test split\nX_train, X_test = X[:60000], X[60000:]\ny_train, y_test = y[:60000], y[60000:]\n\nmlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,\n solver='sgd', verbose=10, random_state=1,\n learning_rate_init=.1)\n\nmlp.fit(X_train, y_train)\nprint(\"Training set score: %f\" % mlp.score(X_train, y_train))\nprint(\"Test set score: %f\" % mlp.score(X_test, y_test))\n\nfig, axes = plt.subplots(4, 4)\n# use global min / max to ensure all weights are shown on the same scale\nvmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()\nfor coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):\n ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,\n vmax=.5 * vmax)\n ax.set_xticks(())\n ax.set_yticks(())\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/80692cf167e9ea27b27e5bd144159c82/plot_out_of_core_classification.py

Lines changed: 1 addition & 1 deletion
@@ -210,7 +210,7 @@ def progress(blocknum, bs, size):
     'SGD': SGDClassifier(max_iter=5),
     'Perceptron': Perceptron(),
     'NB Multinomial': MultinomialNB(alpha=0.01),
-    'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),
+    'Passive-Aggressive': PassiveAggressiveClassifier(),
 }

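In this example the classifier is trained incrementally with partial_fit on mini-batches rather than with fit, and the tol-based stopping criterion applies to fit's epoch loop rather than to single partial_fit calls, so dropping tol=1e-3 should not affect the out-of-core run. A toy sketch of that incremental API on random data (not the Reuters stream used by the example):

    import numpy as np
    from sklearn.linear_model import PassiveAggressiveClassifier

    rng = np.random.RandomState(0)
    clf = PassiveAggressiveClassifier()
    classes = np.array([0, 1])

    # Each partial_fit call performs a single pass over the mini-batch,
    # so no tol-based early stopping is involved.
    for _ in range(3):
        X_batch = rng.randn(100, 20)
        y_batch = rng.randint(0, 2, size=100)
        clf.partial_fit(X_batch, y_batch, classes=classes)

    print(clf.score(X_batch, y_batch))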
dev/_downloads/9b3be64651591413a73d3848e0317ffd/plot_mnist_filters.py

Lines changed: 1 addition & 3 deletions
@@ -34,10 +34,8 @@
 X_train, X_test = X[:60000], X[60000:]
 y_train, y_test = y[:60000], y[60000:]

-# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
-#                     solver='sgd', verbose=10, tol=1e-4, random_state=1)
 mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
-                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
+                    solver='sgd', verbose=10, random_state=1,
                     learning_rate_init=.1)

 mlp.fit(X_train, y_train)
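Besides dropping the commented-out alternative configuration, the only change is removing the explicit tol=1e-4, assumed here to match MLPClassifier's default. A minimal sketch to check that default on the installed version:

    from sklearn.neural_network import MLPClassifier

    # With tol omitted, the constructor default applies; on recent
    # scikit-learn releases this is expected to print 0.0001.
    mlp = MLPClassifier(hidden_layer_sizes=(50,), solver='sgd')
    print(mlp.tol)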

dev/_downloads/b86db3a111b621a7beeaa9d099608e5b/plot_out_of_core_classification.ipynb

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@
 },
 "outputs": [],
 "source": [
-
"vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,\n alternate_sign=False)\n\n\n# Iterator over parsed Reuters SGML files.\ndata_stream = stream_reuters_documents()\n\n# We learn a binary classification between the \"acq\" class and all the others.\n# \"acq\" was chosen as it is more or less evenly distributed in the Reuters\n# files. For other datasets, one should take care of creating a test set with\n# a realistic portion of positive instances.\nall_classes = np.array([0, 1])\npositive_class = 'acq'\n\n# Here are some classifiers that support the `partial_fit` method\npartial_fit_classifiers = {\n 'SGD': SGDClassifier(max_iter=5),\n 'Perceptron': Perceptron(),\n 'NB Multinomial': MultinomialNB(alpha=0.01),\n 'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),\n}\n\n\ndef get_minibatch(doc_iter, size, pos_class=positive_class):\n \"\"\"Extract a minibatch of examples, return a tuple X_text, y.\n\n Note: size is before excluding invalid docs with no topics assigned.\n\n \"\"\"\n data = [('{title}\\n\\n{body}'.format(**doc), pos_class in doc['topics'])\n for doc in itertools.islice(doc_iter, size)\n if doc['topics']]\n if not len(data):\n return np.asarray([], dtype=int), np.asarray([], dtype=int)\n X_text, y = zip(*data)\n return X_text, np.asarray(y, dtype=int)\n\n\ndef iter_minibatches(doc_iter, minibatch_size):\n \"\"\"Generator of minibatches.\"\"\"\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n while len(X_text):\n yield X_text, y\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n\n\n# test data statistics\ntest_stats = {'n_test': 0, 'n_test_pos': 0}\n\n# First we hold out a number of examples to estimate accuracy\nn_test_documents = 1000\ntick = time.time()\nX_test_text, y_test = get_minibatch(data_stream, 1000)\nparsing_time = time.time() - tick\ntick = time.time()\nX_test = vectorizer.transform(X_test_text)\nvectorizing_time = time.time() - tick\ntest_stats['n_test'] += len(y_test)\ntest_stats['n_test_pos'] += sum(y_test)\nprint(\"Test set is %d documents (%d positive)\" % (len(y_test), sum(y_test)))\n\n\ndef progress(cls_name, stats):\n \"\"\"Report progress information, return a string.\"\"\"\n duration = time.time() - stats['t0']\n s = \"%20s classifier : \\t\" % cls_name\n s += \"%(n_train)6d train docs (%(n_train_pos)6d positive) \" % stats\n s += \"%(n_test)6d test docs (%(n_test_pos)6d positive) \" % test_stats\n s += \"accuracy: %(accuracy).3f \" % stats\n s += \"in %.2fs (%5d docs/s)\" % (duration, stats['n_train'] / duration)\n return s\n\n\ncls_stats = {}\n\nfor cls_name in partial_fit_classifiers:\n stats = {'n_train': 0, 'n_train_pos': 0,\n 'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),\n 'runtime_history': [(0, 0)], 'total_fit_time': 0.0}\n cls_stats[cls_name] = stats\n\nget_minibatch(data_stream, n_test_documents)\n# Discard test set\n\n# We will feed the classifier with mini-batches of 1000 documents; this means\n# we have at most 1000 docs in memory at any time. 
The smaller the document\n# batch, the bigger the relative overhead of the partial fit methods.\nminibatch_size = 1000\n\n# Create the data_stream that parses Reuters SGML files and iterates on\n# documents as a stream.\nminibatch_iterators = iter_minibatches(data_stream, minibatch_size)\ntotal_vect_time = 0.0\n\n# Main loop : iterate on mini-batches of examples\nfor i, (X_train_text, y_train) in enumerate(minibatch_iterators):\n\n tick = time.time()\n X_train = vectorizer.transform(X_train_text)\n total_vect_time += time.time() - tick\n\n for cls_name, cls in partial_fit_classifiers.items():\n tick = time.time()\n # update estimator with examples in the current mini-batch\n cls.partial_fit(X_train, y_train, classes=all_classes)\n\n # accumulate test accuracy stats\n cls_stats[cls_name]['total_fit_time'] += time.time() - tick\n cls_stats[cls_name]['n_train'] += X_train.shape[0]\n cls_stats[cls_name]['n_train_pos'] += sum(y_train)\n tick = time.time()\n cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)\n cls_stats[cls_name]['prediction_time'] = time.time() - tick\n acc_history = (cls_stats[cls_name]['accuracy'],\n cls_stats[cls_name]['n_train'])\n cls_stats[cls_name]['accuracy_history'].append(acc_history)\n run_history = (cls_stats[cls_name]['accuracy'],\n total_vect_time + cls_stats[cls_name]['total_fit_time'])\n cls_stats[cls_name]['runtime_history'].append(run_history)\n\n if i % 3 == 0:\n print(progress(cls_name, cls_stats[cls_name]))\n if i % 3 == 0:\n print('\\n')"
+
"vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,\n alternate_sign=False)\n\n\n# Iterator over parsed Reuters SGML files.\ndata_stream = stream_reuters_documents()\n\n# We learn a binary classification between the \"acq\" class and all the others.\n# \"acq\" was chosen as it is more or less evenly distributed in the Reuters\n# files. For other datasets, one should take care of creating a test set with\n# a realistic portion of positive instances.\nall_classes = np.array([0, 1])\npositive_class = 'acq'\n\n# Here are some classifiers that support the `partial_fit` method\npartial_fit_classifiers = {\n 'SGD': SGDClassifier(max_iter=5),\n 'Perceptron': Perceptron(),\n 'NB Multinomial': MultinomialNB(alpha=0.01),\n 'Passive-Aggressive': PassiveAggressiveClassifier(),\n}\n\n\ndef get_minibatch(doc_iter, size, pos_class=positive_class):\n \"\"\"Extract a minibatch of examples, return a tuple X_text, y.\n\n Note: size is before excluding invalid docs with no topics assigned.\n\n \"\"\"\n data = [('{title}\\n\\n{body}'.format(**doc), pos_class in doc['topics'])\n for doc in itertools.islice(doc_iter, size)\n if doc['topics']]\n if not len(data):\n return np.asarray([], dtype=int), np.asarray([], dtype=int)\n X_text, y = zip(*data)\n return X_text, np.asarray(y, dtype=int)\n\n\ndef iter_minibatches(doc_iter, minibatch_size):\n \"\"\"Generator of minibatches.\"\"\"\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n while len(X_text):\n yield X_text, y\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n\n\n# test data statistics\ntest_stats = {'n_test': 0, 'n_test_pos': 0}\n\n# First we hold out a number of examples to estimate accuracy\nn_test_documents = 1000\ntick = time.time()\nX_test_text, y_test = get_minibatch(data_stream, 1000)\nparsing_time = time.time() - tick\ntick = time.time()\nX_test = vectorizer.transform(X_test_text)\nvectorizing_time = time.time() - tick\ntest_stats['n_test'] += len(y_test)\ntest_stats['n_test_pos'] += sum(y_test)\nprint(\"Test set is %d documents (%d positive)\" % (len(y_test), sum(y_test)))\n\n\ndef progress(cls_name, stats):\n \"\"\"Report progress information, return a string.\"\"\"\n duration = time.time() - stats['t0']\n s = \"%20s classifier : \\t\" % cls_name\n s += \"%(n_train)6d train docs (%(n_train_pos)6d positive) \" % stats\n s += \"%(n_test)6d test docs (%(n_test_pos)6d positive) \" % test_stats\n s += \"accuracy: %(accuracy).3f \" % stats\n s += \"in %.2fs (%5d docs/s)\" % (duration, stats['n_train'] / duration)\n return s\n\n\ncls_stats = {}\n\nfor cls_name in partial_fit_classifiers:\n stats = {'n_train': 0, 'n_train_pos': 0,\n 'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),\n 'runtime_history': [(0, 0)], 'total_fit_time': 0.0}\n cls_stats[cls_name] = stats\n\nget_minibatch(data_stream, n_test_documents)\n# Discard test set\n\n# We will feed the classifier with mini-batches of 1000 documents; this means\n# we have at most 1000 docs in memory at any time. 
The smaller the document\n# batch, the bigger the relative overhead of the partial fit methods.\nminibatch_size = 1000\n\n# Create the data_stream that parses Reuters SGML files and iterates on\n# documents as a stream.\nminibatch_iterators = iter_minibatches(data_stream, minibatch_size)\ntotal_vect_time = 0.0\n\n# Main loop : iterate on mini-batches of examples\nfor i, (X_train_text, y_train) in enumerate(minibatch_iterators):\n\n tick = time.time()\n X_train = vectorizer.transform(X_train_text)\n total_vect_time += time.time() - tick\n\n for cls_name, cls in partial_fit_classifiers.items():\n tick = time.time()\n # update estimator with examples in the current mini-batch\n cls.partial_fit(X_train, y_train, classes=all_classes)\n\n # accumulate test accuracy stats\n cls_stats[cls_name]['total_fit_time'] += time.time() - tick\n cls_stats[cls_name]['n_train'] += X_train.shape[0]\n cls_stats[cls_name]['n_train_pos'] += sum(y_train)\n tick = time.time()\n cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)\n cls_stats[cls_name]['prediction_time'] = time.time() - tick\n acc_history = (cls_stats[cls_name]['accuracy'],\n cls_stats[cls_name]['n_train'])\n cls_stats[cls_name]['accuracy_history'].append(acc_history)\n run_history = (cls_stats[cls_name]['accuracy'],\n total_vect_time + cls_stats[cls_name]['total_fit_time'])\n cls_stats[cls_name]['runtime_history'].append(run_history)\n\n if i % 3 == 0:\n print(progress(cls_name, cls_stats[cls_name]))\n if i % 3 == 0:\n print('\\n')"
 ]
 },
 {