Skip to content

Commit 7dfb450

Browse files
committed
Pushing the docs to dev/ for branch: master, commit 3dc8d2f83bfb94dbb5a7190bac39e046063db4e0
1 parent eb29f21 commit 7dfb450

File tree

907 files changed

+2571
-2564
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

907 files changed

+2571
-2564
lines changed
119 Bytes
Binary file not shown.
116 Bytes
Binary file not shown.

dev/_downloads/document_classification_20newsgroups.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"execution_count": null,
2525
"cell_type": "code",
2626
"source": [
27-
"# Author: Peter Prettenhofer <[email protected]>\n# Olivier Grisel <[email protected]>\n# Mathieu Blondel <[email protected]>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport logging\nimport numpy as np\nfrom optparse import OptionParser\nimport sys\nfrom time import time\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_selection import SelectKBest, chi2\nfrom sklearn.linear_model import RidgeClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.naive_bayes import BernoulliNB, MultinomialNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neighbors import NearestCentroid\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.utils.extmath import density\nfrom sklearn import metrics\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\"--report\",\n action=\"store_true\", dest=\"print_report\",\n help=\"Print a detailed classification report.\")\nop.add_option(\"--chi2_select\",\n action=\"store\", type=\"int\", dest=\"select_chi2\",\n help=\"Select some number of features using a chi-squared test\")\nop.add_option(\"--confusion_matrix\",\n action=\"store_true\", dest=\"print_cm\",\n help=\"Print the confusion matrix.\")\nop.add_option(\"--top10\",\n action=\"store_true\", dest=\"print_top10\",\n help=\"Print ten most discriminative terms per class\"\n \" for every classifier.\")\nop.add_option(\"--all_categories\",\n action=\"store_true\", dest=\"all_categories\",\n help=\"Whether to use all categories or not.\")\nop.add_option(\"--use_hashing\",\n action=\"store_true\",\n help=\"Use a hashing vectorizer.\")\nop.add_option(\"--n_features\",\n action=\"store\", type=int, default=2 ** 16,\n help=\"n_features when using the hashing vectorizer.\")\nop.add_option(\"--filtered\",\n action=\"store_true\",\n help=\"Remove newsgroup information that is easily overfit: \"\n \"headers, signatures, and quoting.\")\n\n\ndef is_interactive():\n return not hasattr(sys.modules['__main__'], '__file__')\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)\n\nprint(__doc__)\nop.print_help()\nprint()"
27+
"# Author: Peter Prettenhofer <[email protected]>\n# Olivier Grisel <[email protected]>\n# Mathieu Blondel <[email protected]>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport logging\nimport numpy as np\nfrom optparse import OptionParser\nimport sys\nfrom time import time\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_selection import SelectFromModel\nfrom sklearn.feature_selection import SelectKBest, chi2\nfrom sklearn.linear_model import RidgeClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.naive_bayes import BernoulliNB, MultinomialNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neighbors import NearestCentroid\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.utils.extmath import density\nfrom sklearn import metrics\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\"--report\",\n action=\"store_true\", dest=\"print_report\",\n help=\"Print a detailed classification report.\")\nop.add_option(\"--chi2_select\",\n action=\"store\", type=\"int\", dest=\"select_chi2\",\n help=\"Select some number of features using a chi-squared test\")\nop.add_option(\"--confusion_matrix\",\n action=\"store_true\", dest=\"print_cm\",\n help=\"Print the confusion matrix.\")\nop.add_option(\"--top10\",\n action=\"store_true\", dest=\"print_top10\",\n help=\"Print ten most discriminative terms per class\"\n \" for every classifier.\")\nop.add_option(\"--all_categories\",\n action=\"store_true\", dest=\"all_categories\",\n help=\"Whether to use all categories or not.\")\nop.add_option(\"--use_hashing\",\n action=\"store_true\",\n help=\"Use a hashing vectorizer.\")\nop.add_option(\"--n_features\",\n action=\"store\", type=int, default=2 ** 16,\n help=\"n_features when using the hashing vectorizer.\")\nop.add_option(\"--filtered\",\n action=\"store_true\",\n help=\"Remove newsgroup information that is easily overfit: \"\n \"headers, signatures, and quoting.\")\n\n\ndef is_interactive():\n return not hasattr(sys.modules['__main__'], '__file__')\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)\n\nprint(__doc__)\nop.print_help()\nprint()"
2828
],
2929
"outputs": [],
3030
"metadata": {
@@ -60,7 +60,7 @@
6060
"execution_count": null,
6161
"cell_type": "code",
6262
"source": [
63-
"def benchmark(clf):\n print('_' * 80)\n print(\"Training: \")\n print(clf)\n t0 = time()\n clf.fit(X_train, y_train)\n train_time = time() - t0\n print(\"train time: %0.3fs\" % train_time)\n\n t0 = time()\n pred = clf.predict(X_test)\n test_time = time() - t0\n print(\"test time: %0.3fs\" % test_time)\n\n score = metrics.accuracy_score(y_test, pred)\n print(\"accuracy: %0.3f\" % score)\n\n if hasattr(clf, 'coef_'):\n print(\"dimensionality: %d\" % clf.coef_.shape[1])\n print(\"density: %f\" % density(clf.coef_))\n\n if opts.print_top10 and feature_names is not None:\n print(\"top 10 keywords per class:\")\n for i, label in enumerate(target_names):\n top10 = np.argsort(clf.coef_[i])[-10:]\n print(trim(\"%s: %s\" % (label, \" \".join(feature_names[top10]))))\n print()\n\n if opts.print_report:\n print(\"classification report:\")\n print(metrics.classification_report(y_test, pred,\n target_names=target_names))\n\n if opts.print_cm:\n print(\"confusion matrix:\")\n print(metrics.confusion_matrix(y_test, pred))\n\n print()\n clf_descr = str(clf).split('(')[0]\n return clf_descr, score, train_time, test_time\n\n\nresults = []\nfor clf, name in (\n (RidgeClassifier(tol=1e-2, solver=\"lsqr\"), \"Ridge Classifier\"),\n (Perceptron(n_iter=50), \"Perceptron\"),\n (PassiveAggressiveClassifier(n_iter=50), \"Passive-Aggressive\"),\n (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n (RandomForestClassifier(n_estimators=100), \"Random forest\")):\n print('=' * 80)\n print(name)\n results.append(benchmark(clf))\n\nfor penalty in [\"l2\", \"l1\"]:\n print('=' * 80)\n print(\"%s penalty\" % penalty.upper())\n # Train Liblinear model\n results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,\n dual=False, tol=1e-3)))\n\n # Train SGD model\n results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n penalty=penalty)))\n\n# Train SGD with Elastic Net penalty\nprint('=' * 80)\nprint(\"Elastic-Net penalty\")\nresults.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n penalty=\"elasticnet\")))\n\n# Train NearestCentroid without threshold\nprint('=' * 80)\nprint(\"NearestCentroid (aka Rocchio classifier)\")\nresults.append(benchmark(NearestCentroid()))\n\n# Train sparse Naive Bayes classifiers\nprint('=' * 80)\nprint(\"Naive Bayes\")\nresults.append(benchmark(MultinomialNB(alpha=.01)))\nresults.append(benchmark(BernoulliNB(alpha=.01)))\n\nprint('=' * 80)\nprint(\"LinearSVC with L1-based feature selection\")\n# The smaller C, the stronger the regularization.\n# The more regularization, the more sparsity.\nresults.append(benchmark(Pipeline([\n ('feature_selection', LinearSVC(penalty=\"l1\", dual=False, tol=1e-3)),\n ('classification', LinearSVC())\n])))\n\n# make some plots\n\nindices = np.arange(len(results))\n\nresults = [[x[i] for x in results] for i in range(4)]\n\nclf_names, score, training_time, test_time = results\ntraining_time = np.array(training_time) / np.max(training_time)\ntest_time = np.array(test_time) / np.max(test_time)\n\nplt.figure(figsize=(12, 8))\nplt.title(\"Score\")\nplt.barh(indices, score, .2, label=\"score\", color='navy')\nplt.barh(indices + .3, training_time, .2, label=\"training time\",\n color='c')\nplt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\nplt.yticks(())\nplt.legend(loc='best')\nplt.subplots_adjust(left=.25)\nplt.subplots_adjust(top=.95)\nplt.subplots_adjust(bottom=.05)\n\nfor i, c in zip(indices, clf_names):\n plt.text(-.3, i, c)\n\nplt.show()"
63+
"def benchmark(clf):\n print('_' * 80)\n print(\"Training: \")\n print(clf)\n t0 = time()\n clf.fit(X_train, y_train)\n train_time = time() - t0\n print(\"train time: %0.3fs\" % train_time)\n\n t0 = time()\n pred = clf.predict(X_test)\n test_time = time() - t0\n print(\"test time: %0.3fs\" % test_time)\n\n score = metrics.accuracy_score(y_test, pred)\n print(\"accuracy: %0.3f\" % score)\n\n if hasattr(clf, 'coef_'):\n print(\"dimensionality: %d\" % clf.coef_.shape[1])\n print(\"density: %f\" % density(clf.coef_))\n\n if opts.print_top10 and feature_names is not None:\n print(\"top 10 keywords per class:\")\n for i, label in enumerate(target_names):\n top10 = np.argsort(clf.coef_[i])[-10:]\n print(trim(\"%s: %s\" % (label, \" \".join(feature_names[top10]))))\n print()\n\n if opts.print_report:\n print(\"classification report:\")\n print(metrics.classification_report(y_test, pred,\n target_names=target_names))\n\n if opts.print_cm:\n print(\"confusion matrix:\")\n print(metrics.confusion_matrix(y_test, pred))\n\n print()\n clf_descr = str(clf).split('(')[0]\n return clf_descr, score, train_time, test_time\n\n\nresults = []\nfor clf, name in (\n (RidgeClassifier(tol=1e-2, solver=\"lsqr\"), \"Ridge Classifier\"),\n (Perceptron(n_iter=50), \"Perceptron\"),\n (PassiveAggressiveClassifier(n_iter=50), \"Passive-Aggressive\"),\n (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n (RandomForestClassifier(n_estimators=100), \"Random forest\")):\n print('=' * 80)\n print(name)\n results.append(benchmark(clf))\n\nfor penalty in [\"l2\", \"l1\"]:\n print('=' * 80)\n print(\"%s penalty\" % penalty.upper())\n # Train Liblinear model\n results.append(benchmark(LinearSVC(penalty=penalty, dual=False,\n tol=1e-3)))\n\n # Train SGD model\n results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n penalty=penalty)))\n\n# Train SGD with Elastic Net penalty\nprint('=' * 80)\nprint(\"Elastic-Net penalty\")\nresults.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n penalty=\"elasticnet\")))\n\n# Train NearestCentroid without threshold\nprint('=' * 80)\nprint(\"NearestCentroid (aka Rocchio classifier)\")\nresults.append(benchmark(NearestCentroid()))\n\n# Train sparse Naive Bayes classifiers\nprint('=' * 80)\nprint(\"Naive Bayes\")\nresults.append(benchmark(MultinomialNB(alpha=.01)))\nresults.append(benchmark(BernoulliNB(alpha=.01)))\n\nprint('=' * 80)\nprint(\"LinearSVC with L1-based feature selection\")\n# The smaller C, the stronger the regularization.\n# The more regularization, the more sparsity.\nresults.append(benchmark(Pipeline([\n ('feature_selection', SelectFromModel(LinearSVC(penalty=\"l1\", dual=False,\n tol=1e-3))),\n ('classification', LinearSVC(penalty=\"l2\"))])))\n\n# make some plots\n\nindices = np.arange(len(results))\n\nresults = [[x[i] for x in results] for i in range(4)]\n\nclf_names, score, training_time, test_time = results\ntraining_time = np.array(training_time) / np.max(training_time)\ntest_time = np.array(test_time) / np.max(test_time)\n\nplt.figure(figsize=(12, 8))\nplt.title(\"Score\")\nplt.barh(indices, score, .2, label=\"score\", color='navy')\nplt.barh(indices + .3, training_time, .2, label=\"training time\",\n color='c')\nplt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\nplt.yticks(())\nplt.legend(loc='best')\nplt.subplots_adjust(left=.25)\nplt.subplots_adjust(top=.95)\nplt.subplots_adjust(bottom=.05)\n\nfor i, c in zip(indices, clf_names):\n plt.text(-.3, i, c)\n\nplt.show()"
6464
],
6565
"outputs": [],
6666
"metadata": {

dev/_downloads/document_classification_20newsgroups.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from sklearn.datasets import fetch_20newsgroups
3535
from sklearn.feature_extraction.text import TfidfVectorizer
3636
from sklearn.feature_extraction.text import HashingVectorizer
37+
from sklearn.feature_selection import SelectFromModel
3738
from sklearn.feature_selection import SelectKBest, chi2
3839
from sklearn.linear_model import RidgeClassifier
3940
from sklearn.pipeline import Pipeline
@@ -259,8 +260,8 @@ def benchmark(clf):
259260
print('=' * 80)
260261
print("%s penalty" % penalty.upper())
261262
# Train Liblinear model
262-
results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
263-
dual=False, tol=1e-3)))
263+
results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
264+
tol=1e-3)))
264265

265266
# Train SGD model
266267
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
@@ -288,9 +289,9 @@ def benchmark(clf):
288289
# The smaller C, the stronger the regularization.
289290
# The more regularization, the more sparsity.
290291
results.append(benchmark(Pipeline([
291-
('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
292-
('classification', LinearSVC())
293-
])))
292+
('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
293+
tol=1e-3))),
294+
('classification', LinearSVC(penalty="l2"))])))
294295

295296
# make some plots
296297

dev/_downloads/scikit-learn-docs.pdf

6.57 KB
Binary file not shown.
-20 Bytes
-20 Bytes
677 Bytes
677 Bytes
-52 Bytes

0 commit comments

Comments
 (0)