scikit-learn
diff --git a/‎dev/_downloads/auto_examples_jupyter.zip
119 Bytes b/‎dev/_downloads/auto_examples_jupyter.zip
119 Bytes
diff --git a/‎dev/_downloads/auto_examples_python.zip
116 Bytes b/‎dev/_downloads/auto_examples_python.zip
116 Bytes
diff --git a/‎dev/_downloads/document_classification_20newsgroups.ipynb
Lines changed: 2 additions & 2 deletions b/‎dev/_downloads/document_classification_20newsgroups.ipynb
Lines changed: 2 additions & 2 deletions
diff --git a/‎dev/_downloads/document_classification_20newsgroups.py
Lines changed: 6 additions & 5 deletions b/‎dev/_downloads/document_classification_20newsgroups.py
Lines changed: 6 additions & 5 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.pdf
6.57 KB b/‎dev/_downloads/scikit-learn-docs.pdf
6.57 KB
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-20 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-20 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
-20 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
-20 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
677 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
677 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0031.png
677 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0031.png
677 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
-52 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
-52 Bytes
@@ -24,7 +24,7 @@
       "execution_count": null, 
       "cell_type": "code", 
       "source": [
-        "# Author: Peter Prettenhofer <[email protected]>\n#         Olivier Grisel <[email protected]>\n#         Mathieu Blondel <[email protected]>\n#         Lars Buitinck\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport logging\nimport numpy as np\nfrom optparse import OptionParser\nimport sys\nfrom time import time\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_selection import SelectKBest, chi2\nfrom sklearn.linear_model import RidgeClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.naive_bayes import BernoulliNB, MultinomialNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neighbors import NearestCentroid\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.utils.extmath import density\nfrom sklearn import metrics\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n                    format='%(asctime)s %(levelname)s %(message)s')\n\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\"--report\",\n              action=\"store_true\", dest=\"print_report\",\n              help=\"Print a detailed classification report.\")\nop.add_option(\"--chi2_select\",\n              action=\"store\", type=\"int\", dest=\"select_chi2\",\n              help=\"Select some number of features using a chi-squared test\")\nop.add_option(\"--confusion_matrix\",\n              action=\"store_true\", dest=\"print_cm\",\n              help=\"Print the confusion matrix.\")\nop.add_option(\"--top10\",\n              action=\"store_true\", dest=\"print_top10\",\n              help=\"Print ten most discriminative terms per class\"\n                   \" for every classifier.\")\nop.add_option(\"--all_categories\",\n              action=\"store_true\", dest=\"all_categories\",\n              help=\"Whether to use all categories or not.\")\nop.add_option(\"--use_hashing\",\n              action=\"store_true\",\n              help=\"Use a hashing vectorizer.\")\nop.add_option(\"--n_features\",\n              action=\"store\", type=int, default=2 ** 16,\n              help=\"n_features when using the hashing vectorizer.\")\nop.add_option(\"--filtered\",\n              action=\"store_true\",\n              help=\"Remove newsgroup information that is easily overfit: \"\n                   \"headers, signatures, and quoting.\")\n\n\ndef is_interactive():\n    return not hasattr(sys.modules['__main__'], '__file__')\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n    op.error(\"this script takes no arguments.\")\n    sys.exit(1)\n\nprint(__doc__)\nop.print_help()\nprint()"
+        "# Author: Peter Prettenhofer <[email protected]>\n#         Olivier Grisel <[email protected]>\n#         Mathieu Blondel <[email protected]>\n#         Lars Buitinck\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport logging\nimport numpy as np\nfrom optparse import OptionParser\nimport sys\nfrom time import time\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_selection import SelectFromModel\nfrom sklearn.feature_selection import SelectKBest, chi2\nfrom sklearn.linear_model import RidgeClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.naive_bayes import BernoulliNB, MultinomialNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neighbors import NearestCentroid\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.utils.extmath import density\nfrom sklearn import metrics\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n                    format='%(asctime)s %(levelname)s %(message)s')\n\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\"--report\",\n              action=\"store_true\", dest=\"print_report\",\n              help=\"Print a detailed classification report.\")\nop.add_option(\"--chi2_select\",\n              action=\"store\", type=\"int\", dest=\"select_chi2\",\n              help=\"Select some number of features using a chi-squared test\")\nop.add_option(\"--confusion_matrix\",\n              action=\"store_true\", dest=\"print_cm\",\n              help=\"Print the confusion matrix.\")\nop.add_option(\"--top10\",\n              action=\"store_true\", dest=\"print_top10\",\n              help=\"Print ten most discriminative terms per class\"\n                   \" for every classifier.\")\nop.add_option(\"--all_categories\",\n              action=\"store_true\", dest=\"all_categories\",\n              help=\"Whether to use all categories or not.\")\nop.add_option(\"--use_hashing\",\n              action=\"store_true\",\n              help=\"Use a hashing vectorizer.\")\nop.add_option(\"--n_features\",\n              action=\"store\", type=int, default=2 ** 16,\n              help=\"n_features when using the hashing vectorizer.\")\nop.add_option(\"--filtered\",\n              action=\"store_true\",\n              help=\"Remove newsgroup information that is easily overfit: \"\n                   \"headers, signatures, and quoting.\")\n\n\ndef is_interactive():\n    return not hasattr(sys.modules['__main__'], '__file__')\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n    op.error(\"this script takes no arguments.\")\n    sys.exit(1)\n\nprint(__doc__)\nop.print_help()\nprint()"
       ], 
       "outputs": [], 
       "metadata": {
@@ -60,7 +60,7 @@
       "execution_count": null, 
       "cell_type": "code", 
       "source": [
-        "def benchmark(clf):\n    print('_' * 80)\n    print(\"Training: \")\n    print(clf)\n    t0 = time()\n    clf.fit(X_train, y_train)\n    train_time = time() - t0\n    print(\"train time: %0.3fs\" % train_time)\n\n    t0 = time()\n    pred = clf.predict(X_test)\n    test_time = time() - t0\n    print(\"test time:  %0.3fs\" % test_time)\n\n    score = metrics.accuracy_score(y_test, pred)\n    print(\"accuracy:   %0.3f\" % score)\n\n    if hasattr(clf, 'coef_'):\n        print(\"dimensionality: %d\" % clf.coef_.shape[1])\n        print(\"density: %f\" % density(clf.coef_))\n\n        if opts.print_top10 and feature_names is not None:\n            print(\"top 10 keywords per class:\")\n            for i, label in enumerate(target_names):\n                top10 = np.argsort(clf.coef_[i])[-10:]\n                print(trim(\"%s: %s\" % (label, \" \".join(feature_names[top10]))))\n        print()\n\n    if opts.print_report:\n        print(\"classification report:\")\n        print(metrics.classification_report(y_test, pred,\n                                            target_names=target_names))\n\n    if opts.print_cm:\n        print(\"confusion matrix:\")\n        print(metrics.confusion_matrix(y_test, pred))\n\n    print()\n    clf_descr = str(clf).split('(')[0]\n    return clf_descr, score, train_time, test_time\n\n\nresults = []\nfor clf, name in (\n        (RidgeClassifier(tol=1e-2, solver=\"lsqr\"), \"Ridge Classifier\"),\n        (Perceptron(n_iter=50), \"Perceptron\"),\n        (PassiveAggressiveClassifier(n_iter=50), \"Passive-Aggressive\"),\n        (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n        (RandomForestClassifier(n_estimators=100), \"Random forest\")):\n    print('=' * 80)\n    print(name)\n    results.append(benchmark(clf))\n\nfor penalty in [\"l2\", \"l1\"]:\n    print('=' * 80)\n    print(\"%s penalty\" % penalty.upper())\n    # Train Liblinear model\n    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,\n                                            dual=False, tol=1e-3)))\n\n    # Train SGD model\n    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n                                           penalty=penalty)))\n\n# Train SGD with Elastic Net penalty\nprint('=' * 80)\nprint(\"Elastic-Net penalty\")\nresults.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n                                       penalty=\"elasticnet\")))\n\n# Train NearestCentroid without threshold\nprint('=' * 80)\nprint(\"NearestCentroid (aka Rocchio classifier)\")\nresults.append(benchmark(NearestCentroid()))\n\n# Train sparse Naive Bayes classifiers\nprint('=' * 80)\nprint(\"Naive Bayes\")\nresults.append(benchmark(MultinomialNB(alpha=.01)))\nresults.append(benchmark(BernoulliNB(alpha=.01)))\n\nprint('=' * 80)\nprint(\"LinearSVC with L1-based feature selection\")\n# The smaller C, the stronger the regularization.\n# The more regularization, the more sparsity.\nresults.append(benchmark(Pipeline([\n  ('feature_selection', LinearSVC(penalty=\"l1\", dual=False, tol=1e-3)),\n  ('classification', LinearSVC())\n])))\n\n# make some plots\n\nindices = np.arange(len(results))\n\nresults = [[x[i] for x in results] for i in range(4)]\n\nclf_names, score, training_time, test_time = results\ntraining_time = np.array(training_time) / np.max(training_time)\ntest_time = np.array(test_time) / np.max(test_time)\n\nplt.figure(figsize=(12, 8))\nplt.title(\"Score\")\nplt.barh(indices, score, .2, label=\"score\", color='navy')\nplt.barh(indices + .3, training_time, .2, label=\"training time\",\n         color='c')\nplt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\nplt.yticks(())\nplt.legend(loc='best')\nplt.subplots_adjust(left=.25)\nplt.subplots_adjust(top=.95)\nplt.subplots_adjust(bottom=.05)\n\nfor i, c in zip(indices, clf_names):\n    plt.text(-.3, i, c)\n\nplt.show()"
+        "def benchmark(clf):\n    print('_' * 80)\n    print(\"Training: \")\n    print(clf)\n    t0 = time()\n    clf.fit(X_train, y_train)\n    train_time = time() - t0\n    print(\"train time: %0.3fs\" % train_time)\n\n    t0 = time()\n    pred = clf.predict(X_test)\n    test_time = time() - t0\n    print(\"test time:  %0.3fs\" % test_time)\n\n    score = metrics.accuracy_score(y_test, pred)\n    print(\"accuracy:   %0.3f\" % score)\n\n    if hasattr(clf, 'coef_'):\n        print(\"dimensionality: %d\" % clf.coef_.shape[1])\n        print(\"density: %f\" % density(clf.coef_))\n\n        if opts.print_top10 and feature_names is not None:\n            print(\"top 10 keywords per class:\")\n            for i, label in enumerate(target_names):\n                top10 = np.argsort(clf.coef_[i])[-10:]\n                print(trim(\"%s: %s\" % (label, \" \".join(feature_names[top10]))))\n        print()\n\n    if opts.print_report:\n        print(\"classification report:\")\n        print(metrics.classification_report(y_test, pred,\n                                            target_names=target_names))\n\n    if opts.print_cm:\n        print(\"confusion matrix:\")\n        print(metrics.confusion_matrix(y_test, pred))\n\n    print()\n    clf_descr = str(clf).split('(')[0]\n    return clf_descr, score, train_time, test_time\n\n\nresults = []\nfor clf, name in (\n        (RidgeClassifier(tol=1e-2, solver=\"lsqr\"), \"Ridge Classifier\"),\n        (Perceptron(n_iter=50), \"Perceptron\"),\n        (PassiveAggressiveClassifier(n_iter=50), \"Passive-Aggressive\"),\n        (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n        (RandomForestClassifier(n_estimators=100), \"Random forest\")):\n    print('=' * 80)\n    print(name)\n    results.append(benchmark(clf))\n\nfor penalty in [\"l2\", \"l1\"]:\n    print('=' * 80)\n    print(\"%s penalty\" % penalty.upper())\n    # Train Liblinear model\n    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,\n                                       tol=1e-3)))\n\n    # Train SGD model\n    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n                                           penalty=penalty)))\n\n# Train SGD with Elastic Net penalty\nprint('=' * 80)\nprint(\"Elastic-Net penalty\")\nresults.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n                                       penalty=\"elasticnet\")))\n\n# Train NearestCentroid without threshold\nprint('=' * 80)\nprint(\"NearestCentroid (aka Rocchio classifier)\")\nresults.append(benchmark(NearestCentroid()))\n\n# Train sparse Naive Bayes classifiers\nprint('=' * 80)\nprint(\"Naive Bayes\")\nresults.append(benchmark(MultinomialNB(alpha=.01)))\nresults.append(benchmark(BernoulliNB(alpha=.01)))\n\nprint('=' * 80)\nprint(\"LinearSVC with L1-based feature selection\")\n# The smaller C, the stronger the regularization.\n# The more regularization, the more sparsity.\nresults.append(benchmark(Pipeline([\n  ('feature_selection', SelectFromModel(LinearSVC(penalty=\"l1\", dual=False,\n                                                  tol=1e-3))),\n  ('classification', LinearSVC(penalty=\"l2\"))])))\n\n# make some plots\n\nindices = np.arange(len(results))\n\nresults = [[x[i] for x in results] for i in range(4)]\n\nclf_names, score, training_time, test_time = results\ntraining_time = np.array(training_time) / np.max(training_time)\ntest_time = np.array(test_time) / np.max(test_time)\n\nplt.figure(figsize=(12, 8))\nplt.title(\"Score\")\nplt.barh(indices, score, .2, label=\"score\", color='navy')\nplt.barh(indices + .3, training_time, .2, label=\"training time\",\n         color='c')\nplt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\nplt.yticks(())\nplt.legend(loc='best')\nplt.subplots_adjust(left=.25)\nplt.subplots_adjust(top=.95)\nplt.subplots_adjust(bottom=.05)\n\nfor i, c in zip(indices, clf_names):\n    plt.text(-.3, i, c)\n\nplt.show()"
       ], 
       "outputs": [], 
       "metadata": {
 
@@ -34,6 +34,7 @@
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_extraction.text import HashingVectorizer
+from sklearn.feature_selection import SelectFromModel
 from sklearn.feature_selection import SelectKBest, chi2
 from sklearn.linear_model import RidgeClassifier
 from sklearn.pipeline import Pipeline
@@ -259,8 +260,8 @@ def benchmark(clf):
     print('=' * 80)
     print("%s penalty" % penalty.upper())
     # Train Liblinear model
-    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
-                                            dual=False, tol=1e-3)))
+    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
+                                       tol=1e-3)))
 
     # Train SGD model
     results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
@@ -288,9 +289,9 @@ def benchmark(clf):
 # The smaller C, the stronger the regularization.
 # The more regularization, the more sparsity.
 results.append(benchmark(Pipeline([
-  ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
-  ('classification', LinearSVC())
-])))
+  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
+                                                  tol=1e-3))),
+  ('classification', LinearSVC(penalty="l2"))])))
 
 # make some plots