|
24 | 24 | "execution_count": null,
|
25 | 25 | "cell_type": "code",
|
26 | 26 | "source": [
|
27 |
| - "# Author: Peter Prettenhofer < [email protected]>\n# Olivier Grisel < [email protected]>\n# Mathieu Blondel < [email protected]>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport logging\nimport numpy as np\nfrom optparse import OptionParser\nimport sys\nfrom time import time\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_selection import SelectKBest, chi2\nfrom sklearn.linear_model import RidgeClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.naive_bayes import BernoulliNB, MultinomialNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neighbors import NearestCentroid\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.utils.extmath import density\nfrom sklearn import metrics\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\"--report\",\n action=\"store_true\", dest=\"print_report\",\n help=\"Print a detailed classification report.\")\nop.add_option(\"--chi2_select\",\n action=\"store\", type=\"int\", dest=\"select_chi2\",\n help=\"Select some number of features using a chi-squared test\")\nop.add_option(\"--confusion_matrix\",\n action=\"store_true\", dest=\"print_cm\",\n help=\"Print the confusion matrix.\")\nop.add_option(\"--top10\",\n action=\"store_true\", dest=\"print_top10\",\n help=\"Print ten most discriminative terms per class\"\n \" for every classifier.\")\nop.add_option(\"--all_categories\",\n action=\"store_true\", dest=\"all_categories\",\n help=\"Whether to use all categories or not.\")\nop.add_option(\"--use_hashing\",\n action=\"store_true\",\n help=\"Use a hashing vectorizer.\")\nop.add_option(\"--n_features\",\n action=\"store\", type=int, default=2 ** 16,\n help=\"n_features when using the hashing vectorizer.\")\nop.add_option(\"--filtered\",\n action=\"store_true\",\n help=\"Remove newsgroup information that is easily overfit: \"\n \"headers, signatures, and quoting.\")\n\n\ndef is_interactive():\n return not hasattr(sys.modules['__main__'], '__file__')\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)\n\nprint(__doc__)\nop.print_help()\nprint()" |
| 27 | + "# Author: Peter Prettenhofer < [email protected]>\n# Olivier Grisel < [email protected]>\n# Mathieu Blondel < [email protected]>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport logging\nimport numpy as np\nfrom optparse import OptionParser\nimport sys\nfrom time import time\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_selection import SelectFromModel\nfrom sklearn.feature_selection import SelectKBest, chi2\nfrom sklearn.linear_model import RidgeClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import LinearSVC\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.naive_bayes import BernoulliNB, MultinomialNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neighbors import NearestCentroid\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.utils.extmath import density\nfrom sklearn import metrics\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\"--report\",\n action=\"store_true\", dest=\"print_report\",\n help=\"Print a detailed classification report.\")\nop.add_option(\"--chi2_select\",\n action=\"store\", type=\"int\", dest=\"select_chi2\",\n help=\"Select some number of features using a chi-squared test\")\nop.add_option(\"--confusion_matrix\",\n action=\"store_true\", dest=\"print_cm\",\n help=\"Print the confusion matrix.\")\nop.add_option(\"--top10\",\n action=\"store_true\", dest=\"print_top10\",\n help=\"Print ten most discriminative terms per class\"\n \" for every classifier.\")\nop.add_option(\"--all_categories\",\n action=\"store_true\", dest=\"all_categories\",\n help=\"Whether to use all categories or not.\")\nop.add_option(\"--use_hashing\",\n action=\"store_true\",\n help=\"Use a hashing vectorizer.\")\nop.add_option(\"--n_features\",\n action=\"store\", type=int, default=2 ** 16,\n help=\"n_features when using the hashing vectorizer.\")\nop.add_option(\"--filtered\",\n action=\"store_true\",\n help=\"Remove newsgroup information that is easily overfit: \"\n \"headers, signatures, and quoting.\")\n\n\ndef is_interactive():\n return not hasattr(sys.modules['__main__'], '__file__')\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)\n\nprint(__doc__)\nop.print_help()\nprint()" |
28 | 28 | ],
|
29 | 29 | "outputs": [],
|
30 | 30 | "metadata": {
|
|
60 | 60 | "execution_count": null,
|
61 | 61 | "cell_type": "code",
|
62 | 62 | "source": [
|
63 |
| - "def benchmark(clf):\n print('_' * 80)\n print(\"Training: \")\n print(clf)\n t0 = time()\n clf.fit(X_train, y_train)\n train_time = time() - t0\n print(\"train time: %0.3fs\" % train_time)\n\n t0 = time()\n pred = clf.predict(X_test)\n test_time = time() - t0\n print(\"test time: %0.3fs\" % test_time)\n\n score = metrics.accuracy_score(y_test, pred)\n print(\"accuracy: %0.3f\" % score)\n\n if hasattr(clf, 'coef_'):\n print(\"dimensionality: %d\" % clf.coef_.shape[1])\n print(\"density: %f\" % density(clf.coef_))\n\n if opts.print_top10 and feature_names is not None:\n print(\"top 10 keywords per class:\")\n for i, label in enumerate(target_names):\n top10 = np.argsort(clf.coef_[i])[-10:]\n print(trim(\"%s: %s\" % (label, \" \".join(feature_names[top10]))))\n print()\n\n if opts.print_report:\n print(\"classification report:\")\n print(metrics.classification_report(y_test, pred,\n target_names=target_names))\n\n if opts.print_cm:\n print(\"confusion matrix:\")\n print(metrics.confusion_matrix(y_test, pred))\n\n print()\n clf_descr = str(clf).split('(')[0]\n return clf_descr, score, train_time, test_time\n\n\nresults = []\nfor clf, name in (\n (RidgeClassifier(tol=1e-2, solver=\"lsqr\"), \"Ridge Classifier\"),\n (Perceptron(n_iter=50), \"Perceptron\"),\n (PassiveAggressiveClassifier(n_iter=50), \"Passive-Aggressive\"),\n (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n (RandomForestClassifier(n_estimators=100), \"Random forest\")):\n print('=' * 80)\n print(name)\n results.append(benchmark(clf))\n\nfor penalty in [\"l2\", \"l1\"]:\n print('=' * 80)\n print(\"%s penalty\" % penalty.upper())\n # Train Liblinear model\n results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,\n dual=False, tol=1e-3)))\n\n # Train SGD model\n results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n penalty=penalty)))\n\n# Train SGD with Elastic Net penalty\nprint('=' * 80)\nprint(\"Elastic-Net penalty\")\nresults.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n penalty=\"elasticnet\")))\n\n# Train NearestCentroid without threshold\nprint('=' * 80)\nprint(\"NearestCentroid (aka Rocchio classifier)\")\nresults.append(benchmark(NearestCentroid()))\n\n# Train sparse Naive Bayes classifiers\nprint('=' * 80)\nprint(\"Naive Bayes\")\nresults.append(benchmark(MultinomialNB(alpha=.01)))\nresults.append(benchmark(BernoulliNB(alpha=.01)))\n\nprint('=' * 80)\nprint(\"LinearSVC with L1-based feature selection\")\n# The smaller C, the stronger the regularization.\n# The more regularization, the more sparsity.\nresults.append(benchmark(Pipeline([\n ('feature_selection', LinearSVC(penalty=\"l1\", dual=False, tol=1e-3)),\n ('classification', LinearSVC())\n])))\n\n# make some plots\n\nindices = np.arange(len(results))\n\nresults = [[x[i] for x in results] for i in range(4)]\n\nclf_names, score, training_time, test_time = results\ntraining_time = np.array(training_time) / np.max(training_time)\ntest_time = np.array(test_time) / np.max(test_time)\n\nplt.figure(figsize=(12, 8))\nplt.title(\"Score\")\nplt.barh(indices, score, .2, label=\"score\", color='navy')\nplt.barh(indices + .3, training_time, .2, label=\"training time\",\n color='c')\nplt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\nplt.yticks(())\nplt.legend(loc='best')\nplt.subplots_adjust(left=.25)\nplt.subplots_adjust(top=.95)\nplt.subplots_adjust(bottom=.05)\n\nfor i, c in zip(indices, clf_names):\n plt.text(-.3, i, c)\n\nplt.show()" |
| 63 | + "def benchmark(clf):\n print('_' * 80)\n print(\"Training: \")\n print(clf)\n t0 = time()\n clf.fit(X_train, y_train)\n train_time = time() - t0\n print(\"train time: %0.3fs\" % train_time)\n\n t0 = time()\n pred = clf.predict(X_test)\n test_time = time() - t0\n print(\"test time: %0.3fs\" % test_time)\n\n score = metrics.accuracy_score(y_test, pred)\n print(\"accuracy: %0.3f\" % score)\n\n if hasattr(clf, 'coef_'):\n print(\"dimensionality: %d\" % clf.coef_.shape[1])\n print(\"density: %f\" % density(clf.coef_))\n\n if opts.print_top10 and feature_names is not None:\n print(\"top 10 keywords per class:\")\n for i, label in enumerate(target_names):\n top10 = np.argsort(clf.coef_[i])[-10:]\n print(trim(\"%s: %s\" % (label, \" \".join(feature_names[top10]))))\n print()\n\n if opts.print_report:\n print(\"classification report:\")\n print(metrics.classification_report(y_test, pred,\n target_names=target_names))\n\n if opts.print_cm:\n print(\"confusion matrix:\")\n print(metrics.confusion_matrix(y_test, pred))\n\n print()\n clf_descr = str(clf).split('(')[0]\n return clf_descr, score, train_time, test_time\n\n\nresults = []\nfor clf, name in (\n (RidgeClassifier(tol=1e-2, solver=\"lsqr\"), \"Ridge Classifier\"),\n (Perceptron(n_iter=50), \"Perceptron\"),\n (PassiveAggressiveClassifier(n_iter=50), \"Passive-Aggressive\"),\n (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n (RandomForestClassifier(n_estimators=100), \"Random forest\")):\n print('=' * 80)\n print(name)\n results.append(benchmark(clf))\n\nfor penalty in [\"l2\", \"l1\"]:\n print('=' * 80)\n print(\"%s penalty\" % penalty.upper())\n # Train Liblinear model\n results.append(benchmark(LinearSVC(penalty=penalty, dual=False,\n tol=1e-3)))\n\n # Train SGD model\n results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n penalty=penalty)))\n\n# Train SGD with Elastic Net penalty\nprint('=' * 80)\nprint(\"Elastic-Net penalty\")\nresults.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,\n penalty=\"elasticnet\")))\n\n# Train NearestCentroid without threshold\nprint('=' * 80)\nprint(\"NearestCentroid (aka Rocchio classifier)\")\nresults.append(benchmark(NearestCentroid()))\n\n# Train sparse Naive Bayes classifiers\nprint('=' * 80)\nprint(\"Naive Bayes\")\nresults.append(benchmark(MultinomialNB(alpha=.01)))\nresults.append(benchmark(BernoulliNB(alpha=.01)))\n\nprint('=' * 80)\nprint(\"LinearSVC with L1-based feature selection\")\n# The smaller C, the stronger the regularization.\n# The more regularization, the more sparsity.\nresults.append(benchmark(Pipeline([\n ('feature_selection', SelectFromModel(LinearSVC(penalty=\"l1\", dual=False,\n tol=1e-3))),\n ('classification', LinearSVC(penalty=\"l2\"))])))\n\n# make some plots\n\nindices = np.arange(len(results))\n\nresults = [[x[i] for x in results] for i in range(4)]\n\nclf_names, score, training_time, test_time = results\ntraining_time = np.array(training_time) / np.max(training_time)\ntest_time = np.array(test_time) / np.max(test_time)\n\nplt.figure(figsize=(12, 8))\nplt.title(\"Score\")\nplt.barh(indices, score, .2, label=\"score\", color='navy')\nplt.barh(indices + .3, training_time, .2, label=\"training time\",\n color='c')\nplt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\nplt.yticks(())\nplt.legend(loc='best')\nplt.subplots_adjust(left=.25)\nplt.subplots_adjust(top=.95)\nplt.subplots_adjust(bottom=.05)\n\nfor i, c in zip(indices, clf_names):\n plt.text(-.3, i, c)\n\nplt.show()" |
64 | 64 | ],
|
65 | 65 | "outputs": [],
|
66 | 66 | "metadata": {
|
|
0 commit comments