scikit-learn
diff --git a/‎dev/_downloads/auto_examples_jupyter.zip
304 Bytes b/‎dev/_downloads/auto_examples_jupyter.zip
304 Bytes
diff --git a/‎dev/_downloads/auto_examples_python.zip
299 Bytes b/‎dev/_downloads/auto_examples_python.zip
299 Bytes
diff --git a/‎dev/_downloads/document_classification_20newsgroups.ipynb
Lines changed: 1 addition & 1 deletion b/‎dev/_downloads/document_classification_20newsgroups.ipynb
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/document_classification_20newsgroups.py
Lines changed: 7 additions & 4 deletions b/‎dev/_downloads/document_classification_20newsgroups.py
Lines changed: 7 additions & 4 deletions
diff --git a/‎dev/_downloads/grid_search_text_feature_extraction.ipynb
Lines changed: 1 addition & 1 deletion b/‎dev/_downloads/grid_search_text_feature_extraction.ipynb
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/grid_search_text_feature_extraction.py
Lines changed: 1 addition & 0 deletions b/‎dev/_downloads/grid_search_text_feature_extraction.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎dev/_downloads/plot_model_complexity_influence.ipynb
Lines changed: 1 addition & 1 deletion b/‎dev/_downloads/plot_model_complexity_influence.ipynb
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/plot_model_complexity_influence.py
Lines changed: 1 addition & 1 deletion b/‎dev/_downloads/plot_model_complexity_influence.py
Lines changed: 1 addition & 1 deletion
@@ -248,8 +248,9 @@ def benchmark(clf):
 results = []
 for clf, name in (
         (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
-        (Perceptron(n_iter=50), "Perceptron"),
-        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
+        (Perceptron(n_iter=50, tol=1e-3), "Perceptron"),
+        (PassiveAggressiveClassifier(n_iter=50, tol=1e-3),
+         "Passive-Aggressive"),
         (KNeighborsClassifier(n_neighbors=10), "kNN"),
         (RandomForestClassifier(n_estimators=100), "Random forest")):
     print('=' * 80)
@@ -265,13 +266,15 @@ def benchmark(clf):
 
     # Train SGD model
     results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
-                                           penalty=penalty)))
+                                           penalty=penalty,
+                                           max_iter=5)))
 
 # Train SGD with Elastic Net penalty
 print('=' * 80)
 print("Elastic-Net penalty")
 results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
-                                       penalty="elasticnet")))
+                                       penalty="elasticnet",
+                                       max_iter=5)))
 
 # Train NearestCentroid without threshold
 print('=' * 80)
 
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "# Author: Olivier Grisel <[email protected]>\n#         Peter Prettenhofer <[email protected]>\n#         Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n                    format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n    'alt.atheism',\n    'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n    ('vect', CountVectorizer()),\n    ('tfidf', TfidfTransformer()),\n    ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n    'vect__max_df': (0.5, 0.75, 1.0),\n    #'vect__max_features': (None, 5000, 10000, 50000),\n    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams\n    #'tfidf__use_idf': (True, False),\n    #'tfidf__norm': ('l1', 'l2'),\n    'clf__alpha': (0.00001, 0.000001),\n    'clf__penalty': ('l2', 'elasticnet'),\n    #'clf__n_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n    # multiprocessing requires the fork to happen in a __main__ protected\n    # block\n\n    # find the best parameters for both the feature extraction and the\n    # classifier\n    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\n    print(\"Performing grid search...\")\n    print(\"pipeline:\", [name for name, _ in pipeline.steps])\n    print(\"parameters:\")\n    pprint(parameters)\n    t0 = time()\n    grid_search.fit(data.data, data.target)\n    print(\"done in %0.3fs\" % (time() - t0))\n    print()\n\n    print(\"Best score: %0.3f\" % grid_search.best_score_)\n    print(\"Best parameters set:\")\n    best_parameters = grid_search.best_estimator_.get_params()\n    for param_name in sorted(parameters.keys()):\n        print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
+        "# Author: Olivier Grisel <[email protected]>\n#         Peter Prettenhofer <[email protected]>\n#         Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n                    format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n    'alt.atheism',\n    'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n    ('vect', CountVectorizer()),\n    ('tfidf', TfidfTransformer()),\n    ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n    'vect__max_df': (0.5, 0.75, 1.0),\n    #'vect__max_features': (None, 5000, 10000, 50000),\n    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams\n    #'tfidf__use_idf': (True, False),\n    #'tfidf__norm': ('l1', 'l2'),\n    'clf__max_iter': (5,),\n    'clf__alpha': (0.00001, 0.000001),\n    'clf__penalty': ('l2', 'elasticnet'),\n    #'clf__n_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n    # multiprocessing requires the fork to happen in a __main__ protected\n    # block\n\n    # find the best parameters for both the feature extraction and the\n    # classifier\n    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\n    print(\"Performing grid search...\")\n    print(\"pipeline:\", [name for name, _ in pipeline.steps])\n    print(\"parameters:\")\n    pprint(parameters)\n    t0 = time()\n    grid_search.fit(data.data, data.target)\n    print(\"done in %0.3fs\" % (time() - t0))\n    print()\n\n    print(\"Best score: %0.3f\" % grid_search.best_score_)\n    print(\"Best parameters set:\")\n    best_parameters = grid_search.best_estimator_.get_params()\n    for param_name in sorted(parameters.keys()):\n        print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
       ]
     }
   ],
 
@@ -101,6 +101,7 @@
     'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
     #'tfidf__use_idf': (True, False),
     #'tfidf__norm': ('l1', 'l2'),
+    'clf__max_iter': (5,),
     'clf__alpha': (0.00001, 0.000001),
     'clf__penalty': ('l2', 'elasticnet'),
     #'clf__n_iter': (10, 50, 80),
 
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "print(__doc__)\n\n# Author: Eustache Diemert <[email protected]>\n# License: BSD 3 clause\n\nimport time\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.axes_grid1.parasite_axes import host_subplot\nfrom mpl_toolkits.axisartist.axislines import Axes\nfrom scipy.sparse.csr import csr_matrix\n\nfrom sklearn import datasets\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm.classes import NuSVR\nfrom sklearn.ensemble.gradient_boosting import GradientBoostingRegressor\nfrom sklearn.linear_model.stochastic_gradient import SGDClassifier\nfrom sklearn.metrics import hamming_loss\n\n# #############################################################################\n# Routines\n\n\n# Initialize random generator\nnp.random.seed(0)\n\n\ndef generate_data(case, sparse=False):\n    \"\"\"Generate regression/classification data.\"\"\"\n    bunch = None\n    if case == 'regression':\n        bunch = datasets.load_boston()\n    elif case == 'classification':\n        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')\n    X, y = shuffle(bunch.data, bunch.target)\n    offset = int(X.shape[0] * 0.8)\n    X_train, y_train = X[:offset], y[:offset]\n    X_test, y_test = X[offset:], y[offset:]\n    if sparse:\n        X_train = csr_matrix(X_train)\n        X_test = csr_matrix(X_test)\n    else:\n        X_train = np.array(X_train)\n        X_test = np.array(X_test)\n    y_test = np.array(y_test)\n    y_train = np.array(y_train)\n    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,\n            'y_test': y_test}\n    return data\n\n\ndef benchmark_influence(conf):\n    \"\"\"\n    Benchmark influence of :changing_param: on both MSE and latency.\n    \"\"\"\n    prediction_times = []\n    prediction_powers = []\n    complexities = []\n    for param_value in conf['changing_param_values']:\n        conf['tuned_params'][conf['changing_param']] = param_value\n        estimator = conf['estimator'](**conf['tuned_params'])\n        print(\"Benchmarking %s\" % estimator)\n        estimator.fit(conf['data']['X_train'], conf['data']['y_train'])\n        conf['postfit_hook'](estimator)\n        complexity = conf['complexity_computer'](estimator)\n        complexities.append(complexity)\n        start_time = time.time()\n        for _ in range(conf['n_samples']):\n            y_pred = estimator.predict(conf['data']['X_test'])\n        elapsed_time = (time.time() - start_time) / float(conf['n_samples'])\n        prediction_times.append(elapsed_time)\n        pred_score = conf['prediction_performance_computer'](\n            conf['data']['y_test'], y_pred)\n        prediction_powers.append(pred_score)\n        print(\"Complexity: %d | %s: %.4f | Pred. Time: %fs\\n\" % (\n            complexity, conf['prediction_performance_label'], pred_score,\n            elapsed_time))\n    return prediction_powers, prediction_times, complexities\n\n\ndef plot_influence(conf, mse_values, prediction_times, complexities):\n    \"\"\"\n    Plot influence of model complexity on both accuracy and latency.\n    \"\"\"\n    plt.figure(figsize=(12, 6))\n    host = host_subplot(111, axes_class=Axes)\n    plt.subplots_adjust(right=0.75)\n    par1 = host.twinx()\n    host.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])\n    y1_label = conf['prediction_performance_label']\n    y2_label = \"Time (s)\"\n    host.set_ylabel(y1_label)\n    par1.set_ylabel(y2_label)\n    p1, = host.plot(complexities, mse_values, 'b-', label=\"prediction error\")\n    p2, = par1.plot(complexities, prediction_times, 'r-',\n                    label=\"latency\")\n    host.legend(loc='upper right')\n    host.axis[\"left\"].label.set_color(p1.get_color())\n    par1.axis[\"right\"].label.set_color(p2.get_color())\n    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)\n    plt.show()\n\n\ndef _count_nonzero_coefficients(estimator):\n    a = estimator.coef_.toarray()\n    return np.count_nonzero(a)\n\n# #############################################################################\n# Main code\nregression_data = generate_data('regression')\nclassification_data = generate_data('classification', sparse=True)\nconfigurations = [\n    {'estimator': SGDClassifier,\n     'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':\n                      'modified_huber', 'fit_intercept': True},\n     'changing_param': 'l1_ratio',\n     'changing_param_values': [0.25, 0.5, 0.75, 0.9],\n     'complexity_label': 'non_zero coefficients',\n     'complexity_computer': _count_nonzero_coefficients,\n     'prediction_performance_computer': hamming_loss,\n     'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)',\n     'postfit_hook': lambda x: x.sparsify(),\n     'data': classification_data,\n     'n_samples': 30},\n    {'estimator': NuSVR,\n     'tuned_params': {'C': 1e3, 'gamma': 2 ** -15},\n     'changing_param': 'nu',\n     'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9],\n     'complexity_label': 'n_support_vectors',\n     'complexity_computer': lambda x: len(x.support_vectors_),\n     'data': regression_data,\n     'postfit_hook': lambda x: x,\n     'prediction_performance_computer': mean_squared_error,\n     'prediction_performance_label': 'MSE',\n     'n_samples': 30},\n    {'estimator': GradientBoostingRegressor,\n     'tuned_params': {'loss': 'ls'},\n     'changing_param': 'n_estimators',\n     'changing_param_values': [10, 50, 100, 200, 500],\n     'complexity_label': 'n_trees',\n     'complexity_computer': lambda x: x.n_estimators,\n     'data': regression_data,\n     'postfit_hook': lambda x: x,\n     'prediction_performance_computer': mean_squared_error,\n     'prediction_performance_label': 'MSE',\n     'n_samples': 30},\n]\nfor conf in configurations:\n    prediction_performances, prediction_times, complexities = \\\n        benchmark_influence(conf)\n    plot_influence(conf, prediction_performances, prediction_times,\n                   complexities)"
+        "print(__doc__)\n\n# Author: Eustache Diemert <[email protected]>\n# License: BSD 3 clause\n\nimport time\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.axes_grid1.parasite_axes import host_subplot\nfrom mpl_toolkits.axisartist.axislines import Axes\nfrom scipy.sparse.csr import csr_matrix\n\nfrom sklearn import datasets\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm.classes import NuSVR\nfrom sklearn.ensemble.gradient_boosting import GradientBoostingRegressor\nfrom sklearn.linear_model.stochastic_gradient import SGDClassifier\nfrom sklearn.metrics import hamming_loss\n\n# #############################################################################\n# Routines\n\n\n# Initialize random generator\nnp.random.seed(0)\n\n\ndef generate_data(case, sparse=False):\n    \"\"\"Generate regression/classification data.\"\"\"\n    bunch = None\n    if case == 'regression':\n        bunch = datasets.load_boston()\n    elif case == 'classification':\n        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')\n    X, y = shuffle(bunch.data, bunch.target)\n    offset = int(X.shape[0] * 0.8)\n    X_train, y_train = X[:offset], y[:offset]\n    X_test, y_test = X[offset:], y[offset:]\n    if sparse:\n        X_train = csr_matrix(X_train)\n        X_test = csr_matrix(X_test)\n    else:\n        X_train = np.array(X_train)\n        X_test = np.array(X_test)\n    y_test = np.array(y_test)\n    y_train = np.array(y_train)\n    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,\n            'y_test': y_test}\n    return data\n\n\ndef benchmark_influence(conf):\n    \"\"\"\n    Benchmark influence of :changing_param: on both MSE and latency.\n    \"\"\"\n    prediction_times = []\n    prediction_powers = []\n    complexities = []\n    for param_value in conf['changing_param_values']:\n        conf['tuned_params'][conf['changing_param']] = param_value\n        estimator = conf['estimator'](**conf['tuned_params'])\n        print(\"Benchmarking %s\" % estimator)\n        estimator.fit(conf['data']['X_train'], conf['data']['y_train'])\n        conf['postfit_hook'](estimator)\n        complexity = conf['complexity_computer'](estimator)\n        complexities.append(complexity)\n        start_time = time.time()\n        for _ in range(conf['n_samples']):\n            y_pred = estimator.predict(conf['data']['X_test'])\n        elapsed_time = (time.time() - start_time) / float(conf['n_samples'])\n        prediction_times.append(elapsed_time)\n        pred_score = conf['prediction_performance_computer'](\n            conf['data']['y_test'], y_pred)\n        prediction_powers.append(pred_score)\n        print(\"Complexity: %d | %s: %.4f | Pred. Time: %fs\\n\" % (\n            complexity, conf['prediction_performance_label'], pred_score,\n            elapsed_time))\n    return prediction_powers, prediction_times, complexities\n\n\ndef plot_influence(conf, mse_values, prediction_times, complexities):\n    \"\"\"\n    Plot influence of model complexity on both accuracy and latency.\n    \"\"\"\n    plt.figure(figsize=(12, 6))\n    host = host_subplot(111, axes_class=Axes)\n    plt.subplots_adjust(right=0.75)\n    par1 = host.twinx()\n    host.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])\n    y1_label = conf['prediction_performance_label']\n    y2_label = \"Time (s)\"\n    host.set_ylabel(y1_label)\n    par1.set_ylabel(y2_label)\n    p1, = host.plot(complexities, mse_values, 'b-', label=\"prediction error\")\n    p2, = par1.plot(complexities, prediction_times, 'r-',\n                    label=\"latency\")\n    host.legend(loc='upper right')\n    host.axis[\"left\"].label.set_color(p1.get_color())\n    par1.axis[\"right\"].label.set_color(p2.get_color())\n    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)\n    plt.show()\n\n\ndef _count_nonzero_coefficients(estimator):\n    a = estimator.coef_.toarray()\n    return np.count_nonzero(a)\n\n# #############################################################################\n# Main code\nregression_data = generate_data('regression')\nclassification_data = generate_data('classification', sparse=True)\nconfigurations = [\n    {'estimator': SGDClassifier,\n     'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':\n                      'modified_huber', 'fit_intercept': True, 'tol': 1e-3},\n     'changing_param': 'l1_ratio',\n     'changing_param_values': [0.25, 0.5, 0.75, 0.9],\n     'complexity_label': 'non_zero coefficients',\n     'complexity_computer': _count_nonzero_coefficients,\n     'prediction_performance_computer': hamming_loss,\n     'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)',\n     'postfit_hook': lambda x: x.sparsify(),\n     'data': classification_data,\n     'n_samples': 30},\n    {'estimator': NuSVR,\n     'tuned_params': {'C': 1e3, 'gamma': 2 ** -15},\n     'changing_param': 'nu',\n     'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9],\n     'complexity_label': 'n_support_vectors',\n     'complexity_computer': lambda x: len(x.support_vectors_),\n     'data': regression_data,\n     'postfit_hook': lambda x: x,\n     'prediction_performance_computer': mean_squared_error,\n     'prediction_performance_label': 'MSE',\n     'n_samples': 30},\n    {'estimator': GradientBoostingRegressor,\n     'tuned_params': {'loss': 'ls'},\n     'changing_param': 'n_estimators',\n     'changing_param_values': [10, 50, 100, 200, 500],\n     'complexity_label': 'n_trees',\n     'complexity_computer': lambda x: x.n_estimators,\n     'data': regression_data,\n     'postfit_hook': lambda x: x,\n     'prediction_performance_computer': mean_squared_error,\n     'prediction_performance_label': 'MSE',\n     'n_samples': 30},\n]\nfor conf in configurations:\n    prediction_performances, prediction_times, complexities = \\\n        benchmark_influence(conf)\n    plot_influence(conf, prediction_performances, prediction_times,\n                   complexities)"
       ]
     }
   ],
 
@@ -129,7 +129,7 @@ def _count_nonzero_coefficients(estimator):
 configurations = [
     {'estimator': SGDClassifier,
      'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':
-                      'modified_huber', 'fit_intercept': True},
+                      'modified_huber', 'fit_intercept': True, 'tol': 1e-3},
      'changing_param': 'l1_ratio',
      'changing_param_values': [0.25, 0.5, 0.75, 0.9],
      'complexity_label': 'non_zero coefficients',
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@`
`26`	`26`	`},`
`27`	`27`	`"outputs": [],`
`28`	`28`	`"source": [`
`29`		- "# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n #'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n #'tfidf__use_idf': (True, False),\n #'tfidf__norm': ('l1', 'l2'),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n #'clf__n_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
	`29`	+ "# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n #'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n #'tfidf__use_idf': (True, False),\n #'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (5,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n #'clf__n_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
`30`	`30`	`]`
`31`	`31`	`}`
`32`	`32`	`],`