
Commit ca19e33

Pushing the docs to dev/ for branch: master, commit 0b8d362545642ac326799e420712b49fac21c410
Parent: cd1a13b

1,133 files changed: 3286 additions, 3295 deletions.

Two binary files changed (34 Bytes and 37 Bytes); binary content not shown.

dev/_downloads/grid_search_text_feature_extraction.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-
"\n# Sample pipeline for text feature extraction and evaluation\n\n\nThe dataset used in this example is the 20 newsgroups dataset which will be\nautomatically downloaded and then cached and reused for the document\nclassification example.\n\nYou can adjust the number of categories by giving their names to the dataset\nloader or setting them to None to get the 20 of them.\n\nHere is a sample output of a run on a quad-core machine::\n\n Loading 20 newsgroups dataset for categories:\n ['alt.atheism', 'talk.religion.misc']\n 1427 documents\n 2 categories\n\n Performing grid search...\n pipeline: ['vect', 'tfidf', 'clf']\n parameters:\n {'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),\n 'clf__n_iter': (10, 50, 80),\n 'clf__penalty': ('l2', 'elasticnet'),\n 'tfidf__use_idf': (True, False),\n 'vect__max_n': (1, 2),\n 'vect__max_df': (0.5, 0.75, 1.0),\n 'vect__max_features': (None, 5000, 10000, 50000)}\n done in 1737.030s\n\n Best score: 0.940\n Best parameters set:\n clf__alpha: 9.9999999999999995e-07\n clf__n_iter: 50\n clf__penalty: 'elasticnet'\n tfidf__use_idf: True\n vect__max_n: 2\n vect__max_df: 0.75\n vect__max_features: 50000\n\n\n"
+
"\n# Sample pipeline for text feature extraction and evaluation\n\n\nThe dataset used in this example is the 20 newsgroups dataset which will be\nautomatically downloaded and then cached and reused for the document\nclassification example.\n\nYou can adjust the number of categories by giving their names to the dataset\nloader or setting them to None to get the 20 of them.\n\nHere is a sample output of a run on a quad-core machine::\n\n Loading 20 newsgroups dataset for categories:\n ['alt.atheism', 'talk.religion.misc']\n 1427 documents\n 2 categories\n\n Performing grid search...\n pipeline: ['vect', 'tfidf', 'clf']\n parameters:\n {'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),\n 'clf__max_iter': (10, 50, 80),\n 'clf__penalty': ('l2', 'elasticnet'),\n 'tfidf__use_idf': (True, False),\n 'vect__max_n': (1, 2),\n 'vect__max_df': (0.5, 0.75, 1.0),\n 'vect__max_features': (None, 5000, 10000, 50000)}\n done in 1737.030s\n\n Best score: 0.940\n Best parameters set:\n clf__alpha: 9.9999999999999995e-07\n clf__max_iter: 50\n clf__penalty: 'elasticnet'\n tfidf__use_idf: True\n vect__max_n: 2\n vect__max_df: 0.75\n vect__max_features: 50000\n\n\n"
 ]
 },
 {
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n #'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n #'tfidf__use_idf': (True, False),\n #'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (5,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n #'clf__n_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
+
"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (5,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n # 'clf__max_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
 ]
 }
 ],
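
For context on the key being renamed in this notebook: GridSearchCV addresses the parameters of a Pipeline step through the '<step name>__<parameter>' convention, so 'clf__max_iter' tunes max_iter on the 'clf' step (the SGDClassifier). The short Python sketch below is an illustration added for this summary, not part of the commit; the exact names it prints depend on the installed scikit-learn version.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# get_params() lists every grid-searchable key, e.g. 'clf__alpha' and
# 'clf__max_iter'; a key the estimator no longer accepts (such as
# 'clf__n_iter' after its removal) makes GridSearchCV fail at fit time.
for name in sorted(pipeline.get_params()):
    if name.startswith('clf__'):
        print(name)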

dev/_downloads/grid_search_text_feature_extraction.py

Lines changed: 6 additions & 6 deletions
@@ -22,7 +22,7 @@
 pipeline: ['vect', 'tfidf', 'clf']
 parameters:
 {'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),
-'clf__n_iter': (10, 50, 80),
+'clf__max_iter': (10, 50, 80),
 'clf__penalty': ('l2', 'elasticnet'),
 'tfidf__use_idf': (True, False),
 'vect__max_n': (1, 2),
@@ -33,7 +33,7 @@
 Best score: 0.940
 Best parameters set:
 clf__alpha: 9.9999999999999995e-07
-clf__n_iter: 50
+clf__max_iter: 50
 clf__penalty: 'elasticnet'
 tfidf__use_idf: True
 vect__max_n: 2
@@ -97,14 +97,14 @@
 # increase processing time in a combinatorial way
 parameters = {
 'vect__max_df': (0.5, 0.75, 1.0),
-#'vect__max_features': (None, 5000, 10000, 50000),
+# 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
-#'tfidf__use_idf': (True, False),
-#'tfidf__norm': ('l1', 'l2'),
+# 'tfidf__use_idf': (True, False),
+# 'tfidf__norm': ('l1', 'l2'),
 'clf__max_iter': (5,),
 'clf__alpha': (0.00001, 0.000001),
 'clf__penalty': ('l2', 'elasticnet'),
-#'clf__n_iter': (10, 50, 80),
+# 'clf__max_iter': (10, 50, 80),
 }

 if __name__ == "__main__":
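
Both grid-search files track the same upstream change: SGDClassifier's old n_iter keyword was replaced by max_iter, paired with a tol stopping criterion. Below is a minimal sketch of the rename at the estimator level, assuming a scikit-learn release where max_iter is available (illustration only, not part of this commit):

from sklearn.linear_model import SGDClassifier

# Old spelling (deprecated, later removed): SGDClassifier(n_iter=50)
# Current spelling: an explicit epoch cap plus a convergence tolerance.
clf = SGDClassifier(max_iter=50, tol=1e-3)
print(clf.get_params()['max_iter'])  # 50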

dev/_downloads/plot_classification_probability.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"print(__doc__)\n\n# Author: Alexandre Gramfort <[email protected]>\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.svm import SVC\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.gaussian_process.kernels import RBF\nfrom sklearn import datasets\n\niris = datasets.load_iris()\nX = iris.data[:, 0:2] # we only take the first two features for visualization\ny = iris.target\n\nn_features = X.shape[1]\n\nC = 1.0\nkernel = 1.0 * RBF([1.0, 1.0]) # for GPC\n\n# Create different classifiers. The logistic regression cannot do\n# multiclass out of the box.\nclassifiers = {'L1 logistic': LogisticRegression(C=C, penalty='l1'),\n 'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2'),\n 'Linear SVC': SVC(kernel='linear', C=C, probability=True,\n random_state=0),\n 'L2 logistic (Multinomial)': LogisticRegression(\n C=C, solver='lbfgs', multi_class='multinomial'),\n 'GPC': GaussianProcessClassifier(kernel)\n }\n\nn_classifiers = len(classifiers)\n\nplt.figure(figsize=(3 * 2, n_classifiers * 2))\nplt.subplots_adjust(bottom=.2, top=.95)\n\nxx = np.linspace(3, 9, 100)\nyy = np.linspace(1, 5, 100).T\nxx, yy = np.meshgrid(xx, yy)\nXfull = np.c_[xx.ravel(), yy.ravel()]\n\nfor index, (name, classifier) in enumerate(classifiers.items()):\n classifier.fit(X, y)\n\n y_pred = classifier.predict(X)\n classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100\n print(\"classif_rate for %s : %f \" % (name, classif_rate))\n\n # View probabilities=\n probas = classifier.predict_proba(Xfull)\n n_classes = np.unique(y_pred).size\n for k in range(n_classes):\n plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)\n plt.title(\"Class %d\" % k)\n if k == 0:\n plt.ylabel(name)\n imshow_handle = plt.imshow(probas[:, k].reshape((100, 100)),\n extent=(3, 9, 1, 5), origin='lower')\n plt.xticks(())\n plt.yticks(())\n idx = (y_pred == k)\n if idx.any():\n plt.scatter(X[idx, 0], X[idx, 1], marker='o', c='k')\n\nax = plt.axes([0.15, 0.04, 0.7, 0.05])\nplt.title(\"Probability\")\nplt.colorbar(imshow_handle, cax=ax, orientation='horizontal')\n\nplt.show()"
+
"print(__doc__)\n\n# Author: Alexandre Gramfort <[email protected]>\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.svm import SVC\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.gaussian_process.kernels import RBF\nfrom sklearn import datasets\n\niris = datasets.load_iris()\nX = iris.data[:, 0:2] # we only take the first two features for visualization\ny = iris.target\n\nn_features = X.shape[1]\n\nC = 1.0\nkernel = 1.0 * RBF([1.0, 1.0]) # for GPC\n\n# Create different classifiers. The logistic regression cannot do\n# multiclass out of the box.\nclassifiers = {'L1 logistic': LogisticRegression(C=C, penalty='l1'),\n 'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2'),\n 'Linear SVC': SVC(kernel='linear', C=C, probability=True,\n random_state=0),\n 'L2 logistic (Multinomial)': LogisticRegression(\n C=C, solver='lbfgs', multi_class='multinomial'),\n 'GPC': GaussianProcessClassifier(kernel)\n }\n\nn_classifiers = len(classifiers)\n\nplt.figure(figsize=(3 * 2, n_classifiers * 2))\nplt.subplots_adjust(bottom=.2, top=.95)\n\nxx = np.linspace(3, 9, 100)\nyy = np.linspace(1, 5, 100).T\nxx, yy = np.meshgrid(xx, yy)\nXfull = np.c_[xx.ravel(), yy.ravel()]\n\nfor index, (name, classifier) in enumerate(classifiers.items()):\n classifier.fit(X, y)\n\n y_pred = classifier.predict(X)\n classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100\n print(\"classif_rate for %s : %f \" % (name, classif_rate))\n\n # View probabilities=\n probas = classifier.predict_proba(Xfull)\n n_classes = np.unique(y_pred).size\n for k in range(n_classes):\n plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)\n plt.title(\"Class %d\" % k)\n if k == 0:\n plt.ylabel(name)\n imshow_handle = plt.imshow(probas[:, k].reshape((100, 100)),\n extent=(3, 9, 1, 5), origin='lower')\n plt.xticks(())\n plt.yticks(())\n idx = (y_pred == k)\n if idx.any():\n plt.scatter(X[idx, 0], X[idx, 1], marker='o', c='w', edgecolor='k')\n\nax = plt.axes([0.15, 0.04, 0.7, 0.05])\nplt.title(\"Probability\")\nplt.colorbar(imshow_handle, cax=ax, orientation='horizontal')\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/plot_classification_probability.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ class dataset, and we classify it with a Support Vector classifier, L1
 plt.yticks(())
 idx = (y_pred == k)
 if idx.any():
-plt.scatter(X[idx, 0], X[idx, 1], marker='o', c='k')
+plt.scatter(X[idx, 0], X[idx, 1], marker='o', c='w', edgecolor='k')

 ax = plt.axes([0.15, 0.04, 0.7, 0.05])
 plt.title("Probability")
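
The only change in both plot_classification_probability files is the scatter styling: solid black markers (c='k') become white-filled markers with a black edge (c='w', edgecolor='k'), which keeps the points visible on top of the imshow probability maps. The standalone sketch below reproduces that contrast on synthetic data; it is an illustration, not part of the commit.

import matplotlib.pyplot as plt
import numpy as np

rng = np.random.RandomState(0)
background = rng.rand(100, 100)          # stand-in for a probability map
pts = rng.uniform(0, 100, size=(30, 2))  # stand-in for classified samples

fig, (ax_old, ax_new) = plt.subplots(1, 2, figsize=(8, 4))
for ax, title, style in [
        (ax_old, "c='k'", dict(c='k')),
        (ax_new, "c='w', edgecolor='k'", dict(c='w', edgecolor='k'))]:
    ax.imshow(background, extent=(0, 100, 0, 100), origin='lower')
    ax.scatter(pts[:, 0], pts[:, 1], marker='o', **style)
    ax.set_title(title)
plt.show()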
