Skip to content

Commit 461e297

Browse files
committed
Pushing the docs to dev/ for branch: main, commit 3a428086ddb4485f5c21af3b3378dd1f659448e5
1 parent cf9ab2e commit 461e297

File tree

1,232 files changed

+4586
-4487
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,232 files changed

+4586
-4487
lines changed
Binary file not shown.

dev/_downloads/2686c9a8c33b1b0159cc05f207d65b4c/grid_search_text_feature_extraction.ipynb

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,43 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(message)s\")\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n \"alt.atheism\",\n \"talk.religion.misc\",\n]\n# Uncomment the following to do the analysis on all the categories\n# categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset=\"train\", categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline(\n [\n (\"vect\", CountVectorizer()),\n (\"tfidf\", TfidfTransformer()),\n (\"clf\", SGDClassifier()),\n ]\n)\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n \"vect__max_df\": (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n \"vect__ngram_range\": ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n \"clf__max_iter\": (20,),\n \"clf__alpha\": (0.00001, 0.000001),\n \"clf__penalty\": (\"l2\", \"elasticnet\"),\n # 'clf__max_iter': 
(10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
29+
"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause"
30+
]
31+
},
32+
{
33+
"cell_type": "markdown",
34+
"metadata": {},
35+
"source": [
36+
"## Data loading\n\n"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": null,
42+
"metadata": {
43+
"collapsed": false
44+
},
45+
"outputs": [],
46+
"source": [
47+
"from pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(message)s\")\n\n# Load some categories from the training set\ncategories = [\n \"alt.atheism\",\n \"talk.religion.misc\",\n]\n\n# Uncomment the following to do the analysis on all the categories\n# categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset=\"train\", categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()"
48+
]
49+
},
50+
{
51+
"cell_type": "markdown",
52+
"metadata": {},
53+
"source": [
54+
"## Pipeline with hyperparameter tuning\n\n"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": null,
60+
"metadata": {
61+
"collapsed": false
62+
},
63+
"outputs": [],
64+
"source": [
65+
"# Define a pipeline combining a text feature extractor with a simple classifier\npipeline = Pipeline(\n [\n (\"vect\", CountVectorizer()),\n (\"tfidf\", TfidfTransformer()),\n (\"clf\", SGDClassifier()),\n ]\n)\n\n# Parameters to use for grid search. Uncommenting more parameters will give\n# better exploring power but will increase processing time in a combinatorial\n# way\nparameters = {\n \"vect__max_df\": (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n \"vect__ngram_range\": ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n \"clf__max_iter\": (20,),\n \"clf__alpha\": (0.00001, 0.000001),\n \"clf__penalty\": (\"l2\", \"elasticnet\"),\n # 'clf__max_iter': (10, 50, 80),\n}\n\n# Find the best parameters for both the feature extraction and the\n# classifier\ngrid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\nprint(\"Performing grid search...\")\nprint(\"pipeline:\", [name for name, _ in pipeline.steps])\nprint(\"parameters:\")\npprint(parameters)\nt0 = time()\ngrid_search.fit(data.data, data.target)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint()\n\nprint(\"Best score: %0.3f\" % grid_search.best_score_)\nprint(\"Best parameters set:\")\nbest_parameters = grid_search.best_estimator_.get_params()\nfor param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
3066
]
3167
}
3268
],

dev/_downloads/6a71771766f7ff51a9ac596ae0439d01/grid_search_text_feature_extraction.py

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@
4545
# Peter Prettenhofer <[email protected]>
4646
# Mathieu Blondel <[email protected]>
4747
# License: BSD 3 clause
48+
49+
# %%
50+
# Data loading
51+
# ------------
52+
4853
from pprint import pprint
4954
from time import time
5055
import logging
@@ -59,13 +64,12 @@
5964
# Display progress logs on stdout
6065
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
6166

62-
63-
# #############################################################################
6467
# Load some categories from the training set
6568
categories = [
6669
"alt.atheism",
6770
"talk.religion.misc",
6871
]
72+
6973
# Uncomment the following to do the analysis on all the categories
7074
# categories = None
7175

@@ -77,9 +81,11 @@
7781
print("%d categories" % len(data.target_names))
7882
print()
7983

80-
# #############################################################################
81-
# Define a pipeline combining a text feature extractor with a simple
82-
# classifier
84+
# %%
85+
# Pipeline with hyperparameter tuning
86+
# -----------------------------------
87+
88+
# Define a pipeline combining a text feature extractor with a simple classifier
8389
pipeline = Pipeline(
8490
[
8591
("vect", CountVectorizer()),
@@ -88,8 +94,9 @@
8894
]
8995
)
9096

91-
# uncommenting more parameters will give better exploring power but will
92-
# increase processing time in a combinatorial way
97+
# Parameters to use for grid search. Uncommenting more parameters will give
98+
# better exploring power but will increase processing time in a combinatorial
99+
# way
93100
parameters = {
94101
"vect__max_df": (0.5, 0.75, 1.0),
95102
# 'vect__max_features': (None, 5000, 10000, 50000),
@@ -102,25 +109,21 @@
102109
# 'clf__max_iter': (10, 50, 80),
103110
}
104111

105-
if __name__ == "__main__":
106-
# multiprocessing requires the fork to happen in a __main__ protected
107-
# block
108-
109-
# find the best parameters for both the feature extraction and the
110-
# classifier
111-
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
112-
113-
print("Performing grid search...")
114-
print("pipeline:", [name for name, _ in pipeline.steps])
115-
print("parameters:")
116-
pprint(parameters)
117-
t0 = time()
118-
grid_search.fit(data.data, data.target)
119-
print("done in %0.3fs" % (time() - t0))
120-
print()
121-
122-
print("Best score: %0.3f" % grid_search.best_score_)
123-
print("Best parameters set:")
124-
best_parameters = grid_search.best_estimator_.get_params()
125-
for param_name in sorted(parameters.keys()):
126-
print("\t%s: %r" % (param_name, best_parameters[param_name]))
112+
# Find the best parameters for both the feature extraction and the
113+
# classifier
114+
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
115+
116+
print("Performing grid search...")
117+
print("pipeline:", [name for name, _ in pipeline.steps])
118+
print("parameters:")
119+
pprint(parameters)
120+
t0 = time()
121+
grid_search.fit(data.data, data.target)
122+
print("done in %0.3fs" % (time() - t0))
123+
print()
124+
125+
print("Best score: %0.3f" % grid_search.best_score_)
126+
print("Best parameters set:")
127+
best_parameters = grid_search.best_estimator_.get_params()
128+
for param_name in sorted(parameters.keys()):
129+
print("\t%s: %r" % (param_name, best_parameters[param_name]))
Binary file not shown.

dev/_downloads/scikit-learn-docs.zip

-8.06 KB
Binary file not shown.
-404 Bytes
-160 Bytes
-398 Bytes
-377 Bytes
-133 Bytes

0 commit comments

Comments
 (0)