Commit d99cb78

Pushing the docs to dev/ for branch: master, commit 62d205980446a1abc1065f4332fd74eee57fcf73
1 parent f41593e commit d99cb78

1,106 files changed: +3241 additions, -3425 deletions

-1.41 KB: Binary file not shown.
-1.18 KB: Binary file not shown.

dev/_downloads/grid_search_text_feature_extraction.ipynb

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier(tol=1e-3)),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (20,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n # 'clf__max_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, cv=5,\n n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
+"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier(tol=1e-3)),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (20,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n # 'clf__max_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, cv=5,\n n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
 ]
 }
 ],

dev/_downloads/grid_search_text_feature_extraction.py

Lines changed: 0 additions & 3 deletions

@@ -46,9 +46,6 @@
 # Peter Prettenhofer <[email protected]>
 # Mathieu Blondel <[email protected]>
 # License: BSD 3 clause
-
-from __future__ import print_function
-
 from pprint import pprint
 from time import time
 import logging
dev/_downloads/plot_all_scaling.ipynb

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Raghav RV <[email protected]>\n# Guillaume Lemaitre <[email protected]>\n# Thomas Unterthiner\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport numpy as np\n\nimport matplotlib as mpl\nfrom matplotlib import pyplot as plt\nfrom matplotlib import cm\n\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.preprocessing import minmax_scale\nfrom sklearn.preprocessing import MaxAbsScaler\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import RobustScaler\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn.preprocessing import QuantileTransformer\nfrom sklearn.preprocessing import PowerTransformer\n\nfrom sklearn.datasets import fetch_california_housing\n\nprint(__doc__)\n\ndataset = fetch_california_housing()\nX_full, y_full = dataset.data, dataset.target\n\n# Take only 2 features to make visualization easier\n# Feature of 0 has a long tail distribution.\n# Feature 5 has a few but very large outliers.\n\nX = X_full[:, [0, 5]]\n\ndistributions = [\n ('Unscaled data', X),\n ('Data after standard scaling',\n StandardScaler().fit_transform(X)),\n ('Data after min-max scaling',\n MinMaxScaler().fit_transform(X)),\n ('Data after max-abs scaling',\n MaxAbsScaler().fit_transform(X)),\n ('Data after robust scaling',\n RobustScaler(quantile_range=(25, 75)).fit_transform(X)),\n ('Data after power transformation (Yeo-Johnson)',\n PowerTransformer(method='yeo-johnson').fit_transform(X)),\n ('Data after power transformation (Box-Cox)',\n PowerTransformer(method='box-cox').fit_transform(X)),\n ('Data after quantile transformation (gaussian pdf)',\n QuantileTransformer(output_distribution='normal')\n .fit_transform(X)),\n ('Data after quantile transformation (uniform pdf)',\n QuantileTransformer(output_distribution='uniform')\n .fit_transform(X)),\n ('Data after sample-wise L2 normalizing',\n Normalizer().fit_transform(X)),\n]\n\n# scale the output between 0 and 1 for the colorbar\ny = minmax_scale(y_full)\n\n# plasma does not exist in matplotlib < 1.5\ncmap = getattr(cm, 'plasma_r', cm.hot_r)\n\ndef create_axes(title, figsize=(16, 6)):\n fig = plt.figure(figsize=figsize)\n fig.suptitle(title)\n\n # define the axis for the first plot\n left, width = 0.1, 0.22\n bottom, height = 0.1, 0.7\n bottom_h = height + 0.15\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter = plt.axes(rect_scatter)\n ax_histx = plt.axes(rect_histx)\n ax_histy = plt.axes(rect_histy)\n\n # define the axis for the zoomed-in plot\n left = width + left + 0.2\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter_zoom = plt.axes(rect_scatter)\n ax_histx_zoom = plt.axes(rect_histx)\n ax_histy_zoom = plt.axes(rect_histy)\n\n # define the axis for the colorbar\n left, width = width + left + 0.13, 0.01\n\n rect_colorbar = [left, bottom, width, height]\n ax_colorbar = plt.axes(rect_colorbar)\n\n return ((ax_scatter, ax_histy, ax_histx),\n (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),\n ax_colorbar)\n\n\ndef plot_distribution(axes, X, y, hist_nbins=50, title=\"\",\n x0_label=\"\", x1_label=\"\"):\n ax, hist_X1, hist_X0 = axes\n\n ax.set_title(title)\n ax.set_xlabel(x0_label)\n ax.set_ylabel(x1_label)\n\n # The scatter plot\n colors = cmap(y)\n ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors)\n\n # Removing the top and the right spine for aesthetics\n # make nice axis layout\n ax.spines['top'].set_visible(False)\n ax.spines['right'].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n ax.spines['left'].set_position(('outward', 10))\n ax.spines['bottom'].set_position(('outward', 10))\n\n # Histogram for axis X1 (feature 5)\n hist_X1.set_ylim(ax.get_ylim())\n hist_X1.hist(X[:, 1], bins=hist_nbins, orientation='horizontal',\n color='grey', ec='grey')\n hist_X1.axis('off')\n\n # Histogram for axis X0 (feature 0)\n hist_X0.set_xlim(ax.get_xlim())\n hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical',\n color='grey', ec='grey')\n hist_X0.axis('off')"
+"# Author: Raghav RV <[email protected]>\n# Guillaume Lemaitre <[email protected]>\n# Thomas Unterthiner\n# License: BSD 3 clause\n\nimport numpy as np\n\nimport matplotlib as mpl\nfrom matplotlib import pyplot as plt\nfrom matplotlib import cm\n\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.preprocessing import minmax_scale\nfrom sklearn.preprocessing import MaxAbsScaler\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import RobustScaler\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn.preprocessing import QuantileTransformer\nfrom sklearn.preprocessing import PowerTransformer\n\nfrom sklearn.datasets import fetch_california_housing\n\nprint(__doc__)\n\ndataset = fetch_california_housing()\nX_full, y_full = dataset.data, dataset.target\n\n# Take only 2 features to make visualization easier\n# Feature of 0 has a long tail distribution.\n# Feature 5 has a few but very large outliers.\n\nX = X_full[:, [0, 5]]\n\ndistributions = [\n ('Unscaled data', X),\n ('Data after standard scaling',\n StandardScaler().fit_transform(X)),\n ('Data after min-max scaling',\n MinMaxScaler().fit_transform(X)),\n ('Data after max-abs scaling',\n MaxAbsScaler().fit_transform(X)),\n ('Data after robust scaling',\n RobustScaler(quantile_range=(25, 75)).fit_transform(X)),\n ('Data after power transformation (Yeo-Johnson)',\n PowerTransformer(method='yeo-johnson').fit_transform(X)),\n ('Data after power transformation (Box-Cox)',\n PowerTransformer(method='box-cox').fit_transform(X)),\n ('Data after quantile transformation (gaussian pdf)',\n QuantileTransformer(output_distribution='normal')\n .fit_transform(X)),\n ('Data after quantile transformation (uniform pdf)',\n QuantileTransformer(output_distribution='uniform')\n .fit_transform(X)),\n ('Data after sample-wise L2 normalizing',\n Normalizer().fit_transform(X)),\n]\n\n# scale the output between 0 and 1 for the colorbar\ny = minmax_scale(y_full)\n\n# plasma does not exist in matplotlib < 1.5\ncmap = getattr(cm, 'plasma_r', cm.hot_r)\n\ndef create_axes(title, figsize=(16, 6)):\n fig = plt.figure(figsize=figsize)\n fig.suptitle(title)\n\n # define the axis for the first plot\n left, width = 0.1, 0.22\n bottom, height = 0.1, 0.7\n bottom_h = height + 0.15\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter = plt.axes(rect_scatter)\n ax_histx = plt.axes(rect_histx)\n ax_histy = plt.axes(rect_histy)\n\n # define the axis for the zoomed-in plot\n left = width + left + 0.2\n left_h = left + width + 0.02\n\n rect_scatter = [left, bottom, width, height]\n rect_histx = [left, bottom_h, width, 0.1]\n rect_histy = [left_h, bottom, 0.05, height]\n\n ax_scatter_zoom = plt.axes(rect_scatter)\n ax_histx_zoom = plt.axes(rect_histx)\n ax_histy_zoom = plt.axes(rect_histy)\n\n # define the axis for the colorbar\n left, width = width + left + 0.13, 0.01\n\n rect_colorbar = [left, bottom, width, height]\n ax_colorbar = plt.axes(rect_colorbar)\n\n return ((ax_scatter, ax_histy, ax_histx),\n (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),\n ax_colorbar)\n\n\ndef plot_distribution(axes, X, y, hist_nbins=50, title=\"\",\n x0_label=\"\", x1_label=\"\"):\n ax, hist_X1, hist_X0 = axes\n\n ax.set_title(title)\n ax.set_xlabel(x0_label)\n ax.set_ylabel(x1_label)\n\n # The scatter plot\n colors = cmap(y)\n ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors)\n\n # Removing the top and the right spine for aesthetics\n # make nice axis layout\n ax.spines['top'].set_visible(False)\n ax.spines['right'].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n ax.spines['left'].set_position(('outward', 10))\n ax.spines['bottom'].set_position(('outward', 10))\n\n # Histogram for axis X1 (feature 5)\n hist_X1.set_ylim(ax.get_ylim())\n hist_X1.hist(X[:, 1], bins=hist_nbins, orientation='horizontal',\n color='grey', ec='grey')\n hist_X1.axis('off')\n\n # Histogram for axis X0 (feature 0)\n hist_X0.set_xlim(ax.get_xlim())\n hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical',\n color='grey', ec='grey')\n hist_X0.axis('off')"
 ]
 },
 {
dev/_downloads/plot_all_scaling.py

Lines changed: 0 additions & 2 deletions

@@ -47,8 +47,6 @@
 # Thomas Unterthiner
 # License: BSD 3 clause
 
-from __future__ import print_function
-
 import numpy as np
 
 import matplotlib as mpl
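
The same two-line deletion recurs across the 1,106 changed files. As a hedged sketch (the helper name and the dev/_downloads scan below are hypothetical, not part of this commit), any leftover occurrences could be found with the standard-library ast module:

# Hypothetical checker, not from this commit: list example scripts that
# still carry the now-redundant print_function import.
import ast
import pathlib


def imports_print_function(source):
    # True if the module contains `from __future__ import print_function`.
    tree = ast.parse(source)
    return any(
        isinstance(node, ast.ImportFrom)
        and node.module == "__future__"
        and any(alias.name == "print_function" for alias in node.names)
        for node in ast.walk(tree)
    )


for py_file in pathlib.Path("dev/_downloads").rglob("*.py"):
    if imports_print_function(py_file.read_text(encoding="utf-8")):
        print(py_file)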

0 commit comments