Skip to content

Commit e426486

Browse files
committed
Pushing the docs to dev/ for branch: master, commit 8890b5060679d34eda522145478eb79ffacb50c3
1 parent a2add95 commit e426486

File tree

1,123 files changed

+3653
-3496
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,123 files changed

+3653
-3496
lines changed
266 Bytes
Binary file not shown.
262 Bytes
Binary file not shown.

dev/_downloads/grid_search_text_feature_extraction.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (5,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n # 'clf__max_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
29+
"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (5,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n # 'clf__max_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, cv=5,\n n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
3030
]
3131
}
3232
],

dev/_downloads/grid_search_text_feature_extraction.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,8 @@
113113

114114
# find the best parameters for both the feature extraction and the
115115
# classifier
116-
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
116+
grid_search = GridSearchCV(pipeline, parameters, cv=5,
117+
n_jobs=-1, verbose=1)
117118

118119
print("Performing grid search...")
119120
print("pipeline:", [name for name, _ in pipeline.steps])

dev/_downloads/plot_covariance_estimation.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import linalg\n\nfrom sklearn.covariance import LedoitWolf, OAS, ShrunkCovariance, \\\n log_likelihood, empirical_covariance\nfrom sklearn.model_selection import GridSearchCV\n\n\n# #############################################################################\n# Generate sample data\nn_features, n_samples = 40, 20\nnp.random.seed(42)\nbase_X_train = np.random.normal(size=(n_samples, n_features))\nbase_X_test = np.random.normal(size=(n_samples, n_features))\n\n# Color samples\ncoloring_matrix = np.random.normal(size=(n_features, n_features))\nX_train = np.dot(base_X_train, coloring_matrix)\nX_test = np.dot(base_X_test, coloring_matrix)\n\n# #############################################################################\n# Compute the likelihood on test data\n\n# spanning a range of possible shrinkage coefficient values\nshrinkages = np.logspace(-2, 0, 30)\nnegative_logliks = [-ShrunkCovariance(shrinkage=s).fit(X_train).score(X_test)\n for s in shrinkages]\n\n# under the ground-truth model, which we would not have access to in real\n# settings\nreal_cov = np.dot(coloring_matrix.T, coloring_matrix)\nemp_cov = empirical_covariance(X_train)\nloglik_real = -log_likelihood(emp_cov, linalg.inv(real_cov))\n\n# #############################################################################\n# Compare different approaches to setting the parameter\n\n# GridSearch for an optimal shrinkage coefficient\ntuned_parameters = [{'shrinkage': shrinkages}]\ncv = GridSearchCV(ShrunkCovariance(), tuned_parameters)\ncv.fit(X_train)\n\n# Ledoit-Wolf optimal shrinkage coefficient estimate\nlw = LedoitWolf()\nloglik_lw = lw.fit(X_train).score(X_test)\n\n# OAS coefficient estimate\noa = OAS()\nloglik_oa = oa.fit(X_train).score(X_test)\n\n# #############################################################################\n# Plot results\nfig = plt.figure()\nplt.title(\"Regularized covariance: likelihood and shrinkage coefficient\")\nplt.xlabel('Regularization parameter: shrinkage coefficient')\nplt.ylabel('Error: negative log-likelihood on test data')\n# range shrinkage curve\nplt.loglog(shrinkages, negative_logliks, label=\"Negative log-likelihood\")\n\nplt.plot(plt.xlim(), 2 * [loglik_real], '--r',\n label=\"Real covariance likelihood\")\n\n# adjust view\nlik_max = np.amax(negative_logliks)\nlik_min = np.amin(negative_logliks)\nymin = lik_min - 6. * np.log((plt.ylim()[1] - plt.ylim()[0]))\nymax = lik_max + 10. * np.log(lik_max - lik_min)\nxmin = shrinkages[0]\nxmax = shrinkages[-1]\n# LW likelihood\nplt.vlines(lw.shrinkage_, ymin, -loglik_lw, color='magenta',\n linewidth=3, label='Ledoit-Wolf estimate')\n# OAS likelihood\nplt.vlines(oa.shrinkage_, ymin, -loglik_oa, color='purple',\n linewidth=3, label='OAS estimate')\n# best CV estimator likelihood\nplt.vlines(cv.best_estimator_.shrinkage, ymin,\n -cv.best_estimator_.score(X_test), color='cyan',\n linewidth=3, label='Cross-validation best estimate')\n\nplt.ylim(ymin, ymax)\nplt.xlim(xmin, xmax)\nplt.legend()\n\nplt.show()"
29+
"print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import linalg\n\nfrom sklearn.covariance import LedoitWolf, OAS, ShrunkCovariance, \\\n log_likelihood, empirical_covariance\nfrom sklearn.model_selection import GridSearchCV\n\n\n# #############################################################################\n# Generate sample data\nn_features, n_samples = 40, 20\nnp.random.seed(42)\nbase_X_train = np.random.normal(size=(n_samples, n_features))\nbase_X_test = np.random.normal(size=(n_samples, n_features))\n\n# Color samples\ncoloring_matrix = np.random.normal(size=(n_features, n_features))\nX_train = np.dot(base_X_train, coloring_matrix)\nX_test = np.dot(base_X_test, coloring_matrix)\n\n# #############################################################################\n# Compute the likelihood on test data\n\n# spanning a range of possible shrinkage coefficient values\nshrinkages = np.logspace(-2, 0, 30)\nnegative_logliks = [-ShrunkCovariance(shrinkage=s).fit(X_train).score(X_test)\n for s in shrinkages]\n\n# under the ground-truth model, which we would not have access to in real\n# settings\nreal_cov = np.dot(coloring_matrix.T, coloring_matrix)\nemp_cov = empirical_covariance(X_train)\nloglik_real = -log_likelihood(emp_cov, linalg.inv(real_cov))\n\n# #############################################################################\n# Compare different approaches to setting the parameter\n\n# GridSearch for an optimal shrinkage coefficient\ntuned_parameters = [{'shrinkage': shrinkages}]\ncv = GridSearchCV(ShrunkCovariance(), tuned_parameters, cv=5)\ncv.fit(X_train)\n\n# Ledoit-Wolf optimal shrinkage coefficient estimate\nlw = LedoitWolf()\nloglik_lw = lw.fit(X_train).score(X_test)\n\n# OAS coefficient estimate\noa = OAS()\nloglik_oa = oa.fit(X_train).score(X_test)\n\n# #############################################################################\n# Plot results\nfig = plt.figure()\nplt.title(\"Regularized covariance: likelihood and shrinkage coefficient\")\nplt.xlabel('Regularization parameter: shrinkage coefficient')\nplt.ylabel('Error: negative log-likelihood on test data')\n# range shrinkage curve\nplt.loglog(shrinkages, negative_logliks, label=\"Negative log-likelihood\")\n\nplt.plot(plt.xlim(), 2 * [loglik_real], '--r',\n label=\"Real covariance likelihood\")\n\n# adjust view\nlik_max = np.amax(negative_logliks)\nlik_min = np.amin(negative_logliks)\nymin = lik_min - 6. * np.log((plt.ylim()[1] - plt.ylim()[0]))\nymax = lik_max + 10. * np.log(lik_max - lik_min)\nxmin = shrinkages[0]\nxmax = shrinkages[-1]\n# LW likelihood\nplt.vlines(lw.shrinkage_, ymin, -loglik_lw, color='magenta',\n linewidth=3, label='Ledoit-Wolf estimate')\n# OAS likelihood\nplt.vlines(oa.shrinkage_, ymin, -loglik_oa, color='purple',\n linewidth=3, label='OAS estimate')\n# best CV estimator likelihood\nplt.vlines(cv.best_estimator_.shrinkage, ymin,\n -cv.best_estimator_.score(X_test), color='cyan',\n linewidth=3, label='Cross-validation best estimate')\n\nplt.ylim(ymin, ymax)\nplt.xlim(xmin, xmax)\nplt.legend()\n\nplt.show()"
3030
]
3131
}
3232
],

dev/_downloads/plot_covariance_estimation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@
8383

8484
# GridSearch for an optimal shrinkage coefficient
8585
tuned_parameters = [{'shrinkage': shrinkages}]
86-
cv = GridSearchCV(ShrunkCovariance(), tuned_parameters)
86+
cv = GridSearchCV(ShrunkCovariance(), tuned_parameters, cv=5)
8787
cv.fit(X_train)
8888

8989
# Ledoit-Wolf optimal shrinkage coefficient estimate

dev/_downloads/plot_cv_diabetes.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"from __future__ import print_function\nprint(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.linear_model import LassoCV\nfrom sklearn.linear_model import Lasso\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import GridSearchCV\n\ndiabetes = datasets.load_diabetes()\nX = diabetes.data[:150]\ny = diabetes.target[:150]\n\nlasso = Lasso(random_state=0)\nalphas = np.logspace(-4, -0.5, 30)\n\ntuned_parameters = [{'alpha': alphas}]\nn_folds = 3\n\nclf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)\nclf.fit(X, y)\nscores = clf.cv_results_['mean_test_score']\nscores_std = clf.cv_results_['std_test_score']\nplt.figure().set_size_inches(8, 6)\nplt.semilogx(alphas, scores)\n\n# plot error lines showing +/- std. errors of the scores\nstd_error = scores_std / np.sqrt(n_folds)\n\nplt.semilogx(alphas, scores + std_error, 'b--')\nplt.semilogx(alphas, scores - std_error, 'b--')\n\n# alpha=0.2 controls the translucency of the fill color\nplt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)\n\nplt.ylabel('CV score +/- std error')\nplt.xlabel('alpha')\nplt.axhline(np.max(scores), linestyle='--', color='.5')\nplt.xlim([alphas[0], alphas[-1]])\n\n# #############################################################################\n# Bonus: how much can you trust the selection of alpha?\n\n# To answer this question we use the LassoCV object that sets its alpha\n# parameter automatically from the data by internal cross-validation (i.e. it\n# performs cross-validation on the training data it receives).\n# We use external cross-validation to see how much the automatically obtained\n# alphas differ across different cross-validation folds.\nlasso_cv = LassoCV(alphas=alphas, random_state=0)\nk_fold = KFold(3)\n\nprint(\"Answer to the bonus question:\",\n \"how much can you trust the selection of alpha?\")\nprint()\nprint(\"Alpha parameters maximising the generalization score on different\")\nprint(\"subsets of the data:\")\nfor k, (train, test) in enumerate(k_fold.split(X, y)):\n lasso_cv.fit(X[train], y[train])\n print(\"[fold {0}] alpha: {1:.5f}, score: {2:.5f}\".\n format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))\nprint()\nprint(\"Answer: Not very much since we obtained different alphas for different\")\nprint(\"subsets of the data and moreover, the scores for these alphas differ\")\nprint(\"quite substantially.\")\n\nplt.show()"
29+
"from __future__ import print_function\nprint(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.linear_model import LassoCV\nfrom sklearn.linear_model import Lasso\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import GridSearchCV\n\ndiabetes = datasets.load_diabetes()\nX = diabetes.data[:150]\ny = diabetes.target[:150]\n\nlasso = Lasso(random_state=0)\nalphas = np.logspace(-4, -0.5, 30)\n\ntuned_parameters = [{'alpha': alphas}]\nn_folds = 5\n\nclf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)\nclf.fit(X, y)\nscores = clf.cv_results_['mean_test_score']\nscores_std = clf.cv_results_['std_test_score']\nplt.figure().set_size_inches(8, 6)\nplt.semilogx(alphas, scores)\n\n# plot error lines showing +/- std. errors of the scores\nstd_error = scores_std / np.sqrt(n_folds)\n\nplt.semilogx(alphas, scores + std_error, 'b--')\nplt.semilogx(alphas, scores - std_error, 'b--')\n\n# alpha=0.2 controls the translucency of the fill color\nplt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)\n\nplt.ylabel('CV score +/- std error')\nplt.xlabel('alpha')\nplt.axhline(np.max(scores), linestyle='--', color='.5')\nplt.xlim([alphas[0], alphas[-1]])\n\n# #############################################################################\n# Bonus: how much can you trust the selection of alpha?\n\n# To answer this question we use the LassoCV object that sets its alpha\n# parameter automatically from the data by internal cross-validation (i.e. it\n# performs cross-validation on the training data it receives).\n# We use external cross-validation to see how much the automatically obtained\n# alphas differ across different cross-validation folds.\nlasso_cv = LassoCV(alphas=alphas, cv=5, random_state=0)\nk_fold = KFold(3)\n\nprint(\"Answer to the bonus question:\",\n \"how much can you trust the selection of alpha?\")\nprint()\nprint(\"Alpha parameters maximising the generalization score on different\")\nprint(\"subsets of the data:\")\nfor k, (train, test) in enumerate(k_fold.split(X, y)):\n lasso_cv.fit(X[train], y[train])\n print(\"[fold {0}] alpha: {1:.5f}, score: {2:.5f}\".\n format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))\nprint()\nprint(\"Answer: Not very much since we obtained different alphas for different\")\nprint(\"subsets of the data and moreover, the scores for these alphas differ\")\nprint(\"quite substantially.\")\n\nplt.show()"
3030
]
3131
}
3232
],

dev/_downloads/plot_cv_diabetes.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
alphas = np.logspace(-4, -0.5, 30)
3030

3131
tuned_parameters = [{'alpha': alphas}]
32-
n_folds = 3
32+
n_folds = 5
3333

3434
clf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)
3535
clf.fit(X, y)
@@ -60,7 +60,7 @@
6060
# performs cross-validation on the training data it receives).
6161
# We use external cross-validation to see how much the automatically obtained
6262
# alphas differ across different cross-validation folds.
63-
lasso_cv = LassoCV(alphas=alphas, random_state=0)
63+
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=0)
6464
k_fold = KFold(3)
6565

6666
print("Answer to the bonus question:",

0 commit comments

Comments
 (0)