Skip to content

Commit 892de9b

Browse files
committed
Pushing the docs to dev/ for branch: master, commit 8e4a8763ceac1624eac05ce3a52a6f7025af2ddb
1 parent 0c2becd commit 892de9b

File tree

1,209 files changed

+5582
-4384
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,209 files changed

+5582
-4384
lines changed
6.67 KB
Binary file not shown.
5.64 KB
Binary file not shown.

dev/_downloads/grid_search_text_feature_extraction.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier()),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (5,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n # 'clf__max_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, cv=5,\n n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
29+
"# Author: Olivier Grisel <[email protected]>\n# Peter Prettenhofer <[email protected]>\n# Mathieu Blondel <[email protected]>\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nfrom pprint import pprint\nfrom time import time\nimport logging\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\n\nprint(__doc__)\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO,\n format='%(asctime)s %(levelname)s %(message)s')\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n 'alt.atheism',\n 'talk.religion.misc',\n]\n# Uncomment the following to do the analysis on all the categories\n#categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndata = fetch_20newsgroups(subset='train', categories=categories)\nprint(\"%d documents\" % len(data.filenames))\nprint(\"%d categories\" % len(data.target_names))\nprint()\n\n# #############################################################################\n# Define a pipeline combining a text feature extractor with a simple\n# classifier\npipeline = Pipeline([\n ('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier(tol=1e-3)),\n])\n\n# uncommenting more parameters will give better exploring power but will\n# increase processing time in a combinatorial way\nparameters = {\n 'vect__max_df': (0.5, 0.75, 1.0),\n # 'vect__max_features': (None, 5000, 10000, 50000),\n 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams\n # 'tfidf__use_idf': (True, False),\n # 'tfidf__norm': ('l1', 'l2'),\n 'clf__max_iter': (20,),\n 'clf__alpha': (0.00001, 0.000001),\n 'clf__penalty': ('l2', 'elasticnet'),\n # 'clf__max_iter': (10, 50, 80),\n}\n\nif __name__ == \"__main__\":\n # multiprocessing requires the fork to happen in a __main__ protected\n # block\n\n # find the best parameters for both the feature extraction and the\n # classifier\n grid_search = GridSearchCV(pipeline, parameters, cv=5,\n n_jobs=-1, verbose=1)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n t0 = time()\n grid_search.fit(data.data, data.target)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))"
3030
]
3131
}
3232
],

dev/_downloads/grid_search_text_feature_extraction.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@
9090
pipeline = Pipeline([
9191
('vect', CountVectorizer()),
9292
('tfidf', TfidfTransformer()),
93-
('clf', SGDClassifier()),
93+
('clf', SGDClassifier(tol=1e-3)),
9494
])
9595

9696
# uncommenting more parameters will give better exploring power but will
@@ -101,7 +101,7 @@
101101
'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
102102
# 'tfidf__use_idf': (True, False),
103103
# 'tfidf__norm': ('l1', 'l2'),
104-
'clf__max_iter': (5,),
104+
'clf__max_iter': (20,),
105105
'clf__alpha': (0.00001, 0.000001),
106106
'clf__penalty': ('l2', 'elasticnet'),
107107
# 'clf__max_iter': (10, 50, 80),

dev/_downloads/plot_adjusted_for_chance_measures.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"print(__doc__)\n\n# Author: Olivier Grisel <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom time import time\nfrom sklearn import metrics\n\n\ndef uniform_labelings_scores(score_func, n_samples, n_clusters_range,\n fixed_n_classes=None, n_runs=5, seed=42):\n \"\"\"Compute score for 2 random uniform cluster labelings.\n\n Both random labelings have the same number of clusters for each value\n possible value in ``n_clusters_range``.\n\n When fixed_n_classes is not None the first labeling is considered a ground\n truth class assignment with fixed number of classes.\n \"\"\"\n random_labels = np.random.RandomState(seed).randint\n scores = np.zeros((len(n_clusters_range), n_runs))\n\n if fixed_n_classes is not None:\n labels_a = random_labels(low=0, high=fixed_n_classes, size=n_samples)\n\n for i, k in enumerate(n_clusters_range):\n for j in range(n_runs):\n if fixed_n_classes is None:\n labels_a = random_labels(low=0, high=k, size=n_samples)\n labels_b = random_labels(low=0, high=k, size=n_samples)\n scores[i, j] = score_func(labels_a, labels_b)\n return scores\n\nscore_funcs = [\n metrics.adjusted_rand_score,\n metrics.v_measure_score,\n metrics.adjusted_mutual_info_score,\n metrics.mutual_info_score,\n]\n\n# 2 independent random clusterings with equal cluster number\n\nn_samples = 100\nn_clusters_range = np.linspace(2, n_samples, 10).astype(np.int)\n\nplt.figure(1)\n\nplots = []\nnames = []\nfor score_func in score_funcs:\n print(\"Computing %s for %d values of n_clusters and n_samples=%d\"\n % (score_func.__name__, len(n_clusters_range), n_samples))\n\n t0 = time()\n scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range)\n print(\"done in %0.3fs\" % (time() - t0))\n plots.append(plt.errorbar(\n n_clusters_range, np.median(scores, axis=1), scores.std(axis=1))[0])\n names.append(score_func.__name__)\n\nplt.title(\"Clustering measures for 2 random uniform labelings\\n\"\n \"with equal number of clusters\")\nplt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)\nplt.ylabel('Score value')\nplt.legend(plots, names)\nplt.ylim(ymin=-0.05, ymax=1.05)\n\n\n# Random labeling with varying n_clusters against ground class labels\n# with fixed number of clusters\n\nn_samples = 1000\nn_clusters_range = np.linspace(2, 100, 10).astype(np.int)\nn_classes = 10\n\nplt.figure(2)\n\nplots = []\nnames = []\nfor score_func in score_funcs:\n print(\"Computing %s for %d values of n_clusters and n_samples=%d\"\n % (score_func.__name__, len(n_clusters_range), n_samples))\n\n t0 = time()\n scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range,\n fixed_n_classes=n_classes)\n print(\"done in %0.3fs\" % (time() - t0))\n plots.append(plt.errorbar(\n n_clusters_range, scores.mean(axis=1), scores.std(axis=1))[0])\n names.append(score_func.__name__)\n\nplt.title(\"Clustering measures for random uniform labeling\\n\"\n \"against reference assignment with %d classes\" % n_classes)\nplt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)\nplt.ylabel('Score value')\nplt.ylim(ymin=-0.05, ymax=1.05)\nplt.legend(plots, names)\nplt.show()"
29+
"print(__doc__)\n\n# Author: Olivier Grisel <[email protected]>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom time import time\nfrom sklearn import metrics\n\ndef uniform_labelings_scores(score_func, n_samples, n_clusters_range,\n fixed_n_classes=None, n_runs=5, seed=42):\n \"\"\"Compute score for 2 random uniform cluster labelings.\n\n Both random labelings have the same number of clusters for each value\n possible value in ``n_clusters_range``.\n\n When fixed_n_classes is not None the first labeling is considered a ground\n truth class assignment with fixed number of classes.\n \"\"\"\n random_labels = np.random.RandomState(seed).randint\n scores = np.zeros((len(n_clusters_range), n_runs))\n\n if fixed_n_classes is not None:\n labels_a = random_labels(low=0, high=fixed_n_classes, size=n_samples)\n\n for i, k in enumerate(n_clusters_range):\n for j in range(n_runs):\n if fixed_n_classes is None:\n labels_a = random_labels(low=0, high=k, size=n_samples)\n labels_b = random_labels(low=0, high=k, size=n_samples)\n scores[i, j] = score_func(labels_a, labels_b)\n return scores\n\n\ndef ami_score(U, V):\n return metrics.adjusted_mutual_info_score(U, V,\n average_method='arithmetic')\n\nscore_funcs = [\n metrics.adjusted_rand_score,\n metrics.v_measure_score,\n ami_score,\n metrics.mutual_info_score,\n]\n\n# 2 independent random clusterings with equal cluster number\n\nn_samples = 100\nn_clusters_range = np.linspace(2, n_samples, 10).astype(np.int)\n\nplt.figure(1)\n\nplots = []\nnames = []\nfor score_func in score_funcs:\n print(\"Computing %s for %d values of n_clusters and n_samples=%d\"\n % (score_func.__name__, len(n_clusters_range), n_samples))\n\n t0 = time()\n scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range)\n print(\"done in %0.3fs\" % (time() - t0))\n plots.append(plt.errorbar(\n n_clusters_range, np.median(scores, axis=1), scores.std(axis=1))[0])\n names.append(score_func.__name__)\n\nplt.title(\"Clustering measures for 2 random uniform labelings\\n\"\n \"with equal number of clusters\")\nplt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)\nplt.ylabel('Score value')\nplt.legend(plots, names)\nplt.ylim(bottom=-0.05, top=1.05)\n\n\n# Random labeling with varying n_clusters against ground class labels\n# with fixed number of clusters\n\nn_samples = 1000\nn_clusters_range = np.linspace(2, 100, 10).astype(np.int)\nn_classes = 10\n\nplt.figure(2)\n\nplots = []\nnames = []\nfor score_func in score_funcs:\n print(\"Computing %s for %d values of n_clusters and n_samples=%d\"\n % (score_func.__name__, len(n_clusters_range), n_samples))\n\n t0 = time()\n scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range,\n fixed_n_classes=n_classes)\n print(\"done in %0.3fs\" % (time() - t0))\n plots.append(plt.errorbar(\n n_clusters_range, scores.mean(axis=1), scores.std(axis=1))[0])\n names.append(score_func.__name__)\n\nplt.title(\"Clustering measures for random uniform labeling\\n\"\n \"against reference assignment with %d classes\" % n_classes)\nplt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)\nplt.ylabel('Score value')\nplt.ylim(bottom=-0.05, top=1.05)\nplt.legend(plots, names)\nplt.show()"
3030
]
3131
}
3232
],

dev/_downloads/plot_adjusted_for_chance_measures.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from time import time
3131
from sklearn import metrics
3232

33-
3433
def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
3534
fixed_n_classes=None, n_runs=5, seed=42):
3635
"""Compute score for 2 random uniform cluster labelings.
@@ -55,10 +54,15 @@ def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
5554
scores[i, j] = score_func(labels_a, labels_b)
5655
return scores
5756

57+
58+
def ami_score(U, V):
59+
return metrics.adjusted_mutual_info_score(U, V,
60+
average_method='arithmetic')
61+
5862
score_funcs = [
5963
metrics.adjusted_rand_score,
6064
metrics.v_measure_score,
61-
metrics.adjusted_mutual_info_score,
65+
ami_score,
6266
metrics.mutual_info_score,
6367
]
6468

@@ -87,7 +91,7 @@ def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
8791
plt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)
8892
plt.ylabel('Score value')
8993
plt.legend(plots, names)
90-
plt.ylim(ymin=-0.05, ymax=1.05)
94+
plt.ylim(bottom=-0.05, top=1.05)
9195

9296

9397
# Random labeling with varying n_clusters against ground class labels
@@ -117,6 +121,6 @@ def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
117121
"against reference assignment with %d classes" % n_classes)
118122
plt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)
119123
plt.ylabel('Score value')
120-
plt.ylim(ymin=-0.05, ymax=1.05)
124+
plt.ylim(bottom=-0.05, top=1.05)
121125
plt.legend(plots, names)
122126
plt.show()

dev/_downloads/plot_affinity_propagation.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"print(__doc__)\n\nfrom sklearn.cluster import AffinityPropagation\nfrom sklearn import metrics\nfrom sklearn.datasets.samples_generator import make_blobs\n\n# #############################################################################\n# Generate sample data\ncenters = [[1, 1], [-1, -1], [1, -1]]\nX, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,\n random_state=0)\n\n# #############################################################################\n# Compute Affinity Propagation\naf = AffinityPropagation(preference=-50).fit(X)\ncluster_centers_indices = af.cluster_centers_indices_\nlabels = af.labels_\n\nn_clusters_ = len(cluster_centers_indices)\n\nprint('Estimated number of clusters: %d' % n_clusters_)\nprint(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels_true, labels))\nprint(\"Completeness: %0.3f\" % metrics.completeness_score(labels_true, labels))\nprint(\"V-measure: %0.3f\" % metrics.v_measure_score(labels_true, labels))\nprint(\"Adjusted Rand Index: %0.3f\"\n % metrics.adjusted_rand_score(labels_true, labels))\nprint(\"Adjusted Mutual Information: %0.3f\"\n % metrics.adjusted_mutual_info_score(labels_true, labels))\nprint(\"Silhouette Coefficient: %0.3f\"\n % metrics.silhouette_score(X, labels, metric='sqeuclidean'))\n\n# #############################################################################\n# Plot result\nimport matplotlib.pyplot as plt\nfrom itertools import cycle\n\nplt.close('all')\nplt.figure(1)\nplt.clf()\n\ncolors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')\nfor k, col in zip(range(n_clusters_), colors):\n class_members = labels == k\n cluster_center = X[cluster_centers_indices[k]]\n plt.plot(X[class_members, 0], X[class_members, 1], col + '.')\n plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,\n markeredgecolor='k', markersize=14)\n for x in X[class_members]:\n plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)\n\nplt.title('Estimated number of clusters: %d' % n_clusters_)\nplt.show()"
29+
"print(__doc__)\n\nfrom sklearn.cluster import AffinityPropagation\nfrom sklearn import metrics\nfrom sklearn.datasets.samples_generator import make_blobs\n\n# #############################################################################\n# Generate sample data\ncenters = [[1, 1], [-1, -1], [1, -1]]\nX, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,\n random_state=0)\n\n# #############################################################################\n# Compute Affinity Propagation\naf = AffinityPropagation(preference=-50).fit(X)\ncluster_centers_indices = af.cluster_centers_indices_\nlabels = af.labels_\n\nn_clusters_ = len(cluster_centers_indices)\n\nprint('Estimated number of clusters: %d' % n_clusters_)\nprint(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels_true, labels))\nprint(\"Completeness: %0.3f\" % metrics.completeness_score(labels_true, labels))\nprint(\"V-measure: %0.3f\" % metrics.v_measure_score(labels_true, labels))\nprint(\"Adjusted Rand Index: %0.3f\"\n % metrics.adjusted_rand_score(labels_true, labels))\nprint(\"Adjusted Mutual Information: %0.3f\"\n % metrics.adjusted_mutual_info_score(labels_true, labels,\n average_method='arithmetic'))\nprint(\"Silhouette Coefficient: %0.3f\"\n % metrics.silhouette_score(X, labels, metric='sqeuclidean'))\n\n# #############################################################################\n# Plot result\nimport matplotlib.pyplot as plt\nfrom itertools import cycle\n\nplt.close('all')\nplt.figure(1)\nplt.clf()\n\ncolors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')\nfor k, col in zip(range(n_clusters_), colors):\n class_members = labels == k\n cluster_center = X[cluster_centers_indices[k]]\n plt.plot(X[class_members, 0], X[class_members, 1], col + '.')\n plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,\n markeredgecolor='k', markersize=14)\n for x in X[class_members]:\n plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)\n\nplt.title('Estimated number of clusters: %d' % n_clusters_)\nplt.show()"
3030
]
3131
}
3232
],

dev/_downloads/plot_affinity_propagation.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@
3535
print("Adjusted Rand Index: %0.3f"
3636
% metrics.adjusted_rand_score(labels_true, labels))
3737
print("Adjusted Mutual Information: %0.3f"
38-
% metrics.adjusted_mutual_info_score(labels_true, labels))
38+
% metrics.adjusted_mutual_info_score(labels_true, labels,
39+
average_method='arithmetic'))
3940
print("Silhouette Coefficient: %0.3f"
4041
% metrics.silhouette_score(X, labels, metric='sqeuclidean'))
4142

0 commit comments

Comments
 (0)