
Commit ab682f7

Pushing the docs to dev/ for branch: master, commit e1dd0d85c4a19795523668403bb066c6d0b9592b
1 parent d146f14 commit ab682f7

File tree

1,102 files changed: +3554 / -4213 lines

-158 Bytes (binary file not shown)
-160 Bytes (binary file not shown)

dev/_downloads/plot_discretization_classification.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"# Code source: Tom Dupr\u00e9 la Tour\n# Adapted from plot_classifier_comparison by Ga\u00ebl Varoquaux and Andreas M\u00fcller\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.datasets import make_moons, make_circles, make_classification\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import KBinsDiscretizer\nfrom sklearn.svm import SVC, LinearSVC\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.utils.testing import ignore_warnings\nfrom sklearn.exceptions import ConvergenceWarning\n\nprint(__doc__)\n\nh = .02 # step size in the mesh\n\n\ndef get_name(estimator):\n name = estimator.__class__.__name__\n if name == 'Pipeline':\n name = [get_name(est[1]) for est in estimator.steps]\n name = ' + '.join(name)\n return name\n\n\n# list of (estimator, param_grid), where param_grid is used in GridSearchCV\nclassifiers = [\n (LogisticRegression(solver='lbfgs', random_state=0), {\n 'C': np.logspace(-2, 7, 10)\n }),\n (LinearSVC(random_state=0), {\n 'C': np.logspace(-2, 7, 10)\n }),\n (make_pipeline(\n KBinsDiscretizer(encode='onehot'),\n LogisticRegression(solver='lbfgs', random_state=0)), {\n 'kbinsdiscretizer__n_bins': np.arange(2, 10),\n 'logisticregression__C': np.logspace(-2, 7, 10),\n }),\n (make_pipeline(\n KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), {\n 'kbinsdiscretizer__n_bins': np.arange(2, 10),\n 'linearsvc__C': np.logspace(-2, 7, 10),\n }),\n (GradientBoostingClassifier(n_estimators=50, random_state=0), {\n 'learning_rate': np.logspace(-4, 0, 10)\n }),\n (SVC(random_state=0, gamma='scale'), {\n 'C': np.logspace(-2, 7, 10)\n }),\n]\n\nnames = [get_name(e) for e, g in classifiers]\n\nn_samples = 100\ndatasets = [\n make_moons(n_samples=n_samples, noise=0.2, random_state=0),\n make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),\n make_classification(n_samples=n_samples, n_features=2, n_redundant=0,\n n_informative=2, random_state=2,\n n_clusters_per_class=1)\n]\n\nfigure = plt.figure(figsize=(21, 9))\ncm = plt.cm.PiYG\ncm_bright = ListedColormap(['#b30065', '#178000'])\ni = 1\n# iterate over datasets\nfor ds_cnt, (X, y) in enumerate(datasets):\n print('\\ndataset %d\\n---------' % ds_cnt)\n\n # preprocess dataset, split into training and test part\n X = StandardScaler().fit_transform(X)\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=.5, random_state=42)\n\n # create the grid for background colors\n x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5\n y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n xx, yy = np.meshgrid(\n np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n\n # plot the dataset first\n ax = plt.subplot(len(datasets), len(classifiers) + 1, i)\n if ds_cnt == 0:\n ax.set_title(\"Input data\")\n # plot the training points\n ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,\n edgecolors='k')\n # and testing points\n ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,\n edgecolors='k')\n ax.set_xlim(xx.min(), xx.max())\n ax.set_ylim(yy.min(), yy.max())\n ax.set_xticks(())\n ax.set_yticks(())\n i += 1\n\n # iterate over classifiers\n for name, (estimator, param_grid) in zip(names, classifiers):\n ax = 
plt.subplot(len(datasets), len(classifiers) + 1, i)\n\n clf = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5,\n iid=False)\n with ignore_warnings(category=ConvergenceWarning):\n clf.fit(X_train, y_train)\n score = clf.score(X_test, y_test)\n print('%s: %.2f' % (name, score))\n\n # plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]*[y_min, y_max].\n if hasattr(clf, \"decision_function\"):\n Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n else:\n Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]\n\n # put the result into a color plot\n Z = Z.reshape(xx.shape)\n ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)\n\n # plot the training points\n ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,\n edgecolors='k')\n # and testing points\n ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,\n edgecolors='k', alpha=0.6)\n ax.set_xlim(xx.min(), xx.max())\n ax.set_ylim(yy.min(), yy.max())\n ax.set_xticks(())\n ax.set_yticks(())\n\n if ds_cnt == 0:\n ax.set_title(name.replace(' + ', '\\n'))\n ax.text(0.95, 0.06, ('%.2f' % score).lstrip('0'), size=15,\n bbox=dict(boxstyle='round', alpha=0.8, facecolor='white'),\n transform=ax.transAxes, horizontalalignment='right')\n\n i += 1\n\nplt.tight_layout()\n\n# Add suptitles above the figure\nplt.subplots_adjust(top=0.90)\nsuptitles = [\n 'Linear classifiers',\n 'Feature discretization and linear classifiers',\n 'Non-linear classifiers',\n]\nfor i, suptitle in zip([2, 4, 6], suptitles):\n ax = plt.subplot(len(datasets), len(classifiers) + 1, i)\n ax.text(1.05, 1.25, suptitle, transform=ax.transAxes,\n horizontalalignment='center', size='x-large')\nplt.show()"
+
"# Code source: Tom Dupr\u00e9 la Tour\n# Adapted from plot_classifier_comparison by Ga\u00ebl Varoquaux and Andreas M\u00fcller\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.datasets import make_moons, make_circles, make_classification\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import KBinsDiscretizer\nfrom sklearn.svm import SVC, LinearSVC\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.utils.testing import ignore_warnings\nfrom sklearn.exceptions import ConvergenceWarning\n\nprint(__doc__)\n\nh = .02 # step size in the mesh\n\n\ndef get_name(estimator):\n name = estimator.__class__.__name__\n if name == 'Pipeline':\n name = [get_name(est[1]) for est in estimator.steps]\n name = ' + '.join(name)\n return name\n\n\n# list of (estimator, param_grid), where param_grid is used in GridSearchCV\nclassifiers = [\n (LogisticRegression(solver='lbfgs', random_state=0), {\n 'C': np.logspace(-2, 7, 10)\n }),\n (LinearSVC(random_state=0), {\n 'C': np.logspace(-2, 7, 10)\n }),\n (make_pipeline(\n KBinsDiscretizer(encode='onehot'),\n LogisticRegression(solver='lbfgs', random_state=0)), {\n 'kbinsdiscretizer__n_bins': np.arange(2, 10),\n 'logisticregression__C': np.logspace(-2, 7, 10),\n }),\n (make_pipeline(\n KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), {\n 'kbinsdiscretizer__n_bins': np.arange(2, 10),\n 'linearsvc__C': np.logspace(-2, 7, 10),\n }),\n (GradientBoostingClassifier(n_estimators=50, random_state=0), {\n 'learning_rate': np.logspace(-4, 0, 10)\n }),\n (SVC(random_state=0, gamma='scale'), {\n 'C': np.logspace(-2, 7, 10)\n }),\n]\n\nnames = [get_name(e) for e, g in classifiers]\n\nn_samples = 100\ndatasets = [\n make_moons(n_samples=n_samples, noise=0.2, random_state=0),\n make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),\n make_classification(n_samples=n_samples, n_features=2, n_redundant=0,\n n_informative=2, random_state=2,\n n_clusters_per_class=1)\n]\n\nfig, axes = plt.subplots(nrows=len(datasets), ncols=len(classifiers) + 1,\n figsize=(21, 9))\n\ncm = plt.cm.PiYG\ncm_bright = ListedColormap(['#b30065', '#178000'])\n\n# iterate over datasets\nfor ds_cnt, (X, y) in enumerate(datasets):\n print('\\ndataset %d\\n---------' % ds_cnt)\n\n # preprocess dataset, split into training and test part\n X = StandardScaler().fit_transform(X)\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=.5, random_state=42)\n\n # create the grid for background colors\n x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5\n y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n xx, yy = np.meshgrid(\n np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n\n # plot the dataset first\n ax = axes[ds_cnt, 0]\n if ds_cnt == 0:\n ax.set_title(\"Input data\")\n # plot the training points\n ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,\n edgecolors='k')\n # and testing points\n ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,\n edgecolors='k')\n ax.set_xlim(xx.min(), xx.max())\n ax.set_ylim(yy.min(), yy.max())\n ax.set_xticks(())\n ax.set_yticks(())\n\n # iterate over classifiers\n for est_idx, (name, (estimator, param_grid)) in \\\n enumerate(zip(names, 
classifiers)):\n ax = axes[ds_cnt, est_idx + 1]\n\n clf = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5,\n iid=False)\n with ignore_warnings(category=ConvergenceWarning):\n clf.fit(X_train, y_train)\n score = clf.score(X_test, y_test)\n print('%s: %.2f' % (name, score))\n\n # plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]*[y_min, y_max].\n if hasattr(clf, \"decision_function\"):\n Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n else:\n Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]\n\n # put the result into a color plot\n Z = Z.reshape(xx.shape)\n ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)\n\n # plot the training points\n ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,\n edgecolors='k')\n # and testing points\n ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,\n edgecolors='k', alpha=0.6)\n ax.set_xlim(xx.min(), xx.max())\n ax.set_ylim(yy.min(), yy.max())\n ax.set_xticks(())\n ax.set_yticks(())\n\n if ds_cnt == 0:\n ax.set_title(name.replace(' + ', '\\n'))\n ax.text(0.95, 0.06, ('%.2f' % score).lstrip('0'), size=15,\n bbox=dict(boxstyle='round', alpha=0.8, facecolor='white'),\n transform=ax.transAxes, horizontalalignment='right')\n\n\nplt.tight_layout()\n\n# Add suptitles above the figure\nplt.subplots_adjust(top=0.90)\nsuptitles = [\n 'Linear classifiers',\n 'Feature discretization and linear classifiers',\n 'Non-linear classifiers',\n]\nfor i, suptitle in zip([1, 3, 5], suptitles):\n ax = axes[0, i]\n ax.text(1.05, 1.25, suptitle, transform=ax.transAxes,\n horizontalalignment='center', size='x-large')\nplt.show()"
 ]
 }
 ],
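
The cell above tunes each estimator with GridSearchCV, and for the discretized variants it names hyperparameters through make_pipeline's stepname__parameter convention. Below is a minimal, self-contained sketch of that pattern; the toy data and the smaller grid values are illustrative, not the ones used by the example.

# A minimal sketch (not the example itself) of tuning a KBinsDiscretizer +
# LogisticRegression pipeline with GridSearchCV. Grid values are illustrative.
import numpy as np

from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer

X, y = make_moons(n_samples=100, noise=0.2, random_state=0)

# make_pipeline names each step after its lowercased class name, so the grid
# keys below follow the 'stepname__parameter' convention.
pipe = make_pipeline(KBinsDiscretizer(encode='onehot'),
                     LogisticRegression(solver='lbfgs', random_state=0))
param_grid = {
    'kbinsdiscretizer__n_bins': np.arange(2, 6),
    'logisticregression__C': np.logspace(-2, 2, 5),
}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X, y)
print(search.best_params_, 'CV accuracy: %.2f' % search.best_score_)

Because make_pipeline derives step names from the lowercased class names, the example's grid keys read kbinsdiscretizer__n_bins and logisticregression__C without any manual step naming.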

dev/_downloads/plot_discretization_classification.py

Lines changed: 10 additions & 9 deletions
@@ -99,10 +99,12 @@ def get_name(estimator):
                         n_clusters_per_class=1)
 ]
 
-figure = plt.figure(figsize=(21, 9))
+fig, axes = plt.subplots(nrows=len(datasets), ncols=len(classifiers) + 1,
+                         figsize=(21, 9))
+
 cm = plt.cm.PiYG
 cm_bright = ListedColormap(['#b30065', '#178000'])
-i = 1
+
 # iterate over datasets
 for ds_cnt, (X, y) in enumerate(datasets):
     print('\ndataset %d\n---------' % ds_cnt)
@@ -119,7 +121,7 @@ def get_name(estimator):
         np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
     # plot the dataset first
-    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
+    ax = axes[ds_cnt, 0]
     if ds_cnt == 0:
         ax.set_title("Input data")
     # plot the training points
@@ -132,11 +134,11 @@ def get_name(estimator):
     ax.set_ylim(yy.min(), yy.max())
     ax.set_xticks(())
     ax.set_yticks(())
-    i += 1
 
     # iterate over classifiers
-    for name, (estimator, param_grid) in zip(names, classifiers):
-        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
+    for est_idx, (name, (estimator, param_grid)) in \
+            enumerate(zip(names, classifiers)):
+        ax = axes[ds_cnt, est_idx + 1]
 
         clf = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5,
                            iid=False)
@@ -173,7 +175,6 @@ def get_name(estimator):
                 bbox=dict(boxstyle='round', alpha=0.8, facecolor='white'),
                 transform=ax.transAxes, horizontalalignment='right')
 
-        i += 1
 
 plt.tight_layout()
 
@@ -184,8 +185,8 @@ def get_name(estimator):
     'Feature discretization and linear classifiers',
     'Non-linear classifiers',
 ]
-for i, suptitle in zip([2, 4, 6], suptitles):
-    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
+for i, suptitle in zip([1, 3, 5], suptitles):
+    ax = axes[0, i]
     ax.text(1.05, 1.25, suptitle, transform=ax.transAxes,
             horizontalalignment='center', size='x-large')
 plt.show()
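
The diff swaps a manually incremented subplot index for a grid of axes created once with plt.subplots and indexed by [row, column]. Below is a minimal sketch of that pattern; the grid shape and plotted data are made up for illustration.

# A minimal sketch of the plt.subplots pattern adopted by this change: build
# the whole grid of axes up front and index it as axes[row, col], instead of
# advancing a plt.subplot(..., i) counter.
import numpy as np
import matplotlib.pyplot as plt

n_rows, n_cols = 3, 7  # e.g. len(datasets) rows, len(classifiers) + 1 columns
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(21, 9))

rng = np.random.RandomState(0)
for row in range(n_rows):
    for col in range(n_cols):
        ax = axes[row, col]        # direct 2-D indexing, no running counter
        ax.plot(rng.randn(10))
        ax.set_xticks(())
        ax.set_yticks(())

plt.tight_layout()
plt.show()

Dropping the running counter i removes the bookkeeping the old plt.subplot(..., i) calls required, and it makes a specific panel easy to address, which is exactly what the suptitle loop at the end of the example now does with axes[0, i].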

dev/_downloads/plot_ensemble_oob.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"import matplotlib.pyplot as plt\n\nfrom collections import OrderedDict\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n\n# Author: Kian Ho <[email protected]>\n# Gilles Louppe <[email protected]>\n# Andreas Mueller <[email protected]>\n#\n# License: BSD 3 Clause\n\nprint(__doc__)\n\nRANDOM_STATE = 123\n\n# Generate a binary classification dataset.\nX, y = make_classification(n_samples=500, n_features=25,\n n_clusters_per_class=1, n_informative=15,\n random_state=RANDOM_STATE)\n\n# NOTE: Setting the `warm_start` construction parameter to `True` disables\n# support for parallelized ensembles but is necessary for tracking the OOB\n# error trajectory during training.\nensemble_clfs = [\n (\"RandomForestClassifier, max_features='sqrt'\",\n RandomForestClassifier(warm_start=True, oob_score=True,\n max_features=\"sqrt\",\n random_state=RANDOM_STATE)),\n (\"RandomForestClassifier, max_features='log2'\",\n RandomForestClassifier(warm_start=True, max_features='log2',\n oob_score=True,\n random_state=RANDOM_STATE)),\n (\"RandomForestClassifier, max_features=None\",\n RandomForestClassifier(warm_start=True, max_features=None,\n oob_score=True,\n random_state=RANDOM_STATE))\n]\n\n# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.\nerror_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)\n\n# Range of `n_estimators` values to explore.\nmin_estimators = 15\nmax_estimators = 175\n\nfor label, clf in ensemble_clfs:\n for i in range(min_estimators, max_estimators + 1):\n clf.set_params(n_estimators=i)\n clf.fit(X, y)\n\n # Record the OOB error for each `n_estimators=i` setting.\n oob_error = 1 - clf.oob_score_\n error_rate[label].append((i, oob_error))\n\n# Generate the \"OOB error rate\" vs. \"n_estimators\" plot.\nfor label, clf_err in error_rate.items():\n xs, ys = zip(*clf_err)\n plt.plot(xs, ys, label=label)\n\nplt.xlim(min_estimators, max_estimators)\nplt.xlabel(\"n_estimators\")\nplt.ylabel(\"OOB error rate\")\nplt.legend(loc=\"upper right\")\nplt.show()"
+
"import matplotlib.pyplot as plt\n\nfrom collections import OrderedDict\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n\n# Author: Kian Ho <[email protected]>\n# Gilles Louppe <[email protected]>\n# Andreas Mueller <[email protected]>\n#\n# License: BSD 3 Clause\n\nprint(__doc__)\n\nRANDOM_STATE = 123\n\n# Generate a binary classification dataset.\nX, y = make_classification(n_samples=500, n_features=25,\n n_clusters_per_class=1, n_informative=15,\n random_state=RANDOM_STATE)\n\n# NOTE: Setting the `warm_start` construction parameter to `True` disables\n# support for parallelized ensembles but is necessary for tracking the OOB\n# error trajectory during training.\nensemble_clfs = [\n (\"RandomForestClassifier, max_features='sqrt'\",\n RandomForestClassifier(n_estimators=100,\n warm_start=True, oob_score=True,\n max_features=\"sqrt\",\n random_state=RANDOM_STATE)),\n (\"RandomForestClassifier, max_features='log2'\",\n RandomForestClassifier(n_estimators=100,\n warm_start=True, max_features='log2',\n oob_score=True,\n random_state=RANDOM_STATE)),\n (\"RandomForestClassifier, max_features=None\",\n RandomForestClassifier(n_estimators=100,\n warm_start=True, max_features=None,\n oob_score=True,\n random_state=RANDOM_STATE))\n]\n\n# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.\nerror_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)\n\n# Range of `n_estimators` values to explore.\nmin_estimators = 15\nmax_estimators = 175\n\nfor label, clf in ensemble_clfs:\n for i in range(min_estimators, max_estimators + 1):\n clf.set_params(n_estimators=i)\n clf.fit(X, y)\n\n # Record the OOB error for each `n_estimators=i` setting.\n oob_error = 1 - clf.oob_score_\n error_rate[label].append((i, oob_error))\n\n# Generate the \"OOB error rate\" vs. \"n_estimators\" plot.\nfor label, clf_err in error_rate.items():\n xs, ys = zip(*clf_err)\n plt.plot(xs, ys, label=label)\n\nplt.xlim(min_estimators, max_estimators)\nplt.xlabel(\"n_estimators\")\nplt.ylabel(\"OOB error rate\")\nplt.legend(loc=\"upper right\")\nplt.show()"
 ]
 }
 ],
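
The cell grows each forest in place by enabling warm_start and calling set_params/fit repeatedly, reading oob_score_ after each refit. Below is a minimal sketch of that loop, shortened to a coarser n_estimators range so it runs quickly; the range and step here are illustrative, not the example's 15 to 175.

# A minimal sketch of OOB-error tracking with warm_start, as in the cell
# above, but over a shorter, coarser n_estimators range for brevity.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=25, n_informative=15,
                           n_clusters_per_class=1, random_state=123)

clf = RandomForestClassifier(warm_start=True, oob_score=True,
                             max_features="sqrt", random_state=123)

oob_errors = []
for n in range(15, 51, 5):
    clf.set_params(n_estimators=n)  # warm_start keeps the trees already grown
    clf.fit(X, y)                   # so only the new trees are fitted here
    oob_errors.append((n, 1 - clf.oob_score_))

for n, err in oob_errors:
    print('n_estimators=%3d  OOB error=%.3f' % (n, err))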

dev/_downloads/plot_ensemble_oob.py

Lines changed: 6 additions & 3 deletions
@@ -45,15 +45,18 @@
 # error trajectory during training.
 ensemble_clfs = [
     ("RandomForestClassifier, max_features='sqrt'",
-        RandomForestClassifier(warm_start=True, oob_score=True,
+        RandomForestClassifier(n_estimators=100,
+                               warm_start=True, oob_score=True,
                                max_features="sqrt",
                                random_state=RANDOM_STATE)),
     ("RandomForestClassifier, max_features='log2'",
-        RandomForestClassifier(warm_start=True, max_features='log2',
+        RandomForestClassifier(n_estimators=100,
+                               warm_start=True, max_features='log2',
                                oob_score=True,
                                random_state=RANDOM_STATE)),
     ("RandomForestClassifier, max_features=None",
-        RandomForestClassifier(warm_start=True, max_features=None,
+        RandomForestClassifier(n_estimators=100,
+                               warm_start=True, max_features=None,
                                oob_score=True,
                                random_state=RANDOM_STATE))
 ]
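
The only change in this file is passing n_estimators=100 explicitly, which pins the ensemble size instead of relying on the constructor default (the default moved from 10 to 100 in scikit-learn 0.22, and the releases in between warn when it is left unset). Below is a minimal sketch of that version-proof construction; the toy data and remaining parameters are illustrative.

# A minimal sketch of pinning n_estimators explicitly rather than relying on
# the constructor default; data and parameters here are illustrative only.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

clf = RandomForestClassifier(n_estimators=100,     # explicit, so behaviour
                             oob_score=True,       # stays the same across
                             max_features="sqrt",  # scikit-learn releases
                             random_state=0)
clf.fit(X, y)
print("OOB score: %.3f" % clf.oob_score_)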
