
Commit c171049

Pushing the docs to dev/ for branch: main, commit d7ddffe9c1770596f5d5194486f763ac667fa64c
1 parent 868255d commit c171049

1,239 files changed: +4,393 additions, −4,336 deletions


dev/_downloads/4e46f015ab8300f262e6e8775bcdcf8a/plot_adaboost_multiclass.py

Lines changed: 2 additions & 2 deletions
@@ -46,12 +46,12 @@
 y_train, y_test = y[:n_split], y[n_split:]
 
 bdt_real = AdaBoostClassifier(
-    DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1
+    DecisionTreeClassifier(max_depth=2), n_estimators=300, learning_rate=1
 )
 
 bdt_discrete = AdaBoostClassifier(
     DecisionTreeClassifier(max_depth=2),
-    n_estimators=600,
+    n_estimators=300,
     learning_rate=1.5,
     algorithm="SAMME",
 )

dev/_downloads/607c99671400a5055ef516d1aabd00c1/plot_adaboost_multiclass.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Noel Dawe <[email protected]>\n#\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_gaussian_quantiles\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.tree import DecisionTreeClassifier\n\n\nX, y = make_gaussian_quantiles(\n n_samples=13000, n_features=10, n_classes=3, random_state=1\n)\n\nn_split = 3000\n\nX_train, X_test = X[:n_split], X[n_split:]\ny_train, y_test = y[:n_split], y[n_split:]\n\nbdt_real = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1\n)\n\nbdt_discrete = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2),\n n_estimators=600,\n learning_rate=1.5,\n algorithm=\"SAMME\",\n)\n\nbdt_real.fit(X_train, y_train)\nbdt_discrete.fit(X_train, y_train)\n\nreal_test_errors = []\ndiscrete_test_errors = []\n\nfor real_test_predict, discrete_train_predict in zip(\n bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)\n):\n real_test_errors.append(1.0 - accuracy_score(real_test_predict, y_test))\n discrete_test_errors.append(1.0 - accuracy_score(discrete_train_predict, y_test))\n\nn_trees_discrete = len(bdt_discrete)\nn_trees_real = len(bdt_real)\n\n# Boosting might terminate early, but the following arrays are always\n# n_estimators long. We crop them to the actual number of trees here:\ndiscrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]\nreal_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]\ndiscrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]\n\nplt.figure(figsize=(15, 5))\n\nplt.subplot(131)\nplt.plot(range(1, n_trees_discrete + 1), discrete_test_errors, c=\"black\", label=\"SAMME\")\nplt.plot(\n range(1, n_trees_real + 1),\n real_test_errors,\n c=\"black\",\n linestyle=\"dashed\",\n label=\"SAMME.R\",\n)\nplt.legend()\nplt.ylim(0.18, 0.62)\nplt.ylabel(\"Test Error\")\nplt.xlabel(\"Number of Trees\")\n\nplt.subplot(132)\nplt.plot(\n range(1, n_trees_discrete + 1),\n discrete_estimator_errors,\n \"b\",\n label=\"SAMME\",\n alpha=0.5,\n)\nplt.plot(\n range(1, n_trees_real + 1), real_estimator_errors, \"r\", label=\"SAMME.R\", alpha=0.5\n)\nplt.legend()\nplt.ylabel(\"Error\")\nplt.xlabel(\"Number of Trees\")\nplt.ylim((0.2, max(real_estimator_errors.max(), discrete_estimator_errors.max()) * 1.2))\nplt.xlim((-20, len(bdt_discrete) + 20))\n\nplt.subplot(133)\nplt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights, \"b\", label=\"SAMME\")\nplt.legend()\nplt.ylabel(\"Weight\")\nplt.xlabel(\"Number of Trees\")\nplt.ylim((0, discrete_estimator_weights.max() * 1.2))\nplt.xlim((-20, n_trees_discrete + 20))\n\n# prevent overlapping y-axis labels\nplt.subplots_adjust(wspace=0.25)\nplt.show()"
+"# Author: Noel Dawe <[email protected]>\n#\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_gaussian_quantiles\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.tree import DecisionTreeClassifier\n\n\nX, y = make_gaussian_quantiles(\n n_samples=13000, n_features=10, n_classes=3, random_state=1\n)\n\nn_split = 3000\n\nX_train, X_test = X[:n_split], X[n_split:]\ny_train, y_test = y[:n_split], y[n_split:]\n\nbdt_real = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2), n_estimators=300, learning_rate=1\n)\n\nbdt_discrete = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2),\n n_estimators=300,\n learning_rate=1.5,\n algorithm=\"SAMME\",\n)\n\nbdt_real.fit(X_train, y_train)\nbdt_discrete.fit(X_train, y_train)\n\nreal_test_errors = []\ndiscrete_test_errors = []\n\nfor real_test_predict, discrete_train_predict in zip(\n bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)\n):\n real_test_errors.append(1.0 - accuracy_score(real_test_predict, y_test))\n discrete_test_errors.append(1.0 - accuracy_score(discrete_train_predict, y_test))\n\nn_trees_discrete = len(bdt_discrete)\nn_trees_real = len(bdt_real)\n\n# Boosting might terminate early, but the following arrays are always\n# n_estimators long. We crop them to the actual number of trees here:\ndiscrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]\nreal_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]\ndiscrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]\n\nplt.figure(figsize=(15, 5))\n\nplt.subplot(131)\nplt.plot(range(1, n_trees_discrete + 1), discrete_test_errors, c=\"black\", label=\"SAMME\")\nplt.plot(\n range(1, n_trees_real + 1),\n real_test_errors,\n c=\"black\",\n linestyle=\"dashed\",\n label=\"SAMME.R\",\n)\nplt.legend()\nplt.ylim(0.18, 0.62)\nplt.ylabel(\"Test Error\")\nplt.xlabel(\"Number of Trees\")\n\nplt.subplot(132)\nplt.plot(\n range(1, n_trees_discrete + 1),\n discrete_estimator_errors,\n \"b\",\n label=\"SAMME\",\n alpha=0.5,\n)\nplt.plot(\n range(1, n_trees_real + 1), real_estimator_errors, \"r\", label=\"SAMME.R\", alpha=0.5\n)\nplt.legend()\nplt.ylabel(\"Error\")\nplt.xlabel(\"Number of Trees\")\nplt.ylim((0.2, max(real_estimator_errors.max(), discrete_estimator_errors.max()) * 1.2))\nplt.xlim((-20, len(bdt_discrete) + 20))\n\nplt.subplot(133)\nplt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights, \"b\", label=\"SAMME\")\nplt.legend()\nplt.ylabel(\"Weight\")\nplt.xlabel(\"Number of Trees\")\nplt.ylim((0, discrete_estimator_weights.max() * 1.2))\nplt.xlim((-20, n_trees_discrete + 20))\n\n# prevent overlapping y-axis labels\nplt.subplots_adjust(wspace=0.25)\nplt.show()"
 ]
 }
 ],

dev/_downloads/733ff7845fe2f197ecd0c72afcf23651/plot_randomized_search.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\n\nfrom time import time\nimport scipy.stats as stats\nfrom sklearn.utils.fixes import loguniform\n\nfrom sklearn.model_selection import GridSearchCV, RandomizedSearchCV\nfrom sklearn.datasets import load_digits\nfrom sklearn.linear_model import SGDClassifier\n\n# get some data\nX, y = load_digits(return_X_y=True)\n\n# build a classifier\nclf = SGDClassifier(loss=\"hinge\", penalty=\"elasticnet\", fit_intercept=True)\n\n\n# Utility function to report best scores\ndef report(results, n_top=3):\n for i in range(1, n_top + 1):\n candidates = np.flatnonzero(results[\"rank_test_score\"] == i)\n for candidate in candidates:\n print(\"Model with rank: {0}\".format(i))\n print(\n \"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n results[\"mean_test_score\"][candidate],\n results[\"std_test_score\"][candidate],\n )\n )\n print(\"Parameters: {0}\".format(results[\"params\"][candidate]))\n print(\"\")\n\n\n# specify parameters and distributions to sample from\nparam_dist = {\n \"average\": [True, False],\n \"l1_ratio\": stats.uniform(0, 1),\n \"alpha\": loguniform(1e-4, 1e0),\n}\n\n# run randomized search\nn_iter_search = 20\nrandom_search = RandomizedSearchCV(\n clf, param_distributions=param_dist, n_iter=n_iter_search\n)\n\nstart = time()\nrandom_search.fit(X, y)\nprint(\n \"RandomizedSearchCV took %.2f seconds for %d candidates parameter settings.\"\n % ((time() - start), n_iter_search)\n)\nreport(random_search.cv_results_)\n\n# use a full grid over all parameters\nparam_grid = {\n \"average\": [True, False],\n \"l1_ratio\": np.linspace(0, 1, num=10),\n \"alpha\": np.power(10, np.arange(-4, 1, dtype=float)),\n}\n\n# run grid search\ngrid_search = GridSearchCV(clf, param_grid=param_grid)\nstart = time()\ngrid_search.fit(X, y)\n\nprint(\n \"GridSearchCV took %.2f seconds for %d candidate parameter settings.\"\n % (time() - start, len(grid_search.cv_results_[\"params\"]))\n)\nreport(grid_search.cv_results_)"
+"import numpy as np\n\nfrom time import time\nimport scipy.stats as stats\nfrom sklearn.utils.fixes import loguniform\n\nfrom sklearn.model_selection import GridSearchCV, RandomizedSearchCV\nfrom sklearn.datasets import load_digits\nfrom sklearn.linear_model import SGDClassifier\n\n# get some data\nX, y = load_digits(return_X_y=True, n_class=3)\n\n# build a classifier\nclf = SGDClassifier(loss=\"hinge\", penalty=\"elasticnet\", fit_intercept=True)\n\n\n# Utility function to report best scores\ndef report(results, n_top=3):\n for i in range(1, n_top + 1):\n candidates = np.flatnonzero(results[\"rank_test_score\"] == i)\n for candidate in candidates:\n print(\"Model with rank: {0}\".format(i))\n print(\n \"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n results[\"mean_test_score\"][candidate],\n results[\"std_test_score\"][candidate],\n )\n )\n print(\"Parameters: {0}\".format(results[\"params\"][candidate]))\n print(\"\")\n\n\n# specify parameters and distributions to sample from\nparam_dist = {\n \"average\": [True, False],\n \"l1_ratio\": stats.uniform(0, 1),\n \"alpha\": loguniform(1e-2, 1e0),\n}\n\n# run randomized search\nn_iter_search = 15\nrandom_search = RandomizedSearchCV(\n clf, param_distributions=param_dist, n_iter=n_iter_search\n)\n\nstart = time()\nrandom_search.fit(X, y)\nprint(\n \"RandomizedSearchCV took %.2f seconds for %d candidates parameter settings.\"\n % ((time() - start), n_iter_search)\n)\nreport(random_search.cv_results_)\n\n# use a full grid over all parameters\nparam_grid = {\n \"average\": [True, False],\n \"l1_ratio\": np.linspace(0, 1, num=10),\n \"alpha\": np.power(10, np.arange(-2, 1, dtype=float)),\n}\n\n# run grid search\ngrid_search = GridSearchCV(clf, param_grid=param_grid)\nstart = time()\ngrid_search.fit(X, y)\n\nprint(\n \"GridSearchCV took %.2f seconds for %d candidate parameter settings.\"\n % (time() - start, len(grid_search.cv_results_[\"params\"]))\n)\nreport(grid_search.cv_results_)"
 ]
 }
 ],
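The distribution that changes in this diff, loguniform(1e-2, 1e0) for alpha, draws values uniformly on a log scale between 0.01 and 1, which is what lets RandomizedSearchCV cover several orders of magnitude with only a handful of candidates. A minimal sketch of what such a draw looks like follows; the sample size and random_state are illustrative assumptions, not part of the commit.

# Illustrative only: inspect a few draws from the narrowed alpha distribution
# used in the example above. loguniform behaves like a frozen scipy.stats
# distribution, so .rvs() is available on it.
from sklearn.utils.fixes import loguniform

alpha_dist = loguniform(1e-2, 1e0)
# values are spread evenly in log space, all within [0.01, 1]
print(alpha_dist.rvs(size=5, random_state=0))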

dev/_downloads/74caedf3eb449b80f3f00e66c1c576bd/plot_discretization_classification.py

Lines changed: 40 additions & 21 deletions
@@ -61,33 +61,53 @@ def get_name(estimator):
 
 
 # list of (estimator, param_grid), where param_grid is used in GridSearchCV
+# The parameter spaces in this example are limited to a narrow band to reduce
+# its runtime. In a real use case, a broader search space for the algorithms
+# should be used.
 classifiers = [
-    (LogisticRegression(random_state=0), {"C": np.logspace(-2, 7, 10)}),
-    (LinearSVC(random_state=0), {"C": np.logspace(-2, 7, 10)}),
+    (
+        make_pipeline(StandardScaler(), LogisticRegression(random_state=0)),
+        {"logisticregression__C": np.logspace(-1, 1, 3)},
+    ),
+    (
+        make_pipeline(StandardScaler(), LinearSVC(random_state=0)),
+        {"linearsvc__C": np.logspace(-1, 1, 3)},
+    ),
     (
         make_pipeline(
-            KBinsDiscretizer(encode="onehot"), LogisticRegression(random_state=0)
+            StandardScaler(),
+            KBinsDiscretizer(encode="onehot"),
+            LogisticRegression(random_state=0),
         ),
         {
-            "kbinsdiscretizer__n_bins": np.arange(2, 10),
-            "logisticregression__C": np.logspace(-2, 7, 10),
+            "kbinsdiscretizer__n_bins": np.arange(5, 8),
+            "logisticregression__C": np.logspace(-1, 1, 3),
         },
     ),
     (
-        make_pipeline(KBinsDiscretizer(encode="onehot"), LinearSVC(random_state=0)),
+        make_pipeline(
+            StandardScaler(),
+            KBinsDiscretizer(encode="onehot"),
+            LinearSVC(random_state=0),
+        ),
         {
-            "kbinsdiscretizer__n_bins": np.arange(2, 10),
-            "linearsvc__C": np.logspace(-2, 7, 10),
+            "kbinsdiscretizer__n_bins": np.arange(5, 8),
+            "linearsvc__C": np.logspace(-1, 1, 3),
         },
     ),
     (
-        GradientBoostingClassifier(n_estimators=50, random_state=0),
-        {"learning_rate": np.logspace(-4, 0, 10)},
+        make_pipeline(
+            StandardScaler(), GradientBoostingClassifier(n_estimators=5, random_state=0)
+        ),
+        {"gradientboostingclassifier__learning_rate": np.logspace(-2, 0, 5)},
+    ),
+    (
+        make_pipeline(StandardScaler(), SVC(random_state=0)),
+        {"svc__C": np.logspace(-1, 1, 3)},
     ),
-    (SVC(random_state=0), {"C": np.logspace(-2, 7, 10)}),
 ]
 
-names = [get_name(e) for e, g in classifiers]
+names = [get_name(e).replace("StandardScaler + ", "") for e, _ in classifiers]
 
 n_samples = 100
 datasets = [
@@ -107,15 +127,14 @@ def get_name(estimator):
     nrows=len(datasets), ncols=len(classifiers) + 1, figsize=(21, 9)
 )
 
-cm = plt.cm.PiYG
+cm_piyg = plt.cm.PiYG
 cm_bright = ListedColormap(["#b30065", "#178000"])
 
 # iterate over datasets
 for ds_cnt, (X, y) in enumerate(datasets):
-    print("\ndataset %d\n---------" % ds_cnt)
+    print(f"\ndataset {ds_cnt}\n---------")
 
-    # preprocess dataset, split into training and test part
-    X = StandardScaler().fit_transform(X)
+    # split into training and test part
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.5, random_state=42
     )
@@ -148,18 +167,18 @@ def get_name(estimator):
         with ignore_warnings(category=ConvergenceWarning):
             clf.fit(X_train, y_train)
         score = clf.score(X_test, y_test)
-        print("%s: %.2f" % (name, score))
+        print(f"{name}: {score:.2f}")
 
         # plot the decision boundary. For that, we will assign a color to each
         # point in the mesh [x_min, x_max]*[y_min, y_max].
         if hasattr(clf, "decision_function"):
-            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+            Z = clf.decision_function(np.column_stack([xx.ravel(), yy.ravel()]))
         else:
-            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
+            Z = clf.predict_proba(np.column_stack([xx.ravel(), yy.ravel()]))[:, 1]
 
         # put the result into a color plot
         Z = Z.reshape(xx.shape)
-        ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
+        ax.contourf(xx, yy, Z, cmap=cm_piyg, alpha=0.8)
 
         # plot the training points
         ax.scatter(
@@ -184,7 +203,7 @@ def get_name(estimator):
         ax.text(
             0.95,
             0.06,
-            ("%.2f" % score).lstrip("0"),
+            (f"{score:.2f}").lstrip("0"),
             size=15,
             bbox=dict(boxstyle="round", alpha=0.8, facecolor="white"),
             transform=ax.transAxes,
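The pattern that recurs throughout this diff is moving StandardScaler into the pipeline that GridSearchCV tunes, so scaling is refit on each training fold rather than applied to the whole dataset up front, with hyperparameters addressed through the step-name prefix (for example logisticregression__C). Below is a minimal, self-contained sketch of that pattern; the make_classification toy data and the tiny parameter grid are assumptions chosen only to keep the sketch short, not values from the commit.

# Sketch of the pipeline-inside-GridSearchCV pattern adopted in the example
# above; the dataset and grid here are illustrative only.
import numpy as np

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# The scaler is a pipeline step, so GridSearchCV refits it on every training
# fold; step parameters use the "<step name>__<parameter>" convention.
search = GridSearchCV(
    estimator=make_pipeline(StandardScaler(), LogisticRegression(random_state=0)),
    param_grid={"logisticregression__C": np.logspace(-1, 1, 3)},
)
search.fit(X_train, y_train)
print(search.best_params_, search.score(X_test, y_test))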
