
Commit 1a5bd51 (parent: 8c1571e)

Pushing the docs to dev/ for branch: main, commit 660e43aec705e0fb0efd23533ead6b4ed21540e6

1,225 files changed: +4,562 / −4,581 lines


dev/_downloads/3ed102fa8211c8d36f2331f0c5e1dcef/plot_model_complexity_influence.ipynb

Lines changed: 4 additions & 4 deletions
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Authors: Eustache Diemert <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n# Guillaume Lemaitre <[email protected]>\n# License: BSD 3 clause\n\nimport time\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import NuSVR\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.metrics import hamming_loss\n\n# Initialize random generator\nnp.random.seed(0)"
+"# Authors: Eustache Diemert <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n# Guillaume Lemaitre <[email protected]>\n# License: BSD 3 clause\n\nimport time\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import NuSVR\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.metrics import hamming_loss\n\n\n# Initialize random generator\nnp.random.seed(0)"
 ]
 },
 {
@@ -44,7 +44,7 @@
 },
 "outputs": [],
 "source": [
-"def generate_data(case):\n    \"\"\"Generate regression/classification data.\"\"\"\n    if case == \"regression\":\n        X, y = datasets.load_diabetes(return_X_y=True)\n        train_size = 0.8\n    elif case == \"classification\":\n        X, y = datasets.fetch_20newsgroups_vectorized(subset=\"all\", return_X_y=True)\n        train_size = 0.4  # to make the example run faster\n\n    X_train, X_test, y_train, y_test = train_test_split(\n        X, y, train_size=train_size, random_state=0\n    )\n\n    data = {\"X_train\": X_train, \"X_test\": X_test, \"y_train\": y_train, \"y_test\": y_test}\n    return data\n\n\nregression_data = generate_data(\"regression\")\nclassification_data = generate_data(\"classification\")"
+"def generate_data(case):\n    \"\"\"Generate regression/classification data.\"\"\"\n    if case == \"regression\":\n        X, y = datasets.load_diabetes(return_X_y=True)\n    elif case == \"classification\":\n        X, y = datasets.fetch_20newsgroups_vectorized(subset=\"all\", return_X_y=True)\n    X, y = shuffle(X, y)\n    offset = int(X.shape[0] * 0.8)\n    X_train, y_train = X[:offset], y[:offset]\n    X_test, y_test = X[offset:], y[offset:]\n\n    data = {\"X_train\": X_train, \"X_test\": X_test, \"y_train\": y_train, \"y_test\": y_test}\n    return data\n\n\nregression_data = generate_data(\"regression\")\nclassification_data = generate_data(\"classification\")"
 ]
 },
 {
@@ -80,7 +80,7 @@
 },
 "outputs": [],
 "source": [
-"def _count_nonzero_coefficients(estimator):\n    a = estimator.coef_.toarray()\n    return np.count_nonzero(a)\n\n\nconfigurations = [\n    {\n        \"estimator\": SGDClassifier,\n        \"tuned_params\": {\n            \"penalty\": \"elasticnet\",\n            \"alpha\": 0.001,\n            \"loss\": \"modified_huber\",\n            \"fit_intercept\": True,\n            \"tol\": 1e-3,\n        },\n        \"changing_param\": \"l1_ratio\",\n        \"changing_param_values\": [0.25, 0.5, 0.75, 0.9],\n        \"complexity_label\": \"non_zero coefficients\",\n        \"complexity_computer\": _count_nonzero_coefficients,\n        \"prediction_performance_computer\": hamming_loss,\n        \"prediction_performance_label\": \"Hamming Loss (Misclassification Ratio)\",\n        \"postfit_hook\": lambda x: x.sparsify(),\n        \"data\": classification_data,\n        \"n_samples\": 5,\n    },\n    {\n        \"estimator\": NuSVR,\n        \"tuned_params\": {\"C\": 1e3, \"gamma\": 2 ** -15},\n        \"changing_param\": \"nu\",\n        \"changing_param_values\": [0.05, 0.1, 0.2, 0.35, 0.5],\n        \"complexity_label\": \"n_support_vectors\",\n        \"complexity_computer\": lambda x: len(x.support_vectors_),\n        \"data\": regression_data,\n        \"postfit_hook\": lambda x: x,\n        \"prediction_performance_computer\": mean_squared_error,\n        \"prediction_performance_label\": \"MSE\",\n        \"n_samples\": 15,\n    },\n    {\n        \"estimator\": GradientBoostingRegressor,\n        \"tuned_params\": {\n            \"loss\": \"squared_error\",\n            \"learning_rate\": 0.05,\n            \"max_depth\": 2,\n        },\n        \"changing_param\": \"n_estimators\",\n        \"changing_param_values\": [10, 25, 50, 75, 100],\n        \"complexity_label\": \"n_trees\",\n        \"complexity_computer\": lambda x: x.n_estimators,\n        \"data\": regression_data,\n        \"postfit_hook\": lambda x: x,\n        \"prediction_performance_computer\": mean_squared_error,\n        \"prediction_performance_label\": \"MSE\",\n        \"n_samples\": 15,\n    },\n]"
+"def _count_nonzero_coefficients(estimator):\n    a = estimator.coef_.toarray()\n    return np.count_nonzero(a)\n\n\nconfigurations = [\n    {\n        \"estimator\": SGDClassifier,\n        \"tuned_params\": {\n            \"penalty\": \"elasticnet\",\n            \"alpha\": 0.001,\n            \"loss\": \"modified_huber\",\n            \"fit_intercept\": True,\n            \"tol\": 1e-3,\n        },\n        \"changing_param\": \"l1_ratio\",\n        \"changing_param_values\": [0.25, 0.5, 0.75, 0.9],\n        \"complexity_label\": \"non_zero coefficients\",\n        \"complexity_computer\": _count_nonzero_coefficients,\n        \"prediction_performance_computer\": hamming_loss,\n        \"prediction_performance_label\": \"Hamming Loss (Misclassification Ratio)\",\n        \"postfit_hook\": lambda x: x.sparsify(),\n        \"data\": classification_data,\n        \"n_samples\": 30,\n    },\n    {\n        \"estimator\": NuSVR,\n        \"tuned_params\": {\"C\": 1e3, \"gamma\": 2 ** -15},\n        \"changing_param\": \"nu\",\n        \"changing_param_values\": [0.1, 0.25, 0.5, 0.75, 0.9],\n        \"complexity_label\": \"n_support_vectors\",\n        \"complexity_computer\": lambda x: len(x.support_vectors_),\n        \"data\": regression_data,\n        \"postfit_hook\": lambda x: x,\n        \"prediction_performance_computer\": mean_squared_error,\n        \"prediction_performance_label\": \"MSE\",\n        \"n_samples\": 30,\n    },\n    {\n        \"estimator\": GradientBoostingRegressor,\n        \"tuned_params\": {\"loss\": \"squared_error\"},\n        \"changing_param\": \"n_estimators\",\n        \"changing_param_values\": [10, 50, 100, 200, 500],\n        \"complexity_label\": \"n_trees\",\n        \"complexity_computer\": lambda x: x.n_estimators,\n        \"data\": regression_data,\n        \"postfit_hook\": lambda x: x,\n        \"prediction_performance_computer\": mean_squared_error,\n        \"prediction_performance_label\": \"MSE\",\n        \"n_samples\": 30,\n    },\n]"
 ]
 },
 {
@@ -98,7 +98,7 @@
 },
 "outputs": [],
 "source": [
-"def plot_influence(conf, mse_values, prediction_times, complexities):\n    \"\"\"\n    Plot influence of model complexity on both accuracy and latency.\n    \"\"\"\n\n    fig = plt.figure()\n    fig.subplots_adjust(right=0.75)\n\n    # first axes (prediction error)\n    ax1 = fig.add_subplot(111)\n    line1 = ax1.plot(complexities, mse_values, c=\"tab:blue\", ls=\"-\")[0]\n    ax1.set_xlabel(\"Model Complexity (%s)\" % conf[\"complexity_label\"])\n    y1_label = conf[\"prediction_performance_label\"]\n    ax1.set_ylabel(y1_label)\n\n    ax1.spines[\"left\"].set_color(line1.get_color())\n    ax1.yaxis.label.set_color(line1.get_color())\n    ax1.tick_params(axis=\"y\", colors=line1.get_color())\n\n    # second axes (latency)\n    ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)\n    line2 = ax2.plot(complexities, prediction_times, c=\"tab:orange\", ls=\"-\")[0]\n    ax2.yaxis.tick_right()\n    ax2.yaxis.set_label_position(\"right\")\n    y2_label = \"Time (s)\"\n    ax2.set_ylabel(y2_label)\n    ax1.spines[\"right\"].set_color(line2.get_color())\n    ax2.yaxis.label.set_color(line2.get_color())\n    ax2.tick_params(axis=\"y\", colors=line2.get_color())\n\n    plt.legend(\n        (line1, line2), (\"prediction error\", \"prediction latency\"), loc=\"upper right\"\n    )\n\n    plt.title(\n        \"Influence of varying '%s' on %s\"\n        % (conf[\"changing_param\"], conf[\"estimator\"].__name__)\n    )\n\n\nfor conf in configurations:\n    prediction_performances, prediction_times, complexities = benchmark_influence(conf)\n    plot_influence(conf, prediction_performances, prediction_times, complexities)\nplt.show()"
+"def plot_influence(conf, mse_values, prediction_times, complexities):\n    \"\"\"\n    Plot influence of model complexity on both accuracy and latency.\n    \"\"\"\n\n    fig = plt.figure()\n    fig.subplots_adjust(right=0.75)\n\n    # first axes (prediction error)\n    ax1 = fig.add_subplot(111)\n    line1 = ax1.plot(complexities, mse_values, c=\"tab:blue\", ls=\"-\")[0]\n    ax1.set_xlabel(\"Model Complexity (%s)\" % conf[\"complexity_label\"])\n    y1_label = conf[\"prediction_performance_label\"]\n    ax1.set_ylabel(y1_label)\n\n    ax1.spines[\"left\"].set_color(line1.get_color())\n    ax1.yaxis.label.set_color(line1.get_color())\n    ax1.tick_params(axis=\"y\", colors=line1.get_color())\n\n    # second axes (latency)\n    ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)\n    line2 = ax2.plot(complexities, prediction_times, c=\"tab:orange\", ls=\"-\")[0]\n    ax2.yaxis.tick_right()\n    ax2.yaxis.set_label_position(\"right\")\n    y2_label = \"Time (s)\"\n    ax2.set_ylabel(y2_label)\n    ax1.spines[\"right\"].set_color(line2.get_color())\n    ax2.yaxis.label.set_color(line2.get_color())\n    ax2.tick_params(axis=\"y\", colors=line2.get_color())\n\n    plt.legend((line1, line2), (\"prediction error\", \"latency\"), loc=\"upper right\")\n\n    plt.title(\n        \"Influence of varying '%s' on %s\"\n        % (conf[\"changing_param\"], conf[\"estimator\"].__name__)\n    )\n\n\nfor conf in configurations:\n    prediction_performances, prediction_times, complexities = benchmark_influence(conf)\n    plot_influence(conf, prediction_performances, prediction_times, complexities)\nplt.show()"
 ]
 },
 {
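
Note on the change above: both the notebook and the script below swap train_test_split for an explicit shuffle followed by a fixed 80/20 slice. A minimal standalone sketch of that split pattern, using a small synthetic array instead of the example's diabetes/20newsgroups data so it runs instantly (the toy X and y below are illustrative assumptions, not part of the example):

import numpy as np
from sklearn.utils import shuffle

np.random.seed(0)  # the example seeds the global RNG instead of passing random_state

# Toy stand-in for the example's datasets
X = np.arange(20).reshape(10, 2)
y = np.arange(10)

# Same pattern as the updated generate_data(): shuffle, then an 80/20 slice
X, y = shuffle(X, y)
offset = int(X.shape[0] * 0.8)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

print(X_train.shape, X_test.shape)  # (8, 2) (2, 2)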

dev/_downloads/ddd79923ba48c7f71fb17697baa1a22b/plot_model_complexity_influence.py

Lines changed: 14 additions & 20 deletions
@@ -42,13 +42,14 @@
 import matplotlib.pyplot as plt
 
 from sklearn import datasets
-from sklearn.model_selection import train_test_split
+from sklearn.utils import shuffle
 from sklearn.metrics import mean_squared_error
 from sklearn.svm import NuSVR
 from sklearn.ensemble import GradientBoostingRegressor
 from sklearn.linear_model import SGDClassifier
 from sklearn.metrics import hamming_loss
 
+
 # Initialize random generator
 np.random.seed(0)
 
@@ -71,14 +72,12 @@ def generate_data(case):
     """Generate regression/classification data."""
     if case == "regression":
         X, y = datasets.load_diabetes(return_X_y=True)
-        train_size = 0.8
     elif case == "classification":
         X, y = datasets.fetch_20newsgroups_vectorized(subset="all", return_X_y=True)
-        train_size = 0.4  # to make the example run faster
-
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, train_size=train_size, random_state=0
-    )
+    X, y = shuffle(X, y)
+    offset = int(X.shape[0] * 0.8)
+    X_train, y_train = X[:offset], y[:offset]
+    X_test, y_test = X[offset:], y[offset:]
 
     data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
     return data
@@ -175,37 +174,33 @@ def _count_nonzero_coefficients(estimator):
         "prediction_performance_label": "Hamming Loss (Misclassification Ratio)",
         "postfit_hook": lambda x: x.sparsify(),
         "data": classification_data,
-        "n_samples": 5,
+        "n_samples": 30,
     },
     {
         "estimator": NuSVR,
         "tuned_params": {"C": 1e3, "gamma": 2 ** -15},
         "changing_param": "nu",
-        "changing_param_values": [0.05, 0.1, 0.2, 0.35, 0.5],
+        "changing_param_values": [0.1, 0.25, 0.5, 0.75, 0.9],
         "complexity_label": "n_support_vectors",
         "complexity_computer": lambda x: len(x.support_vectors_),
         "data": regression_data,
         "postfit_hook": lambda x: x,
         "prediction_performance_computer": mean_squared_error,
         "prediction_performance_label": "MSE",
-        "n_samples": 15,
+        "n_samples": 30,
     },
     {
         "estimator": GradientBoostingRegressor,
-        "tuned_params": {
-            "loss": "squared_error",
-            "learning_rate": 0.05,
-            "max_depth": 2,
-        },
+        "tuned_params": {"loss": "squared_error"},
         "changing_param": "n_estimators",
-        "changing_param_values": [10, 25, 50, 75, 100],
+        "changing_param_values": [10, 50, 100, 200, 500],
         "complexity_label": "n_trees",
         "complexity_computer": lambda x: x.n_estimators,
         "data": regression_data,
         "postfit_hook": lambda x: x,
         "prediction_performance_computer": mean_squared_error,
         "prediction_performance_label": "MSE",
-        "n_samples": 15,
+        "n_samples": 30,
     },
 ]
 
@@ -260,9 +255,7 @@ def plot_influence(conf, mse_values, prediction_times, complexities):
     ax2.yaxis.label.set_color(line2.get_color())
     ax2.tick_params(axis="y", colors=line2.get_color())
 
-    plt.legend(
-        (line1, line2), ("prediction error", "prediction latency"), loc="upper right"
-    )
+    plt.legend((line1, line2), ("prediction error", "latency"), loc="upper right")
 
     plt.title(
         "Influence of varying '%s' on %s"
@@ -275,6 +268,7 @@ def plot_influence(conf, mse_values, prediction_times, complexities):
     plot_influence(conf, prediction_performances, prediction_times, complexities)
 plt.show()
 
+
 ##############################################################################
 # Conclusion
 # ----------
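
Note on the configuration changes above: each "n_samples" entry (now 30) sets how many timed prediction runs are averaged per parameter value. The benchmark_influence helper that consumes these entries is not part of the hunks shown here, so the following is only a rough sketch of that timing pattern under stated assumptions (the name average_prediction_time and the use of time.perf_counter are illustrative, not code from the example):

import time
import numpy as np

def average_prediction_time(estimator, X_test, n_samples=30):
    """Average wall-clock duration of n_samples predict() calls on a fitted estimator (sketch only)."""
    durations = []
    for _ in range(n_samples):
        start = time.perf_counter()
        estimator.predict(X_test)
        durations.append(time.perf_counter() - start)
    return np.mean(durations)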

dev/_downloads/scikit-learn-docs.zip

3.72 KB
Binary file not shown.
