Commit 0f831d1

Pushing the docs to dev/ for branch: main, commit d09e1d72399a7ed558cf7ced1f5a26caf674e3fc
1 parent 1142b02 commit 0f831d1

1,236 files changed: +4540 / -4521 lines changed


dev/_downloads/3ed102fa8211c8d36f2331f0c5e1dcef/plot_model_complexity_influence.ipynb

Lines changed: 4 additions & 4 deletions
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Authors: Eustache Diemert <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n# Guillaume Lemaitre <[email protected]>\n# License: BSD 3 clause\n\nimport time\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import NuSVR\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.metrics import hamming_loss\n\n\n# Initialize random generator\nnp.random.seed(0)"
+"# Authors: Eustache Diemert <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n# Guillaume Lemaitre <[email protected]>\n# License: BSD 3 clause\n\nimport time\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import NuSVR\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.metrics import hamming_loss\n\n# Initialize random generator\nnp.random.seed(0)"
 ]
 },
 {
@@ -44,7 +44,7 @@
 },
 "outputs": [],
 "source": [
-"def generate_data(case):\n \"\"\"Generate regression/classification data.\"\"\"\n if case == \"regression\":\n X, y = datasets.load_diabetes(return_X_y=True)\n elif case == \"classification\":\n X, y = datasets.fetch_20newsgroups_vectorized(subset=\"all\", return_X_y=True)\n X, y = shuffle(X, y)\n offset = int(X.shape[0] * 0.8)\n X_train, y_train = X[:offset], y[:offset]\n X_test, y_test = X[offset:], y[offset:]\n\n data = {\"X_train\": X_train, \"X_test\": X_test, \"y_train\": y_train, \"y_test\": y_test}\n return data\n\n\nregression_data = generate_data(\"regression\")\nclassification_data = generate_data(\"classification\")"
+"def generate_data(case):\n \"\"\"Generate regression/classification data.\"\"\"\n if case == \"regression\":\n X, y = datasets.load_diabetes(return_X_y=True)\n train_size = 0.8\n elif case == \"classification\":\n X, y = datasets.fetch_20newsgroups_vectorized(subset=\"all\", return_X_y=True)\n train_size = 0.4 # to make the example run faster\n\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=train_size, random_state=0\n )\n\n data = {\"X_train\": X_train, \"X_test\": X_test, \"y_train\": y_train, \"y_test\": y_test}\n return data\n\n\nregression_data = generate_data(\"regression\")\nclassification_data = generate_data(\"classification\")"
 ]
 },
 {
@@ -80,7 +80,7 @@
 },
 "outputs": [],
 "source": [
-"def _count_nonzero_coefficients(estimator):\n a = estimator.coef_.toarray()\n return np.count_nonzero(a)\n\n\nconfigurations = [\n {\n \"estimator\": SGDClassifier,\n \"tuned_params\": {\n \"penalty\": \"elasticnet\",\n \"alpha\": 0.001,\n \"loss\": \"modified_huber\",\n \"fit_intercept\": True,\n \"tol\": 1e-3,\n },\n \"changing_param\": \"l1_ratio\",\n \"changing_param_values\": [0.25, 0.5, 0.75, 0.9],\n \"complexity_label\": \"non_zero coefficients\",\n \"complexity_computer\": _count_nonzero_coefficients,\n \"prediction_performance_computer\": hamming_loss,\n \"prediction_performance_label\": \"Hamming Loss (Misclassification Ratio)\",\n \"postfit_hook\": lambda x: x.sparsify(),\n \"data\": classification_data,\n \"n_samples\": 30,\n },\n {\n \"estimator\": NuSVR,\n \"tuned_params\": {\"C\": 1e3, \"gamma\": 2 ** -15},\n \"changing_param\": \"nu\",\n \"changing_param_values\": [0.1, 0.25, 0.5, 0.75, 0.9],\n \"complexity_label\": \"n_support_vectors\",\n \"complexity_computer\": lambda x: len(x.support_vectors_),\n \"data\": regression_data,\n \"postfit_hook\": lambda x: x,\n \"prediction_performance_computer\": mean_squared_error,\n \"prediction_performance_label\": \"MSE\",\n \"n_samples\": 30,\n },\n {\n \"estimator\": GradientBoostingRegressor,\n \"tuned_params\": {\"loss\": \"squared_error\"},\n \"changing_param\": \"n_estimators\",\n \"changing_param_values\": [10, 50, 100, 200, 500],\n \"complexity_label\": \"n_trees\",\n \"complexity_computer\": lambda x: x.n_estimators,\n \"data\": regression_data,\n \"postfit_hook\": lambda x: x,\n \"prediction_performance_computer\": mean_squared_error,\n \"prediction_performance_label\": \"MSE\",\n \"n_samples\": 30,\n },\n]"
+"def _count_nonzero_coefficients(estimator):\n a = estimator.coef_.toarray()\n return np.count_nonzero(a)\n\n\nconfigurations = [\n {\n \"estimator\": SGDClassifier,\n \"tuned_params\": {\n \"penalty\": \"elasticnet\",\n \"alpha\": 0.001,\n \"loss\": \"modified_huber\",\n \"fit_intercept\": True,\n \"tol\": 1e-3,\n },\n \"changing_param\": \"l1_ratio\",\n \"changing_param_values\": [0.25, 0.5, 0.75, 0.9],\n \"complexity_label\": \"non_zero coefficients\",\n \"complexity_computer\": _count_nonzero_coefficients,\n \"prediction_performance_computer\": hamming_loss,\n \"prediction_performance_label\": \"Hamming Loss (Misclassification Ratio)\",\n \"postfit_hook\": lambda x: x.sparsify(),\n \"data\": classification_data,\n \"n_samples\": 5,\n },\n {\n \"estimator\": NuSVR,\n \"tuned_params\": {\"C\": 1e3, \"gamma\": 2 ** -15},\n \"changing_param\": \"nu\",\n \"changing_param_values\": [0.05, 0.1, 0.2, 0.35, 0.5],\n \"complexity_label\": \"n_support_vectors\",\n \"complexity_computer\": lambda x: len(x.support_vectors_),\n \"data\": regression_data,\n \"postfit_hook\": lambda x: x,\n \"prediction_performance_computer\": mean_squared_error,\n \"prediction_performance_label\": \"MSE\",\n \"n_samples\": 15,\n },\n {\n \"estimator\": GradientBoostingRegressor,\n \"tuned_params\": {\n \"loss\": \"squared_error\",\n \"learning_rate\": 0.05,\n \"max_depth\": 2,\n },\n \"changing_param\": \"n_estimators\",\n \"changing_param_values\": [10, 25, 50, 75, 100],\n \"complexity_label\": \"n_trees\",\n \"complexity_computer\": lambda x: x.n_estimators,\n \"data\": regression_data,\n \"postfit_hook\": lambda x: x,\n \"prediction_performance_computer\": mean_squared_error,\n \"prediction_performance_label\": \"MSE\",\n \"n_samples\": 15,\n },\n]"
 ]
 },
 {
@@ -98,7 +98,7 @@
 },
 "outputs": [],
 "source": [
-"def plot_influence(conf, mse_values, prediction_times, complexities):\n \"\"\"\n Plot influence of model complexity on both accuracy and latency.\n \"\"\"\n\n fig = plt.figure()\n fig.subplots_adjust(right=0.75)\n\n # first axes (prediction error)\n ax1 = fig.add_subplot(111)\n line1 = ax1.plot(complexities, mse_values, c=\"tab:blue\", ls=\"-\")[0]\n ax1.set_xlabel(\"Model Complexity (%s)\" % conf[\"complexity_label\"])\n y1_label = conf[\"prediction_performance_label\"]\n ax1.set_ylabel(y1_label)\n\n ax1.spines[\"left\"].set_color(line1.get_color())\n ax1.yaxis.label.set_color(line1.get_color())\n ax1.tick_params(axis=\"y\", colors=line1.get_color())\n\n # second axes (latency)\n ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)\n line2 = ax2.plot(complexities, prediction_times, c=\"tab:orange\", ls=\"-\")[0]\n ax2.yaxis.tick_right()\n ax2.yaxis.set_label_position(\"right\")\n y2_label = \"Time (s)\"\n ax2.set_ylabel(y2_label)\n ax1.spines[\"right\"].set_color(line2.get_color())\n ax2.yaxis.label.set_color(line2.get_color())\n ax2.tick_params(axis=\"y\", colors=line2.get_color())\n\n plt.legend((line1, line2), (\"prediction error\", \"latency\"), loc=\"upper right\")\n\n plt.title(\n \"Influence of varying '%s' on %s\"\n % (conf[\"changing_param\"], conf[\"estimator\"].__name__)\n )\n\n\nfor conf in configurations:\n prediction_performances, prediction_times, complexities = benchmark_influence(conf)\n plot_influence(conf, prediction_performances, prediction_times, complexities)\nplt.show()"
+"def plot_influence(conf, mse_values, prediction_times, complexities):\n \"\"\"\n Plot influence of model complexity on both accuracy and latency.\n \"\"\"\n\n fig = plt.figure()\n fig.subplots_adjust(right=0.75)\n\n # first axes (prediction error)\n ax1 = fig.add_subplot(111)\n line1 = ax1.plot(complexities, mse_values, c=\"tab:blue\", ls=\"-\")[0]\n ax1.set_xlabel(\"Model Complexity (%s)\" % conf[\"complexity_label\"])\n y1_label = conf[\"prediction_performance_label\"]\n ax1.set_ylabel(y1_label)\n\n ax1.spines[\"left\"].set_color(line1.get_color())\n ax1.yaxis.label.set_color(line1.get_color())\n ax1.tick_params(axis=\"y\", colors=line1.get_color())\n\n # second axes (latency)\n ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)\n line2 = ax2.plot(complexities, prediction_times, c=\"tab:orange\", ls=\"-\")[0]\n ax2.yaxis.tick_right()\n ax2.yaxis.set_label_position(\"right\")\n y2_label = \"Time (s)\"\n ax2.set_ylabel(y2_label)\n ax1.spines[\"right\"].set_color(line2.get_color())\n ax2.yaxis.label.set_color(line2.get_color())\n ax2.tick_params(axis=\"y\", colors=line2.get_color())\n\n plt.legend(\n (line1, line2), (\"prediction error\", \"prediction latency\"), loc=\"upper right\"\n )\n\n plt.title(\n \"Influence of varying '%s' on %s\"\n % (conf[\"changing_param\"], conf[\"estimator\"].__name__)\n )\n\n\nfor conf in configurations:\n prediction_performances, prediction_times, complexities = benchmark_influence(conf)\n plot_influence(conf, prediction_performances, prediction_times, complexities)\nplt.show()"
 ]
 },
 {

dev/_downloads/54c7ea3b3671861fbfb2161a6f0ab6d0/plot_nca_classification.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis\nfrom sklearn.pipeline import Pipeline\n\n\nn_neighbors = 1\n\ndataset = datasets.load_iris()\nX, y = dataset.data, dataset.target\n\n# we only take two features. We could avoid this ugly\n# slicing by using a two-dim dataset\nX = X[:, [0, 2]]\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, stratify=y, test_size=0.7, random_state=42\n)\n\nh = 0.01 # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap([\"#FFAAAA\", \"#AAFFAA\", \"#AAAAFF\"])\ncmap_bold = ListedColormap([\"#FF0000\", \"#00FF00\", \"#0000FF\"])\n\nnames = [\"KNN\", \"NCA, KNN\"]\n\nclassifiers = [\n Pipeline(\n [\n (\"scaler\", StandardScaler()),\n (\"knn\", KNeighborsClassifier(n_neighbors=n_neighbors)),\n ]\n ),\n Pipeline(\n [\n (\"scaler\", StandardScaler()),\n (\"nca\", NeighborhoodComponentsAnalysis()),\n (\"knn\", KNeighborsClassifier(n_neighbors=n_neighbors)),\n ]\n ),\n]\n\nx_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\ny_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\nxx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n\nfor name, clf in zip(names, classifiers):\n\n clf.fit(X_train, y_train)\n score = clf.score(X_test, y_test)\n\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.figure()\n plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=0.8)\n\n # Plot also the training and testing points\n plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor=\"k\", s=20)\n plt.xlim(xx.min(), xx.max())\n plt.ylim(yy.min(), yy.max())\n plt.title(\"{} (k = {})\".format(name, n_neighbors))\n plt.text(\n 0.9,\n 0.1,\n \"{:.2f}\".format(score),\n size=15,\n ha=\"center\",\n va=\"center\",\n transform=plt.gca().transAxes,\n )\n\nplt.show()"
+"# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis\nfrom sklearn.pipeline import Pipeline\n\n\nn_neighbors = 1\n\ndataset = datasets.load_iris()\nX, y = dataset.data, dataset.target\n\n# we only take two features. We could avoid this ugly\n# slicing by using a two-dim dataset\nX = X[:, [0, 2]]\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, stratify=y, test_size=0.7, random_state=42\n)\n\nh = 0.05 # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap([\"#FFAAAA\", \"#AAFFAA\", \"#AAAAFF\"])\ncmap_bold = ListedColormap([\"#FF0000\", \"#00FF00\", \"#0000FF\"])\n\nnames = [\"KNN\", \"NCA, KNN\"]\n\nclassifiers = [\n Pipeline(\n [\n (\"scaler\", StandardScaler()),\n (\"knn\", KNeighborsClassifier(n_neighbors=n_neighbors)),\n ]\n ),\n Pipeline(\n [\n (\"scaler\", StandardScaler()),\n (\"nca\", NeighborhoodComponentsAnalysis()),\n (\"knn\", KNeighborsClassifier(n_neighbors=n_neighbors)),\n ]\n ),\n]\n\nx_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\ny_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\nxx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n\nfor name, clf in zip(names, classifiers):\n\n clf.fit(X_train, y_train)\n score = clf.score(X_test, y_test)\n\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.figure()\n plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=0.8)\n\n # Plot also the training and testing points\n plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor=\"k\", s=20)\n plt.xlim(xx.min(), xx.max())\n plt.ylim(yy.min(), yy.max())\n plt.title(\"{} (k = {})\".format(name, n_neighbors))\n plt.text(\n 0.9,\n 0.1,\n \"{:.2f}\".format(score),\n size=15,\n ha=\"center\",\n va=\"center\",\n transform=plt.gca().transAxes,\n )\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/b7792f6c26a74369f67bbe6f9ac41edf/plot_nca_classification.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
     X, y, stratify=y, test_size=0.7, random_state=42
 )

-h = 0.01  # step size in the mesh
+h = 0.05  # step size in the mesh

 # Create color maps
 cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
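
A quick note on this change: h sets the spacing of the mesh on which the decision boundary is evaluated, so a coarser step means far fewer clf.predict calls per plot. A minimal sketch with hypothetical axis limits (not the actual iris values) illustrates the scale of the saving:

import numpy as np

# Hypothetical plotting window; the real limits come from the two iris features.
x_min, x_max = 3.0, 9.0
y_min, y_max = 0.0, 8.0

for h in (0.01, 0.05):
    # Same meshgrid construction as in the example, on the assumed window.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    print("h=%.2f: %d grid points to classify" % (h, xx.size))

# Going from h=0.01 to h=0.05 shrinks the mesh by roughly a factor of 25,
# so the decision-boundary plot renders much faster with little visual loss.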

dev/_downloads/ddd79923ba48c7f71fb17697baa1a22b/plot_model_complexity_influence.py

Lines changed: 20 additions & 14 deletions
@@ -42,14 +42,13 @@
 import matplotlib.pyplot as plt

 from sklearn import datasets
-from sklearn.utils import shuffle
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error
 from sklearn.svm import NuSVR
 from sklearn.ensemble import GradientBoostingRegressor
 from sklearn.linear_model import SGDClassifier
 from sklearn.metrics import hamming_loss

-
 # Initialize random generator
 np.random.seed(0)

@@ -72,12 +71,14 @@ def generate_data(case):
     """Generate regression/classification data."""
     if case == "regression":
         X, y = datasets.load_diabetes(return_X_y=True)
+        train_size = 0.8
     elif case == "classification":
         X, y = datasets.fetch_20newsgroups_vectorized(subset="all", return_X_y=True)
-        X, y = shuffle(X, y)
-        offset = int(X.shape[0] * 0.8)
-        X_train, y_train = X[:offset], y[:offset]
-        X_test, y_test = X[offset:], y[offset:]
+        train_size = 0.4  # to make the example run faster
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, train_size=train_size, random_state=0
+    )

     data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
     return data
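
As an aside on the hunk above: train_test_split shuffles and splits in a single call and takes the train fraction as an explicit parameter, which is why the per-case train_size values (0.8 and 0.4) replace the removed shuffle-then-slice code. A minimal sketch of the two patterns on toy arrays (nothing below is taken from the example's datasets):

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

# Old pattern: shuffle, then slice off the first 80% as the training set.
X_s, y_s = shuffle(X, y, random_state=0)
offset = int(X_s.shape[0] * 0.8)
X_train_old, y_train_old = X_s[:offset], y_s[:offset]
X_test_old, y_test_old = X_s[offset:], y_s[offset:]

# New pattern: one call that shuffles and splits with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0
)

print(X_train.shape, X_test.shape)  # (8, 2) (2, 2)

Both patterns give the same 80/20 proportions, though not necessarily the identical partition, since the two shuffles draw from different random streams.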
@@ -174,33 +175,37 @@ def _count_nonzero_coefficients(estimator):
         "prediction_performance_label": "Hamming Loss (Misclassification Ratio)",
         "postfit_hook": lambda x: x.sparsify(),
         "data": classification_data,
-        "n_samples": 30,
+        "n_samples": 5,
     },
     {
         "estimator": NuSVR,
         "tuned_params": {"C": 1e3, "gamma": 2 ** -15},
         "changing_param": "nu",
-        "changing_param_values": [0.1, 0.25, 0.5, 0.75, 0.9],
+        "changing_param_values": [0.05, 0.1, 0.2, 0.35, 0.5],
         "complexity_label": "n_support_vectors",
         "complexity_computer": lambda x: len(x.support_vectors_),
         "data": regression_data,
         "postfit_hook": lambda x: x,
         "prediction_performance_computer": mean_squared_error,
         "prediction_performance_label": "MSE",
-        "n_samples": 30,
+        "n_samples": 15,
     },
     {
         "estimator": GradientBoostingRegressor,
-        "tuned_params": {"loss": "squared_error"},
+        "tuned_params": {
+            "loss": "squared_error",
+            "learning_rate": 0.05,
+            "max_depth": 2,
+        },
         "changing_param": "n_estimators",
-        "changing_param_values": [10, 50, 100, 200, 500],
+        "changing_param_values": [10, 25, 50, 75, 100],
         "complexity_label": "n_trees",
         "complexity_computer": lambda x: x.n_estimators,
         "data": regression_data,
         "postfit_hook": lambda x: x,
         "prediction_performance_computer": mean_squared_error,
         "prediction_performance_label": "MSE",
-        "n_samples": 30,
+        "n_samples": 15,
     },
 ]
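
On the smaller n_samples values above: judging from the name and from the latency curves the example plots, n_samples appears to set how many repeated predict() calls the benchmark helper (defined elsewhere in the file, not shown in this diff) averages when timing prediction, so dropping it from 30 to 5 or 15 mainly shortens that timing loop. A rough, self-contained sketch of such an averaged latency measurement on toy data (the SGDClassifier settings mirror the configuration above; the data and n_samples value are made up for illustration):

import time
import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
X_train, y_train = rng.rand(200, 50), rng.randint(0, 2, 200)
X_test = rng.rand(50, 50)

est = SGDClassifier(penalty="elasticnet", alpha=0.001, l1_ratio=0.5, tol=1e-3)
est.fit(X_train, y_train)

n_samples = 5  # number of timed prediction passes to average
start = time.time()
for _ in range(n_samples):
    est.predict(X_test)
latency = (time.time() - start) / n_samples
print("mean prediction latency over %d passes: %.6f s" % (n_samples, latency))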

@@ -255,7 +260,9 @@ def plot_influence(conf, mse_values, prediction_times, complexities):
     ax2.yaxis.label.set_color(line2.get_color())
     ax2.tick_params(axis="y", colors=line2.get_color())

-    plt.legend((line1, line2), ("prediction error", "latency"), loc="upper right")
+    plt.legend(
+        (line1, line2), ("prediction error", "prediction latency"), loc="upper right"
+    )

     plt.title(
         "Influence of varying '%s' on %s"
@@ -268,7 +275,6 @@ def plot_influence(conf, mse_values, prediction_times, complexities):
     plot_influence(conf, prediction_performances, prediction_times, complexities)
 plt.show()

-
 ##############################################################################
 # Conclusion
 # ----------

dev/_downloads/scikit-learn-docs.zip

2.74 KB
Binary file not shown.
