
Commit 91ea20e

Pushing the docs to _pst_preview/ for branch: new_web_theme, commit 738c089c6109a435af6571121bbcf589e0337dc2
1 parent 9fa121e commit 91ea20e


1,453 files changed (+10,933 / -9,770 lines)


_pst_preview/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 2989f2cef33afd6b3d0497449fd13634
+config: 951d6f26a98cfac1c6be55856f555e79
 tags: 645f666f9bcd5a90fca523b33c5a78b7

_pst_preview/_downloads/21a6ff17ef2837fe1cd49e63223a368d/plot_unveil_tree_structure.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@
 # - ``weighted_n_node_samples[i]``: the weighted number of training samples
 #   reaching node ``i``
 # - ``value[i, j, k]``: the summary of the training samples that reached node i for
-#   class j and output k.
+#   output j and class k (for regression tree, class is set to 1).
 #
 # Using the arrays, we can traverse the tree structure to compute various
 # properties. Below, we will compute the depth of each node and whether or not
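
For context on the corrected comment above, a minimal sketch (not part of this commit) of the indexing convention it describes: ``tree_.value`` has shape ``(node_count, n_outputs, n_classes)``, so the second index selects the output and the third the class. The dataset and estimator settings below are illustrative only.

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    # Fit a small tree on an illustrative dataset
    X, y = load_iris(return_X_y=True)
    clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)

    # value[i, j, k] is the class-k summary for output j at node i;
    # for a single-output classifier the output axis has length 1.
    print(clf.tree_.value.shape)   # (node_count, 1, 3)
    print(clf.tree_.value[0, 0])   # per-class summary at the root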

_pst_preview/_downloads/898b30acf62919d918478efbe526195f/plot_digits_pipe.ipynb

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 },
 "outputs": [],
 "source": [
-"# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn import datasets\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\n\n# Define a pipeline to search for the best combination of PCA truncation\n# and classifier regularization.\npca = PCA()\n# Define a Standard Scaler to normalize inputs\nscaler = StandardScaler()\n\n# set the tolerance to a large value to make the example faster\nlogistic = LogisticRegression(max_iter=10000, tol=0.1)\npipe = Pipeline(steps=[(\"scaler\", scaler), (\"pca\", pca), (\"logistic\", logistic)])\n\nX_digits, y_digits = datasets.load_digits(return_X_y=True)\n# Parameters of pipelines can be set using '__' separated parameter names:\nparam_grid = {\n    \"pca__n_components\": [5, 15, 30, 45, 60],\n    \"logistic__C\": np.logspace(-4, 4, 4),\n}\nsearch = GridSearchCV(pipe, param_grid, n_jobs=2)\nsearch.fit(X_digits, y_digits)\nprint(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\nprint(search.best_params_)\n\n# Plot the PCA spectrum\npca.fit(X_digits)\n\nfig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))\nax0.plot(\n    np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, \"+\", linewidth=2\n)\nax0.set_ylabel(\"PCA explained variance ratio\")\n\nax0.axvline(\n    search.best_estimator_.named_steps[\"pca\"].n_components,\n    linestyle=\":\",\n    label=\"n_components chosen\",\n)\nax0.legend(prop=dict(size=12))\n\n# For each number of components, find the best classifier results\nresults = pd.DataFrame(search.cv_results_)\ncomponents_col = \"param_pca__n_components\"\nbest_clfs = results.groupby(components_col)[\n    [components_col, \"mean_test_score\", \"std_test_score\"]\n].apply(lambda g: g.nlargest(1, \"mean_test_score\"))\nax1.errorbar(\n    best_clfs[components_col],\n    best_clfs[\"mean_test_score\"],\n    yerr=best_clfs[\"std_test_score\"],\n)\nax1.set_ylabel(\"Classification accuracy (val)\")\nax1.set_xlabel(\"n_components\")\n\nplt.xlim(-1, 70)\n\nplt.tight_layout()\nplt.show()"
+"# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport polars as pl\n\nfrom sklearn import datasets\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\n\n# Define a pipeline to search for the best combination of PCA truncation\n# and classifier regularization.\npca = PCA()\n# Define a Standard Scaler to normalize inputs\nscaler = StandardScaler()\n\n# set the tolerance to a large value to make the example faster\nlogistic = LogisticRegression(max_iter=10000, tol=0.1)\npipe = Pipeline(steps=[(\"scaler\", scaler), (\"pca\", pca), (\"logistic\", logistic)])\n\nX_digits, y_digits = datasets.load_digits(return_X_y=True)\n# Parameters of pipelines can be set using '__' separated parameter names:\nparam_grid = {\n    \"pca__n_components\": [5, 15, 30, 45, 60],\n    \"logistic__C\": np.logspace(-4, 4, 4),\n}\nsearch = GridSearchCV(pipe, param_grid, n_jobs=2)\nsearch.fit(X_digits, y_digits)\nprint(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\nprint(search.best_params_)\n\n# Plot the PCA spectrum\npca.fit(X_digits)\n\nfig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))\nax0.plot(\n    np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, \"+\", linewidth=2\n)\nax0.set_ylabel(\"PCA explained variance ratio\")\n\nax0.axvline(\n    search.best_estimator_.named_steps[\"pca\"].n_components,\n    linestyle=\":\",\n    label=\"n_components chosen\",\n)\nax0.legend(prop=dict(size=12))\n\n# For each number of components, find the best classifier results\ncomponents_col = \"param_pca__n_components\"\nis_max_test_score = pl.col(\"mean_test_score\") == pl.col(\"mean_test_score\").max()\nbest_clfs = (\n    pl.LazyFrame(search.cv_results_)\n    .filter(is_max_test_score.over(components_col))\n    .unique(components_col)\n    .sort(components_col)\n    .collect()\n)\nax1.errorbar(\n    best_clfs[components_col],\n    best_clfs[\"mean_test_score\"],\n    yerr=best_clfs[\"std_test_score\"],\n)\nax1.set_ylabel(\"Classification accuracy (val)\")\nax1.set_xlabel(\"n_components\")\n\nplt.xlim(-1, 70)\n\nplt.tight_layout()\nplt.show()"
 ]
 }
 ],

_pst_preview/_downloads/b7e32fe54d613dce0d3c376377af061d/plot_outlier_detection_bench.py

Lines changed: 10 additions & 2 deletions
@@ -6,9 +6,11 @@
 This example compares two outlier detection algorithms, namely
 :ref:`local_outlier_factor` (LOF) and :ref:`isolation_forest` (IForest), on
 real-world datasets available in :class:`sklearn.datasets`. The goal is to show
-that different algorithms perform well on different datasets.
+that different algorithms perform well on different datasets and contrast their
+training speed and sensitivity to hyperparameters.

-The algorithms are trained in an outlier detection context:
+The algorithms are trained (without labels) on the whole dataset assumed to
+contain outliers.

 1. The ROC curves are computed using knowledge of the ground-truth labels
 and displayed using :class:`~sklearn.metrics.RocCurveDisplay`.
@@ -314,6 +316,12 @@ def fit_predict(estimator, X):
 # datasets. The score for IForest is slightly better for the SA dataset and LOF
 # performs considerably better on the Ames housing dataset than IForest.
 #
+# Recall however that Isolation Forest tends to train much faster than LOF on
+# datasets with a large number of samples. LOF needs to compute pairwise
+# distances to find nearest neighbors, which has a quadratic complexity with respect
+# to the number of observations. This can make this method prohibitive on large
+# datasets.
+#
 # Ablation study
 # ==============
 #
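
The added paragraph about training speed can be checked empirically. The following sketch (not part of this commit; the synthetic dataset and sample sizes are chosen arbitrarily) times both estimators as the sample size grows.

    from time import perf_counter

    import numpy as np
    from sklearn.ensemble import IsolationForest
    from sklearn.neighbors import LocalOutlierFactor

    rng = np.random.RandomState(42)
    for n_samples in (2_000, 10_000, 50_000):
        X = rng.randn(n_samples, 10)
        for model in (IsolationForest(random_state=42), LocalOutlierFactor(n_neighbors=20)):
            tic = perf_counter()
            model.fit(X)  # unsupervised fit, no labels
            name = model.__class__.__name__
            print(f"{name:>20}  n={n_samples:>6}  fit time: {perf_counter() - tic:.2f}s")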

_pst_preview/_downloads/ba89a400c6902f85c10199ff86947d23/plot_digits_pipe.py

Lines changed: 9 additions & 5 deletions
@@ -16,7 +16,7 @@

 import matplotlib.pyplot as plt
 import numpy as np
-import pandas as pd
+import polars as pl

 from sklearn import datasets
 from sklearn.decomposition import PCA
@@ -63,11 +63,15 @@
 ax0.legend(prop=dict(size=12))

 # For each number of components, find the best classifier results
-results = pd.DataFrame(search.cv_results_)
 components_col = "param_pca__n_components"
-best_clfs = results.groupby(components_col)[
-    [components_col, "mean_test_score", "std_test_score"]
-].apply(lambda g: g.nlargest(1, "mean_test_score"))
+is_max_test_score = pl.col("mean_test_score") == pl.col("mean_test_score").max()
+best_clfs = (
+    pl.LazyFrame(search.cv_results_)
+    .filter(is_max_test_score.over(components_col))
+    .unique(components_col)
+    .sort(components_col)
+    .collect()
+)
 ax1.errorbar(
     best_clfs[components_col],
     best_clfs["mean_test_score"],
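
To make the new polars expression easier to follow, here is a small standalone sketch (not part of this commit) of the same pattern on a hand-made frame: rows are kept when their score equals the per-group maximum computed with a window (``over``), then deduplicated and sorted per group.

    import polars as pl

    df = pl.DataFrame(
        {
            "param_pca__n_components": [5, 5, 15, 15],
            "mean_test_score": [0.80, 0.85, 0.90, 0.88],
        }
    )
    is_max = pl.col("mean_test_score") == pl.col("mean_test_score").max()
    best = (
        df.lazy()
        .filter(is_max.over("param_pca__n_components"))  # keep per-group maxima
        .unique("param_pca__n_components")               # one row per group
        .sort("param_pca__n_components")
        .collect()
    )
    print(best)  # expected: n_components 5 -> 0.85, 15 -> 0.90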

_pst_preview/_downloads/eacb6a63c887dafcff02b3cee64854ef/plot_outlier_detection_bench.ipynb

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Evaluation of outlier detection estimators\n\nThis example compares two outlier detection algorithms, namely\n`local_outlier_factor` (LOF) and `isolation_forest` (IForest), on\nreal-world datasets available in :class:`sklearn.datasets`. The goal is to show\nthat different algorithms perform well on different datasets.\n\nThe algorithms are trained in an outlier detection context:\n\n1. The ROC curves are computed using knowledge of the ground-truth labels\nand displayed using :class:`~sklearn.metrics.RocCurveDisplay`.\n\n2. The performance is assessed in terms of the ROC-AUC.\n"
+"\n# Evaluation of outlier detection estimators\n\nThis example compares two outlier detection algorithms, namely\n`local_outlier_factor` (LOF) and `isolation_forest` (IForest), on\nreal-world datasets available in :class:`sklearn.datasets`. The goal is to show\nthat different algorithms perform well on different datasets and contrast their\ntraining speed and sensitivity to hyperparameters.\n\nThe algorithms are trained (without labels) on the whole dataset assumed to\ncontain outliers.\n\n1. The ROC curves are computed using knowledge of the ground-truth labels\nand displayed using :class:`~sklearn.metrics.RocCurveDisplay`.\n\n2. The performance is assessed in terms of the ROC-AUC.\n"
 ]
 },
 {
@@ -217,7 +217,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"We observe that once the number of neighbors is tuned, LOF and IForest perform\nsimilarly in terms of ROC AUC for the forestcover and cardiotocography\ndatasets. The score for IForest is slightly better for the SA dataset and LOF\nperforms considerably better on the Ames housing dataset than IForest.\n\n## Ablation study\n\nIn this section we explore the impact of the hyperparameter `n_neighbors` and\nthe choice of scaling the numerical variables on the LOF model. Here we use\nthe `covtype_dataset` dataset as the binary encoded categories introduce\na natural scale of euclidean distances between 0 and 1. We then want a scaling\nmethod to avoid granting a privilege to non-binary features and that is robust\nenough to outliers so that the task of finding them does not become too\ndifficult.\n\n"
+"We observe that once the number of neighbors is tuned, LOF and IForest perform\nsimilarly in terms of ROC AUC for the forestcover and cardiotocography\ndatasets. The score for IForest is slightly better for the SA dataset and LOF\nperforms considerably better on the Ames housing dataset than IForest.\n\nRecall however that Isolation Forest tends to train much faster than LOF on\ndatasets with a large number of samples. LOF needs to compute pairwise\ndistances to find nearest neighbors, which has a quadratic complexity with respect\nto the number of observations. This can make this method prohibitive on large\ndatasets.\n\n## Ablation study\n\nIn this section we explore the impact of the hyperparameter `n_neighbors` and\nthe choice of scaling the numerical variables on the LOF model. Here we use\nthe `covtype_dataset` dataset as the binary encoded categories introduce\na natural scale of euclidean distances between 0 and 1. We then want a scaling\nmethod to avoid granting a privilege to non-binary features and that is robust\nenough to outliers so that the task of finding them does not become too\ndifficult.\n\n"
 ]
 },
 {

_pst_preview/_downloads/f7a387851c5762610f4e8197e52bbbca/plot_unveil_tree_structure.ipynb

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Tree structure\n\nThe decision classifier has an attribute called ``tree_`` which allows access\nto low level attributes such as ``node_count``, the total number of nodes,\nand ``max_depth``, the maximal depth of the tree. The\n``tree_.compute_node_depths()`` method computes the depth of each node in the\ntree. `tree_` also stores the entire binary tree structure, represented as a\nnumber of parallel arrays. The i-th element of each array holds information\nabout the node ``i``. Node 0 is the tree's root. Some of the arrays only\napply to either leaves or split nodes. In this case the values of the nodes\nof the other type is arbitrary. For example, the arrays ``feature`` and\n``threshold`` only apply to split nodes. The values for leaf nodes in these\narrays are therefore arbitrary.\n\nAmong these arrays, we have:\n\n - ``children_left[i]``: id of the left child of node ``i`` or -1 if leaf\n   node\n - ``children_right[i]``: id of the right child of node ``i`` or -1 if leaf\n   node\n - ``feature[i]``: feature used for splitting node ``i``\n - ``threshold[i]``: threshold value at node ``i``\n - ``n_node_samples[i]``: the number of training samples reaching node\n   ``i``\n - ``impurity[i]``: the impurity at node ``i``\n - ``weighted_n_node_samples[i]``: the weighted number of training samples\n   reaching node ``i``\n - ``value[i, j, k]``: the summary of the training samples that reached node i for\n   class j and output k.\n\nUsing the arrays, we can traverse the tree structure to compute various\nproperties. Below, we will compute the depth of each node and whether or not\nit is a leaf.\n\n"
+"## Tree structure\n\nThe decision classifier has an attribute called ``tree_`` which allows access\nto low level attributes such as ``node_count``, the total number of nodes,\nand ``max_depth``, the maximal depth of the tree. The\n``tree_.compute_node_depths()`` method computes the depth of each node in the\ntree. `tree_` also stores the entire binary tree structure, represented as a\nnumber of parallel arrays. The i-th element of each array holds information\nabout the node ``i``. Node 0 is the tree's root. Some of the arrays only\napply to either leaves or split nodes. In this case the values of the nodes\nof the other type is arbitrary. For example, the arrays ``feature`` and\n``threshold`` only apply to split nodes. The values for leaf nodes in these\narrays are therefore arbitrary.\n\nAmong these arrays, we have:\n\n - ``children_left[i]``: id of the left child of node ``i`` or -1 if leaf\n   node\n - ``children_right[i]``: id of the right child of node ``i`` or -1 if leaf\n   node\n - ``feature[i]``: feature used for splitting node ``i``\n - ``threshold[i]``: threshold value at node ``i``\n - ``n_node_samples[i]``: the number of training samples reaching node\n   ``i``\n - ``impurity[i]``: the impurity at node ``i``\n - ``weighted_n_node_samples[i]``: the weighted number of training samples\n   reaching node ``i``\n - ``value[i, j, k]``: the summary of the training samples that reached node i for\n   output j and class k (for regression tree, class is set to 1).\n\nUsing the arrays, we can traverse the tree structure to compute various\nproperties. Below, we will compute the depth of each node and whether or not\nit is a leaf.\n\n"
 ]
 },
 {
