scikit-learn
diff --git a/‎1.4/.buildinfo
Lines changed: 1 addition & 1 deletion b/‎1.4/.buildinfo
Lines changed: 1 addition & 1 deletion
diff --git a/‎1.4/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
-2.73 KB b/‎1.4/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
-2.73 KB
diff --git a/‎1.4/_downloads/18eb95af29bd5554020a8428b3ceac54/plot_cluster_iris.ipynb
Lines changed: 2 additions & 2 deletions b/‎1.4/_downloads/18eb95af29bd5554020a8428b3ceac54/plot_cluster_iris.ipynb
Lines changed: 2 additions & 2 deletions
diff --git a/‎1.4/_downloads/3c3c738275484acc54821615bf72894a/plot_permutation_importance.py
Lines changed: 2 additions & 2 deletions b/‎1.4/_downloads/3c3c738275484acc54821615bf72894a/plot_permutation_importance.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎1.4/_downloads/48802d222a21d57b36b5e7a61adb770c/plot_cv_digits.ipynb
Lines changed: 0 additions & 43 deletions b/‎1.4/_downloads/48802d222a21d57b36b5e7a61adb770c/plot_cv_digits.ipynb
Lines changed: 0 additions & 43 deletions
diff --git a/‎1.4/_downloads/53490cdb42c3c07ba8cccd1c4ed4dca4/plot_release_highlights_1_4_0.ipynb
Lines changed: 31 additions & 2 deletions b/‎1.4/_downloads/53490cdb42c3c07ba8cccd1c4ed4dca4/plot_release_highlights_1_4_0.ipynb
Lines changed: 31 additions & 2 deletions
diff --git a/‎1.4/_downloads/6304a55e6fa4d75c8e8d11b4ea9a8679/plot_pca_3d.py
Lines changed: 0 additions & 99 deletions b/‎1.4/_downloads/6304a55e6fa4d75c8e8d11b4ea9a8679/plot_pca_3d.py
Lines changed: 0 additions & 99 deletions
diff --git a/‎1.4/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
-4.46 KB b/‎1.4/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
-4.46 KB
diff --git a/‎1.4/_downloads/720e4861bf00cd09b55ae64187ea58be/plot_pca_3d.ipynb
Lines changed: 0 additions & 79 deletions b/‎1.4/_downloads/720e4861bf00cd09b55ae64187ea58be/plot_pca_3d.ipynb
Lines changed: 0 additions & 79 deletions
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 2f03b9ed6a366461c62ef145db2ca4a5
+config: c086ce46b402dc6decef6d3cb80277c2
 tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -4,7 +4,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What the effect of a bad initialization is\n  on the classification process: By setting n_init to only 1\n  (default is 10), the amount of times that the algorithm will\n  be run with different centroid seeds is reduced.\n\n- bottom left: What using eight clusters would deliver.\n\n- bottom right: The ground truth.\n"
+        "\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What using three clusters would deliver.\n\n- bottom left: What the effect of a bad initialization is\n  on the classification process: By setting n_init to only 1\n  (default is 10), the amount of times that the algorithm will\n  be run with different centroid seeds is reduced.\n\n- bottom right: The ground truth.\n"
       ]
     },
     {
@@ -15,7 +15,7 @@
       },
       "outputs": [],
       "source": [
-        "# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d  # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n    (\"k_means_iris_8\", KMeans(n_clusters=8)),\n    (\"k_means_iris_3\", KMeans(n_clusters=3)),\n    (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n    ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n    est.fit(X)\n    labels = est.labels_\n\n    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n    ax.xaxis.set_ticklabels([])\n    ax.yaxis.set_ticklabels([])\n    ax.zaxis.set_ticklabels([])\n    ax.set_xlabel(\"Petal width\")\n    ax.set_ylabel(\"Sepal length\")\n    ax.set_zlabel(\"Petal length\")\n    ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n    ax.text3D(\n        X[y == label, 3].mean(),\n        X[y == label, 0].mean(),\n        X[y == label, 2].mean() + 2,\n        name,\n        horizontalalignment=\"center\",\n        bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n    )\n# Reorder the labels to have colors matching the cluster results\ny = np.choose(y, [1, 2, 0]).astype(float)\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal length\")\nax.set_zlabel(\"Petal length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, hspace=0.25)\nplt.show()"
+        "# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d  # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n    (\"k_means_iris_8\", KMeans(n_clusters=8)),\n    (\"k_means_iris_3\", KMeans(n_clusters=3)),\n    (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n    ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n    est.fit(X)\n    labels = est.labels_\n\n    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n    ax.xaxis.set_ticklabels([])\n    ax.yaxis.set_ticklabels([])\n    ax.zaxis.set_ticklabels([])\n    ax.set_xlabel(\"Petal width\")\n    ax.set_ylabel(\"Sepal length\")\n    ax.set_zlabel(\"Petal length\")\n    ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n    ax.text3D(\n        X[y == label, 3].mean(),\n        X[y == label, 0].mean(),\n        X[y == label, 2].mean() + 2,\n        name,\n        horizontalalignment=\"center\",\n        bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n    )\n\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal length\")\nax.set_zlabel(\"Petal length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, hspace=0.25)\nplt.show()"
       ]
     }
   ],
 
@@ -24,8 +24,6 @@
      2001. <10.1023/A:1010933404324>`
 
 """
-# %%
-import numpy as np
 
 # %%
 # Data Loading and Feature Engineering
@@ -40,6 +38,8 @@
 #   values as records).
 # - ``random_cat`` is a low cardinality categorical variable (3 possible
 #   values).
+import numpy as np
+
 from sklearn.datasets import fetch_openml
 from sklearn.model_selection import train_test_split
 
 
@@ -58,7 +58,18 @@
       },
       "outputs": [],
       "source": [
-        "import polars as pl\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\n\ndf = pl.DataFrame(\n    {\"height\": [120, 140, 150, 110, 100], \"pet\": [\"dog\", \"cat\", \"dog\", \"cat\", \"cat\"]}\n)\npreprocessor = ColumnTransformer(\n    [\n        (\"numerical\", StandardScaler(), [\"height\"]),\n        (\"categorical\", OneHotEncoder(sparse_output=False), [\"pet\"]),\n    ],\n    verbose_feature_names_out=False,\n)\npreprocessor.set_output(transform=\"polars\")\n\ndf_out = preprocessor.fit_transform(df)\nprint(f\"Output type: {type(df_out)}\")"
+        "import polars as pl\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\n\ndf = pl.DataFrame(\n    {\"height\": [120, 140, 150, 110, 100], \"pet\": [\"dog\", \"cat\", \"dog\", \"cat\", \"cat\"]}\n)\npreprocessor = ColumnTransformer(\n    [\n        (\"numerical\", StandardScaler(), [\"height\"]),\n        (\"categorical\", OneHotEncoder(sparse_output=False), [\"pet\"]),\n    ],\n    verbose_feature_names_out=False,\n)\npreprocessor.set_output(transform=\"polars\")\n\ndf_out = preprocessor.fit_transform(df)\ndf_out"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(f\"Output type: {type(df_out)}\")"
       ]
     },
     {
@@ -137,7 +148,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Metadata Routing Support\nMany meta-estimators and cross-validation routines now support metadata\nrouting, which are listed in the `user guide\n<_metadata_routing_models>`. For instance, this is how you can do a nested\ncross-validation with sample weights and :class:`~model_selection.GroupKFold`:\n\n"
+        "## Metadata Routing Support\nMany meta-estimators and cross-validation routines now support metadata\nrouting, which are listed in the `user guide\n<metadata_routing_models>`. For instance, this is how you can do a nested\ncross-validation with sample weights and :class:`~model_selection.GroupKFold`:\n\n"
       ]
     },
     {
@@ -150,6 +161,24 @@
       "source": [
         "import sklearn\nfrom sklearn.metrics import get_scorer\nfrom sklearn.datasets import make_regression\nfrom sklearn.linear_model import Lasso\nfrom sklearn.model_selection import GridSearchCV, cross_validate, GroupKFold\n\n# For now by default metadata routing is disabled, and need to be explicitly\n# enabled.\nsklearn.set_config(enable_metadata_routing=True)\n\nn_samples = 100\nX, y = make_regression(n_samples=n_samples, n_features=5, noise=0.5)\nrng = np.random.RandomState(7)\ngroups = rng.randint(0, 10, size=n_samples)\nsample_weights = rng.rand(n_samples)\nestimator = Lasso().set_fit_request(sample_weight=True)\nhyperparameter_grid = {\"alpha\": [0.1, 0.5, 1.0, 2.0]}\nscoring_inner_cv = get_scorer(\"neg_mean_squared_error\").set_score_request(\n    sample_weight=True\n)\ninner_cv = GroupKFold(n_splits=5)\n\ngrid_search = GridSearchCV(\n    estimator=estimator,\n    param_grid=hyperparameter_grid,\n    cv=inner_cv,\n    scoring=scoring_inner_cv,\n)\n\nouter_cv = GroupKFold(n_splits=5)\nscorers = {\n    \"mse\": get_scorer(\"neg_mean_squared_error\").set_score_request(sample_weight=True)\n}\nresults = cross_validate(\n    grid_search,\n    X,\n    y,\n    cv=outer_cv,\n    scoring=scorers,\n    return_estimator=True,\n    params={\"sample_weight\": sample_weights, \"groups\": groups},\n)\nprint(\"cv error on test sets:\", results[\"test_mse\"])\n\n# Setting the flag to the default `False` to avoid interference with other\n# scripts.\nsklearn.set_config(enable_metadata_routing=False)"
       ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Improved memory and runtime efficiency for PCA on sparse data\nPCA is now able to handle sparse matrices natively for the `arpack`\nsolver by levaraging `scipy.sparse.linalg.LinearOperator` to avoid\nmaterializing large sparse matrices when performing the\neigenvalue decomposition of the data set covariance matrix.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.decomposition import PCA\nimport scipy.sparse as sp\nfrom time import time\n\nX_sparse = sp.random(m=1000, n=1000, random_state=0)\nX_dense = X_sparse.toarray()\n\nt0 = time()\nPCA(n_components=10, svd_solver=\"arpack\").fit(X_sparse)\ntime_sparse = time() - t0\n\nt0 = time()\nPCA(n_components=10, svd_solver=\"arpack\").fit(X_dense)\ntime_dense = time() - t0\n\nprint(f\"Speedup: {time_dense / time_sparse:.1f}x\")"
+      ]
     }
   ],
   "metadata": {
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@`
`4`	`4`	`"cell_type": "markdown",`
`5`	`5`	`"metadata": {},`
`6`	`6`	`"source": [`
`7`		`- "\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What the effect of a bad initialization is\n on the classification process: By setting n_init to only 1\n (default is 10), the amount of times that the algorithm will\n be run with different centroid seeds is reduced.\n\n- bottom left: What using eight clusters would deliver.\n\n- bottom right: The ground truth.\n"`
	`7`	`+ "\n# K-means Clustering\n\nThe plot shows:\n\n- top left: What a K-means algorithm would yield using 8 clusters.\n\n- top right: What using three clusters would deliver.\n\n- bottom left: What the effect of a bad initialization is\n on the classification process: By setting n_init to only 1\n (default is 10), the amount of times that the algorithm will\n be run with different centroid seeds is reduced.\n\n- bottom right: The ground truth.\n"`
`8`	`8`	`]`
`9`	`9`	`},`
`10`	`10`	`{`
`@@ -15,7 +15,7 @@`
`15`	`15`	`},`
`16`	`16`	`"outputs": [],`
`17`	`17`	`"source": [`
`18`		- "# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n (\"k_means_iris_8\", KMeans(n_clusters=8)),\n (\"k_means_iris_3\", KMeans(n_clusters=3)),\n (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n est.fit(X)\n labels = est.labels_\n\n ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n ax.xaxis.set_ticklabels([])\n ax.yaxis.set_ticklabels([])\n ax.zaxis.set_ticklabels([])\n ax.set_xlabel(\"Petal width\")\n ax.set_ylabel(\"Sepal length\")\n ax.set_zlabel(\"Petal length\")\n ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n ax.text3D(\n X[y == label, 3].mean(),\n X[y == label, 0].mean(),\n X[y == label, 2].mean() + 2,\n name,\n horizontalalignment=\"center\",\n bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n )\n# Reorder the labels to have colors matching the cluster results\ny = np.choose(y, [1, 2, 0]).astype(float)\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal length\")\nax.set_zlabel(\"Petal length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, hspace=0.25)\nplt.show()"
	`18`	+ "# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\n# Though the following import is not directly being used, it is required\n# for 3D projection to work with matplotlib < 3.2\nimport mpl_toolkits.mplot3d # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.cluster import KMeans\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nestimators = [\n (\"k_means_iris_8\", KMeans(n_clusters=8)),\n (\"k_means_iris_3\", KMeans(n_clusters=3)),\n (\"k_means_iris_bad_init\", KMeans(n_clusters=3, n_init=1, init=\"random\")),\n]\n\nfig = plt.figure(figsize=(10, 8))\ntitles = [\"8 clusters\", \"3 clusters\", \"3 clusters, bad initialization\"]\nfor idx, ((name, est), title) in enumerate(zip(estimators, titles)):\n ax = fig.add_subplot(2, 2, idx + 1, projection=\"3d\", elev=48, azim=134)\n est.fit(X)\n labels = est.labels_\n\n ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor=\"k\")\n\n ax.xaxis.set_ticklabels([])\n ax.yaxis.set_ticklabels([])\n ax.zaxis.set_ticklabels([])\n ax.set_xlabel(\"Petal width\")\n ax.set_ylabel(\"Sepal length\")\n ax.set_zlabel(\"Petal length\")\n ax.set_title(title)\n\n# Plot the ground truth\nax = fig.add_subplot(2, 2, 4, projection=\"3d\", elev=48, azim=134)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n ax.text3D(\n X[y == label, 3].mean(),\n X[y == label, 0].mean(),\n X[y == label, 2].mean() + 2,\n name,\n horizontalalignment=\"center\",\n bbox=dict(alpha=0.2, edgecolor=\"w\", facecolor=\"w\"),\n )\n\nax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\nax.set_xlabel(\"Petal width\")\nax.set_ylabel(\"Sepal length\")\nax.set_zlabel(\"Petal length\")\nax.set_title(\"Ground Truth\")\n\nplt.subplots_adjust(wspace=0.25, hspace=0.25)\nplt.show()"
`19`	`19`	`]`
`20`	`20`	`}`
`21`	`21`	`],`