Skip to content

Commit 81c2f5e

Browse files
committed
Pushing the docs to dev/ for branch: main, commit 6a3517e0ca41fb3bc8e375dd4fea7e1bb2f906ff
1 parent f6238db commit 81c2f5e

File tree

1,328 files changed

+7461
-7289
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,328 files changed

+7461
-7289
lines changed

dev/.buildinfo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: dac8c3fe1a779cbc4a2e7650e8445792
3+
config: d7564201336e2c7517e53f9b29700ccb
44
tags: 645f666f9bcd5a90fca523b33c5a78b7
Binary file not shown.

dev/_downloads/53490cdb42c3c07ba8cccd1c4ed4dca4/plot_release_highlights_1_4_0.ipynb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,24 @@
161161
"source": [
162162
"import sklearn\nfrom sklearn.metrics import get_scorer\nfrom sklearn.datasets import make_regression\nfrom sklearn.linear_model import Lasso\nfrom sklearn.model_selection import GridSearchCV, cross_validate, GroupKFold\n\n# For now by default metadata routing is disabled, and need to be explicitly\n# enabled.\nsklearn.set_config(enable_metadata_routing=True)\n\nn_samples = 100\nX, y = make_regression(n_samples=n_samples, n_features=5, noise=0.5)\nrng = np.random.RandomState(7)\ngroups = rng.randint(0, 10, size=n_samples)\nsample_weights = rng.rand(n_samples)\nestimator = Lasso().set_fit_request(sample_weight=True)\nhyperparameter_grid = {\"alpha\": [0.1, 0.5, 1.0, 2.0]}\nscoring_inner_cv = get_scorer(\"neg_mean_squared_error\").set_score_request(\n sample_weight=True\n)\ninner_cv = GroupKFold(n_splits=5)\n\ngrid_search = GridSearchCV(\n estimator=estimator,\n param_grid=hyperparameter_grid,\n cv=inner_cv,\n scoring=scoring_inner_cv,\n)\n\nouter_cv = GroupKFold(n_splits=5)\nscorers = {\n \"mse\": get_scorer(\"neg_mean_squared_error\").set_score_request(sample_weight=True)\n}\nresults = cross_validate(\n grid_search,\n X,\n y,\n cv=outer_cv,\n scoring=scorers,\n return_estimator=True,\n params={\"sample_weight\": sample_weights, \"groups\": groups},\n)\nprint(\"cv error on test sets:\", results[\"test_mse\"])\n\n# Setting the flag to the default `False` to avoid interference with other\n# scripts.\nsklearn.set_config(enable_metadata_routing=False)"
163163
]
164+
},
165+
{
166+
"cell_type": "markdown",
167+
"metadata": {},
168+
"source": [
169+
"## Improved memory and runtime efficiency for PCA on sparse data\nPCA is now able to handle sparse matrices natively for the `arpack`\nsolver by leveraging `scipy.sparse.linalg.LinearOperator` to avoid\nmaterializing large sparse matrices when performing the\neigenvalue decomposition of the data set covariance matrix.\n\n\n"
170+
]
171+
},
172+
{
173+
"cell_type": "code",
174+
"execution_count": null,
175+
"metadata": {
176+
"collapsed": false
177+
},
178+
"outputs": [],
179+
"source": [
180+
"from sklearn.decomposition import PCA\nimport scipy.sparse as sp\nfrom time import time\n\nX_sparse = sp.random(m=1000, n=1000, random_state=0)\nX_dense = X_sparse.toarray()\n\nt0 = time()\nPCA(n_components=10, svd_solver=\"arpack\").fit(X_sparse)\ntime_sparse = time() - t0\n\nt0 = time()\nPCA(n_components=10, svd_solver=\"arpack\").fit(X_dense)\ntime_dense = time() - t0\n\nprint(f\"Speedup: {time_dense / time_sparse:.1f}x\")"
181+
]
164182
}
165183
],
166184
"metadata": {
Binary file not shown.

dev/_downloads/c15cce0dbcd8722cb5638987eff985c0/plot_release_highlights_1_4_0.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,28 @@
207207
# Setting the flag to the default `False` to avoid interference with other
208208
# scripts.
209209
sklearn.set_config(enable_metadata_routing=False)
210+
211+
# %%
212+
# Improved memory and runtime efficiency for PCA on sparse data
213+
# -------------------------------------------------------------
214+
# PCA is now able to handle sparse matrices natively for the `arpack`
215+
# solver by leveraging `scipy.sparse.linalg.LinearOperator` to avoid
216+
# materializing large sparse matrices when performing the
217+
# eigenvalue decomposition of the data set covariance matrix.
218+
#
219+
from sklearn.decomposition import PCA
220+
import scipy.sparse as sp
221+
from time import time
222+
223+
X_sparse = sp.random(m=1000, n=1000, random_state=0)
224+
X_dense = X_sparse.toarray()
225+
226+
t0 = time()
227+
PCA(n_components=10, svd_solver="arpack").fit(X_sparse)
228+
time_sparse = time() - t0
229+
230+
t0 = time()
231+
PCA(n_components=10, svd_solver="arpack").fit(X_dense)
232+
time_dense = time() - t0
233+
234+
print(f"Speedup: {time_dense / time_sparse:.1f}x")

dev/_downloads/scikit-learn-docs.zip

7.96 KB
Binary file not shown.
287 Bytes
419 Bytes
213 Bytes
-2 Bytes

0 commit comments

Comments
 (0)