Commit 926cf8e

Pushing the docs to dev/ for branch: main, commit 9d7220b57ccac4ac12268a96281940667a4de1d8
1 parent a6c7fdf, commit 926cf8e

File tree: 1,228 files changed (+4656, -4506 lines)

dev/_downloads/0aadb4e0dc9f402704c8a56152f01083/plot_lasso_dense_vs_sparse_data.ipynb

Lines changed: 37 additions & 1 deletion
@@ -26,7 +26,43 @@
 },
 "outputs": [],
 "source": [
-"from time import time\nfrom scipy import sparse\nfrom scipy import linalg\n\nfrom sklearn.datasets import make_regression\nfrom sklearn.linear_model import Lasso\n\n\n# #############################################################################\n# The two Lasso implementations on Dense data\nprint(\"--- Dense matrices\")\n\nX, y = make_regression(n_samples=200, n_features=5000, random_state=0)\nX_sp = sparse.coo_matrix(X)\n\nalpha = 1\nsparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)\ndense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)\n\nt0 = time()\nsparse_lasso.fit(X_sp, y)\nprint(\"Sparse Lasso done in %fs\" % (time() - t0))\n\nt0 = time()\ndense_lasso.fit(X, y)\nprint(\"Dense Lasso done in %fs\" % (time() - t0))\n\nprint(\n \"Distance between coefficients : %s\"\n % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)\n)\n\n# #############################################################################\n# The two Lasso implementations on Sparse data\nprint(\"--- Sparse matrices\")\n\nXs = X.copy()\nXs[Xs < 2.5] = 0.0\nXs = sparse.coo_matrix(Xs)\nXs = Xs.tocsc()\n\nprint(\"Matrix density : %s %%\" % (Xs.nnz / float(X.size) * 100))\n\nalpha = 0.1\nsparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)\ndense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)\n\nt0 = time()\nsparse_lasso.fit(Xs, y)\nprint(\"Sparse Lasso done in %fs\" % (time() - t0))\n\nt0 = time()\ndense_lasso.fit(Xs.toarray(), y)\nprint(\"Dense Lasso done in %fs\" % (time() - t0))\n\nprint(\n \"Distance between coefficients : %s\"\n % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)\n)"
+"from time import time\nfrom scipy import sparse\nfrom scipy import linalg\n\nfrom sklearn.datasets import make_regression\nfrom sklearn.linear_model import Lasso"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Comparing the two Lasso implementations on Dense data\n\nWe create a linear regression problem that is suitable for the Lasso,\nthat is to say, with more features than samples. We then store the data\nmatrix in both dense (the usual) and sparse format, and train a Lasso on\neach. We compute the runtime of both and check that they learned the\nsame model by computing the Euclidean norm of the difference between the\ncoefficients they learned. Because the data is dense, we expect better\nruntime with a dense data format.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"X, y = make_regression(n_samples=200, n_features=5000, random_state=0)\n# create a copy of X in sparse format\nX_sp = sparse.coo_matrix(X)\n\nalpha = 1\nsparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)\ndense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)\n\nt0 = time()\nsparse_lasso.fit(X_sp, y)\nprint(f\"Sparse Lasso done in {(time() - t0):.3f}s\")\n\nt0 = time()\ndense_lasso.fit(X, y)\nprint(f\"Dense Lasso done in {(time() - t0):.3f}s\")\n\n# compare the regression coefficients\ncoeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)\nprint(f\"Distance between coefficients : {coeff_diff:.2e}\")\n\n#"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Comparing the two Lasso implementations on Sparse data\n\nWe make the previous problem sparse by replacing all small values with 0\nand run the same comparisons as above. Because the data is now sparse, we\nexpect the implementation that uses the sparse data format to be faster.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"# make a copy of the previous data\nXs = X.copy()\n# make Xs sparse by replacing the values lower than 2.5 with 0s\nXs[Xs < 2.5] = 0.0\n# create a copy of Xs in sparse format\nXs_sp = sparse.coo_matrix(Xs)\nXs_sp = Xs_sp.tocsc()\n\n# compute the proportion of non-zero coefficient in the data matrix\nprint(f\"Matrix density : {(Xs_sp.nnz / float(X.size) * 100):.3f}%\")\n\nalpha = 0.1\nsparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)\ndense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)\n\nt0 = time()\nsparse_lasso.fit(Xs_sp, y)\nprint(f\"Sparse Lasso done in {(time() - t0):.3f}s\")\n\nt0 = time()\ndense_lasso.fit(Xs, y)\nprint(f\"Dense Lasso done in {(time() - t0):.3f}s\")\n\n# compare the regression coefficients\ncoeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)\nprint(f\"Distance between coefficients : {coeff_diff:.2e}\")"
 ]
 }
 ],
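
Note (not part of the commit): both new code cells build the sparse matrix with scipy.sparse.coo_matrix and convert it to CSC before fitting. The sketch below isolates that step with illustrative names (rng, Xs_coo, Xs_csc are not from the example); COO is convenient to build from a dense array, while CSC's cheap column access generally suits the coordinate-descent solver behind Lasso.

# Illustrative sketch only; the variable names below are not from the example.
import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
X = rng.randn(200, 5000)

Xs = X.copy()
Xs[Xs < 2.5] = 0.0              # zero out small values, as the example does

Xs_coo = sparse.coo_matrix(Xs)  # COO: simple to construct from a dense array
Xs_csc = Xs_coo.tocsc()         # CSC: efficient column slicing for the solver

# density = stored non-zeros divided by the total number of entries
print(f"Matrix density : {Xs_csc.nnz / Xs.size * 100:.3f}%")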

dev/_downloads/510f5becea7ec7018a8eee43d6f12b1b/plot_lasso_dense_vs_sparse_data.py

Lines changed: 44 additions & 26 deletions
@@ -16,11 +16,20 @@
 from sklearn.linear_model import Lasso
 
 
-# #############################################################################
-# The two Lasso implementations on Dense data
-print("--- Dense matrices")
+# %%
+# Comparing the two Lasso implementations on Dense data
+# -----------------------------------------------------
+#
+# We create a linear regression problem that is suitable for the Lasso,
+# that is to say, with more features than samples. We then store the data
+# matrix in both dense (the usual) and sparse format, and train a Lasso on
+# each. We compute the runtime of both and check that they learned the
+# same model by computing the Euclidean norm of the difference between the
+# coefficients they learned. Because the data is dense, we expect better
+# runtime with a dense data format.
 
 X, y = make_regression(n_samples=200, n_features=5000, random_state=0)
+# create a copy of X in sparse format
 X_sp = sparse.coo_matrix(X)
 
 alpha = 1
@@ -29,41 +38,50 @@
 
 t0 = time()
 sparse_lasso.fit(X_sp, y)
-print("Sparse Lasso done in %fs" % (time() - t0))
+print(f"Sparse Lasso done in {(time() - t0):.3f}s")
 
 t0 = time()
 dense_lasso.fit(X, y)
-print("Dense Lasso done in %fs" % (time() - t0))
-
-print(
-    "Distance between coefficients : %s"
-    % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
-)
-
-# #############################################################################
-# The two Lasso implementations on Sparse data
-print("--- Sparse matrices")
-
+print(f"Dense Lasso done in {(time() - t0):.3f}s")
+
+# compare the regression coefficients
+coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
+print(f"Distance between coefficients : {coeff_diff:.2e}")
+
+#
+# %%
+# Comparing the two Lasso implementations on Sparse data
+# ------------------------------------------------------
+#
+# We make the previous problem sparse by replacing all small values with 0
+# and run the same comparisons as above. Because the data is now sparse, we
+# expect the implementation that uses the sparse data format to be faster.
+
+# make a copy of the previous data
 Xs = X.copy()
+# make Xs sparse by replacing the values lower than 2.5 with 0s
 Xs[Xs < 2.5] = 0.0
-Xs = sparse.coo_matrix(Xs)
-Xs = Xs.tocsc()
+# create a copy of Xs in sparse format
+Xs_sp = sparse.coo_matrix(Xs)
+Xs_sp = Xs_sp.tocsc()
 
-print("Matrix density : %s %%" % (Xs.nnz / float(X.size) * 100))
+# compute the proportion of non-zero coefficient in the data matrix
+print(f"Matrix density : {(Xs_sp.nnz / float(X.size) * 100):.3f}%")
 
 alpha = 0.1
 sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)
 dense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)
 
 t0 = time()
-sparse_lasso.fit(Xs, y)
-print("Sparse Lasso done in %fs" % (time() - t0))
+sparse_lasso.fit(Xs_sp, y)
+print(f"Sparse Lasso done in {(time() - t0):.3f}s")
 
 t0 = time()
-dense_lasso.fit(Xs.toarray(), y)
-print("Dense Lasso done in %fs" % (time() - t0))
+dense_lasso.fit(Xs, y)
+print(f"Dense Lasso done in {(time() - t0):.3f}s")
+
+# compare the regression coefficients
+coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
+print(f"Distance between coefficients : {coeff_diff:.2e}")
 
-print(
-    "Distance between coefficients : %s"
-    % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
-)
+# %%
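
Note (not part of the commit): the "# %%" markers that replace the old "# ###..." banners are the cell separators used by sphinx-gallery and most notebook-aware editors. Each marker starts a new block, the comment lines immediately after it are rendered as text, and the code below them forms the next executed cell. A hypothetical minimal script showing that layout (none of the names below come from this commit):

"""
Example title
=============

The module-level docstring becomes the rendered page introduction.
"""
import numpy as np

# %%
# A section title
# ---------------
#
# (Hypothetical illustration, not code from the commit.) Comment lines
# directly after the "# %%" marker are rendered as reStructuredText;
# the code below runs as the next cell.
x = np.linspace(0.0, 1.0, 5)
print(x.mean())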
dev/_downloads/scikit-learn-docs.zip (binary file, -542 bytes, not shown)