Commit 926cf8e

Pushing the docs to dev/ for branch: main, commit 9d7220b57ccac4ac12268a96281940667a4de1d8
1 parent a6c7fdf, commit 926cf8e

File tree: 1,228 files changed (+4656, -4506 lines)

dev/_downloads/0aadb4e0dc9f402704c8a56152f01083/plot_lasso_dense_vs_sparse_data.ipynb

Lines changed: 37 additions & 1 deletion
@@ -26,7 +26,43 @@
 },
 "outputs": [],
 "source": [
-"from time import time\nfrom scipy import sparse\nfrom scipy import linalg\n\nfrom sklearn.datasets import make_regression\nfrom sklearn.linear_model import Lasso\n\n\n# #############################################################################\n# The two Lasso implementations on Dense data\nprint(\"--- Dense matrices\")\n\nX, y = make_regression(n_samples=200, n_features=5000, random_state=0)\nX_sp = sparse.coo_matrix(X)\n\nalpha = 1\nsparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)\ndense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)\n\nt0 = time()\nsparse_lasso.fit(X_sp, y)\nprint(\"Sparse Lasso done in %fs\" % (time() - t0))\n\nt0 = time()\ndense_lasso.fit(X, y)\nprint(\"Dense Lasso done in %fs\" % (time() - t0))\n\nprint(\n \"Distance between coefficients : %s\"\n % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)\n)\n\n# #############################################################################\n# The two Lasso implementations on Sparse data\nprint(\"--- Sparse matrices\")\n\nXs = X.copy()\nXs[Xs < 2.5] = 0.0\nXs = sparse.coo_matrix(Xs)\nXs = Xs.tocsc()\n\nprint(\"Matrix density : %s %%\" % (Xs.nnz / float(X.size) * 100))\n\nalpha = 0.1\nsparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)\ndense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)\n\nt0 = time()\nsparse_lasso.fit(Xs, y)\nprint(\"Sparse Lasso done in %fs\" % (time() - t0))\n\nt0 = time()\ndense_lasso.fit(Xs.toarray(), y)\nprint(\"Dense Lasso done in %fs\" % (time() - t0))\n\nprint(\n \"Distance between coefficients : %s\"\n % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)\n)"
+"from time import time\nfrom scipy import sparse\nfrom scipy import linalg\n\nfrom sklearn.datasets import make_regression\nfrom sklearn.linear_model import Lasso"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Comparing the two Lasso implementations on Dense data\n\nWe create a linear regression problem that is suitable for the Lasso,\nthat is to say, with more features than samples. We then store the data\nmatrix in both dense (the usual) and sparse format, and train a Lasso on\neach. We compute the runtime of both and check that they learned the\nsame model by computing the Euclidean norm of the difference between the\ncoefficients they learned. Because the data is dense, we expect better\nruntime with a dense data format.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"X, y = make_regression(n_samples=200, n_features=5000, random_state=0)\n# create a copy of X in sparse format\nX_sp = sparse.coo_matrix(X)\n\nalpha = 1\nsparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)\ndense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)\n\nt0 = time()\nsparse_lasso.fit(X_sp, y)\nprint(f\"Sparse Lasso done in {(time() - t0):.3f}s\")\n\nt0 = time()\ndense_lasso.fit(X, y)\nprint(f\"Dense Lasso done in {(time() - t0):.3f}s\")\n\n# compare the regression coefficients\ncoeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)\nprint(f\"Distance between coefficients : {coeff_diff:.2e}\")\n\n#"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Comparing the two Lasso implementations on Sparse data\n\nWe make the previous problem sparse by replacing all small values with 0\nand run the same comparisons as above. Because the data is now sparse, we\nexpect the implementation that uses the sparse data format to be faster.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"# make a copy of the previous data\nXs = X.copy()\n# make Xs sparse by replacing the values lower than 2.5 with 0s\nXs[Xs < 2.5] = 0.0\n# create a copy of Xs in sparse format\nXs_sp = sparse.coo_matrix(Xs)\nXs_sp = Xs_sp.tocsc()\n\n# compute the proportion of non-zero coefficient in the data matrix\nprint(f\"Matrix density : {(Xs_sp.nnz / float(X.size) * 100):.3f}%\")\n\nalpha = 0.1\nsparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)\ndense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)\n\nt0 = time()\nsparse_lasso.fit(Xs_sp, y)\nprint(f\"Sparse Lasso done in {(time() - t0):.3f}s\")\n\nt0 = time()\ndense_lasso.fit(Xs, y)\nprint(f\"Dense Lasso done in {(time() - t0):.3f}s\")\n\n# compare the regression coefficients\ncoeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)\nprint(f\"Distance between coefficients : {coeff_diff:.2e}\")"
 ]
 }
 ],
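
Note (not part of the commit): both new code cells build the sparse matrix with scipy.sparse.coo_matrix and convert it to CSC before fitting. The sketch below isolates that step with illustrative names (rng, Xs_coo, Xs_csc are not from the example); COO is convenient to build from a dense array, while CSC's cheap column access generally suits the coordinate-descent solver behind Lasso.

# Illustrative sketch only; the variable names below are not from the example.
import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
X = rng.randn(200, 5000)

Xs = X.copy()
Xs[Xs < 2.5] = 0.0              # zero out small values, as the example does

Xs_coo = sparse.coo_matrix(Xs)  # COO: simple to construct from a dense array
Xs_csc = Xs_coo.tocsc()         # CSC: efficient column slicing for the solver

# density = stored non-zeros divided by the total number of entries
print(f"Matrix density : {Xs_csc.nnz / Xs.size * 100:.3f}%")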

dev/_downloads/510f5becea7ec7018a8eee43d6f12b1b/plot_lasso_dense_vs_sparse_data.py

Lines changed: 44 additions & 26 deletions
@@ -16,11 +16,20 @@
 from sklearn.linear_model import Lasso
 
 
-# #############################################################################
-# The two Lasso implementations on Dense data
-print("--- Dense matrices")
+# %%
+# Comparing the two Lasso implementations on Dense data
+# -----------------------------------------------------
+#
+# We create a linear regression problem that is suitable for the Lasso,
+# that is to say, with more features than samples. We then store the data
+# matrix in both dense (the usual) and sparse format, and train a Lasso on
+# each. We compute the runtime of both and check that they learned the
+# same model by computing the Euclidean norm of the difference between the
+# coefficients they learned. Because the data is dense, we expect better
+# runtime with a dense data format.
 
 X, y = make_regression(n_samples=200, n_features=5000, random_state=0)
+# create a copy of X in sparse format
 X_sp = sparse.coo_matrix(X)
 
 alpha = 1
@@ -29,41 +38,50 @@
 
 t0 = time()
 sparse_lasso.fit(X_sp, y)
-print("Sparse Lasso done in %fs" % (time() - t0))
+print(f"Sparse Lasso done in {(time() - t0):.3f}s")
 
 t0 = time()
 dense_lasso.fit(X, y)
-print("Dense Lasso done in %fs" % (time() - t0))
-
-print(
-    "Distance between coefficients : %s"
-    % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
-)
-
-# #############################################################################
-# The two Lasso implementations on Sparse data
-print("--- Sparse matrices")
-
+print(f"Dense Lasso done in {(time() - t0):.3f}s")
+
+# compare the regression coefficients
+coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
+print(f"Distance between coefficients : {coeff_diff:.2e}")
+
+#
+# %%
+# Comparing the two Lasso implementations on Sparse data
+# ------------------------------------------------------
+#
+# We make the previous problem sparse by replacing all small values with 0
+# and run the same comparisons as above. Because the data is now sparse, we
+# expect the implementation that uses the sparse data format to be faster.
+
+# make a copy of the previous data
 Xs = X.copy()
+# make Xs sparse by replacing the values lower than 2.5 with 0s
 Xs[Xs < 2.5] = 0.0
-Xs = sparse.coo_matrix(Xs)
-Xs = Xs.tocsc()
+# create a copy of Xs in sparse format
+Xs_sp = sparse.coo_matrix(Xs)
+Xs_sp = Xs_sp.tocsc()
 
-print("Matrix density : %s %%" % (Xs.nnz / float(X.size) * 100))
+# compute the proportion of non-zero coefficient in the data matrix
+print(f"Matrix density : {(Xs_sp.nnz / float(X.size) * 100):.3f}%")
 
 alpha = 0.1
 sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)
 dense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)
 
 t0 = time()
-sparse_lasso.fit(Xs, y)
-print("Sparse Lasso done in %fs" % (time() - t0))
+sparse_lasso.fit(Xs_sp, y)
+print(f"Sparse Lasso done in {(time() - t0):.3f}s")
 
 t0 = time()
-dense_lasso.fit(Xs.toarray(), y)
-print("Dense Lasso done in %fs" % (time() - t0))
+dense_lasso.fit(Xs, y)
+print(f"Dense Lasso done in {(time() - t0):.3f}s")
+
+# compare the regression coefficients
+coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
+print(f"Distance between coefficients : {coeff_diff:.2e}")
 
-print(
-    "Distance between coefficients : %s"
-    % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
-)
+# %%
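
Note (not part of the commit): the "# %%" markers that replace the old "# ###..." banners are the cell separators used by sphinx-gallery and most notebook-aware editors. Each marker starts a new block, the comment lines immediately after it are rendered as text, and the code below them forms the next executed cell. A hypothetical minimal script showing that layout (none of the names below come from this commit):

"""
Example title
=============

The module-level docstring becomes the rendered page introduction.
"""
import numpy as np

# %%
# A section title
# ---------------
#
# (Hypothetical illustration, not code from the commit.) Comment lines
# directly after the "# %%" marker are rendered as reStructuredText;
# the code below runs as the next cell.
x = np.linspace(0.0, 1.0, 5)
print(x.mean())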
dev/_downloads/scikit-learn-docs.zip (binary file, -542 bytes, not shown)