Commit 16a6755

Pushing the docs to dev/ for branch: main, commit e70fb3f91b00f59899c63dc32cd8fc9162284b60
1 parent d7cd1ad commit 16a6755

1,363 files changed (+5051 / -4856 lines)


dev/_downloads/592b2521e44501266ca5339d1fb123cb/plot_rfe_with_cross_validation.py

Lines changed: 53 additions & 23 deletions
@@ -3,57 +3,87 @@
 Recursive feature elimination with cross-validation
 ===================================================
 
-A recursive feature elimination example with automatic tuning of the
+A Recursive Feature Elimination (RFE) example with automatic tuning of the
 number of features selected with cross-validation.
 
 """
 
-import numpy as np
-import matplotlib.pyplot as plt
-from sklearn.svm import SVC
-from sklearn.model_selection import StratifiedKFold
-from sklearn.feature_selection import RFECV
+# %%
+# Data generation
+# ---------------
+#
+# We build a classification task using 3 informative features. The introduction
+# of 2 additional redundant (i.e. correlated) features has the effect that the
+# selected features vary depending on the cross-validation fold. The remaining
+# features are non-informative as they are drawn at random.
+
 from sklearn.datasets import make_classification
 
-# Build a classification task using 3 informative features
 X, y = make_classification(
-    n_samples=1000,
-    n_features=25,
+    n_samples=500,
+    n_features=15,
     n_informative=3,
     n_redundant=2,
     n_repeated=0,
     n_classes=8,
     n_clusters_per_class=1,
+    class_sep=0.8,
     random_state=0,
 )
 
-# Create the RFE object and compute a cross-validated score.
-svc = SVC(kernel="linear")
-# The "accuracy" scoring shows the proportion of correct classifications
+# %%
+# Model training and selection
+# ----------------------------
+#
+# We create the RFE object and compute the cross-validated scores. The scoring
+# strategy "accuracy" optimizes the proportion of correctly classified samples.
+
+from sklearn.feature_selection import RFECV
+from sklearn.model_selection import StratifiedKFold
+from sklearn.linear_model import LogisticRegression
 
 min_features_to_select = 1  # Minimum number of features to consider
+clf = LogisticRegression()
+cv = StratifiedKFold(5)
+
 rfecv = RFECV(
-    estimator=svc,
+    estimator=clf,
     step=1,
-    cv=StratifiedKFold(2),
+    cv=cv,
     scoring="accuracy",
     min_features_to_select=min_features_to_select,
+    n_jobs=2,
 )
 rfecv.fit(X, y)
 
-print("Optimal number of features : %d" % rfecv.n_features_)
-
-n_scores = len(rfecv.cv_results_["split0_test_score"])
-cv_scores = np.vstack(
-    (rfecv.cv_results_["split0_test_score"], rfecv.cv_results_["split1_test_score"])
-).T
+print(f"Optimal number of features: {rfecv.n_features_}")
 
+# %%
+# In the present case, the model with 3 features (which corresponds to the true
+# generative model) is found to be the most optimal.
+#
 # Plot number of features VS. cross-validation scores
+# ---------------------------------------------------
+
+import matplotlib.pyplot as plt
+
+n_scores = len(rfecv.cv_results_["mean_test_score"])
 plt.figure()
 plt.xlabel("Number of features selected")
-plt.ylabel("Cross validation score (accuracy)")
-plt.plot(
+plt.ylabel("Mean test accuracy")
+plt.errorbar(
     range(min_features_to_select, n_scores + min_features_to_select),
-    cv_scores,
+    rfecv.cv_results_["mean_test_score"],
+    yerr=rfecv.cv_results_["std_test_score"],
 )
+plt.title("Recursive Feature Elimination \nwith correlated features")
 plt.show()
+
+# %%
+# From the plot above one can further notice a plateau of equivalent scores
+# (similar mean value and overlapping errorbars) for 3 to 5 selected features.
+# This is the result of introducing correlated features. Indeed, the optimal
+# model selected by the RFE can lie within this range, depending on the
+# cross-validation technique. The test accuracy decreases above 5 selected
+# features, this is, keeping non-informative features leads to over-fitting and
+# is therefore detrimental for the statistical performance of the models.
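Not part of this commit, but useful context when reviewing the diff above: a minimal sketch of how the fitted RFECV object from the updated example could be inspected to see which features were kept, using the support_ and ranking_ attributes. The data-generation and estimator settings below simply mirror the new version of the example.

# Illustrative only; mirrors the settings in the new plot_rfe_with_cross_validation.py above.
import numpy as np

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(
    n_samples=500,
    n_features=15,
    n_informative=3,
    n_redundant=2,
    n_repeated=0,
    n_classes=8,
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=0,
)

rfecv = RFECV(
    estimator=LogisticRegression(),
    step=1,
    cv=StratifiedKFold(5),
    scoring="accuracy",
    min_features_to_select=1,
    n_jobs=2,
)
rfecv.fit(X, y)

# support_ is a boolean mask over the columns of X; ranking_ assigns 1 to the
# selected features and increasing integers to features eliminated earlier.
print("Selected feature indices:", np.where(rfecv.support_)[0])
print("Feature ranking:", rfecv.ranking_)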

dev/_downloads/949ed208b2147ed2b3e348e81fef52be/plot_rfe_with_cross_validation.ipynb

Lines changed: 52 additions & 2 deletions
@@ -15,7 +15,32 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Recursive feature elimination with cross-validation\n\nA recursive feature elimination example with automatic tuning of the\nnumber of features selected with cross-validation.\n"
+"\n# Recursive feature elimination with cross-validation\n\nA Recursive Feature Elimination (RFE) example with automatic tuning of the\nnumber of features selected with cross-validation.\n"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Data generation\n\nWe build a classification task using 3 informative features. The introduction\nof 2 additional redundant (i.e. correlated) features has the effect that the\nselected features vary depending on the cross-validation fold. The remaining\nfeatures are non-informative as they are drawn at random.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.datasets import make_classification\n\nX, y = make_classification(\n n_samples=500,\n n_features=15,\n n_informative=3,\n n_redundant=2,\n n_repeated=0,\n n_classes=8,\n n_clusters_per_class=1,\n class_sep=0.8,\n random_state=0,\n)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Model training and selection\n\nWe create the RFE object and compute the cross-validated scores. The scoring\nstrategy \"accuracy\" optimizes the proportion of correctly classified samples.\n\n"
 ]
 },
 {
@@ -26,7 +51,32 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.svm import SVC\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.datasets import make_classification\n\n# Build a classification task using 3 informative features\nX, y = make_classification(\n n_samples=1000,\n n_features=25,\n n_informative=3,\n n_redundant=2,\n n_repeated=0,\n n_classes=8,\n n_clusters_per_class=1,\n random_state=0,\n)\n\n# Create the RFE object and compute a cross-validated score.\nsvc = SVC(kernel=\"linear\")\n# The \"accuracy\" scoring shows the proportion of correct classifications\n\nmin_features_to_select = 1  # Minimum number of features to consider\nrfecv = RFECV(\n estimator=svc,\n step=1,\n cv=StratifiedKFold(2),\n scoring=\"accuracy\",\n min_features_to_select=min_features_to_select,\n)\nrfecv.fit(X, y)\n\nprint(\"Optimal number of features : %d\" % rfecv.n_features_)\n\nn_scores = len(rfecv.cv_results_[\"split0_test_score\"])\ncv_scores = np.vstack(\n (rfecv.cv_results_[\"split0_test_score\"], rfecv.cv_results_[\"split1_test_score\"])\n).T\n\n# Plot number of features VS. cross-validation scores\nplt.figure()\nplt.xlabel(\"Number of features selected\")\nplt.ylabel(\"Cross validation score (accuracy)\")\nplt.plot(\n range(min_features_to_select, n_scores + min_features_to_select),\n cv_scores,\n)\nplt.show()"
+"from sklearn.feature_selection import RFECV\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\nmin_features_to_select = 1  # Minimum number of features to consider\nclf = LogisticRegression()\ncv = StratifiedKFold(5)\n\nrfecv = RFECV(\n estimator=clf,\n step=1,\n cv=cv,\n scoring=\"accuracy\",\n min_features_to_select=min_features_to_select,\n n_jobs=2,\n)\nrfecv.fit(X, y)\n\nprint(f\"Optimal number of features: {rfecv.n_features_}\")"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"In the present case, the model with 3 features (which corresponds to the true\ngenerative model) is found to be the most optimal.\n\n## Plot number of features VS. cross-validation scores\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\n\nn_scores = len(rfecv.cv_results_[\"mean_test_score\"])\nplt.figure()\nplt.xlabel(\"Number of features selected\")\nplt.ylabel(\"Mean test accuracy\")\nplt.errorbar(\n range(min_features_to_select, n_scores + min_features_to_select),\n rfecv.cv_results_[\"mean_test_score\"],\n yerr=rfecv.cv_results_[\"std_test_score\"],\n)\nplt.title(\"Recursive Feature Elimination \\nwith correlated features\")\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"From the plot above one can further notice a plateau of equivalent scores\n(similar mean value and overlapping errorbars) for 3 to 5 selected features.\nThis is the result of introducing correlated features. Indeed, the optimal\nmodel selected by the RFE can lie within this range, depending on the\ncross-validation technique. The test accuracy decreases above 5 selected\nfeatures, this is, keeping non-informative features leads to over-fitting and\nis therefore detrimental for the statistical performance of the models.\n\n"
 ]
 }
 ],
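Also not part of the commit: a short sketch of how the plateau described in the new closing markdown cell could be checked numerically, by printing the mean and standard deviation of the test scores stored in cv_results_ for each candidate number of features. It continues from a fitted rfecv as in the sketch above; the overlap criterion used here is just one possible heuristic, not anything from the scikit-learn example.

# Assumes `rfecv` has already been fitted as in the previous sketch.
import numpy as np

mean_scores = rfecv.cv_results_["mean_test_score"]
std_scores = rfecv.cv_results_["std_test_score"]
# min_features_to_select is 1 in the example, so index i corresponds to i + 1 features.
n_features_grid = np.arange(1, len(mean_scores) + 1)

for n, m, s in zip(n_features_grid, mean_scores, std_scores):
    print(f"{n:2d} features: accuracy = {m:.3f} +/- {s:.3f}")

# Flag feature counts whose error bar overlaps the error bar of the best score
# (an illustrative heuristic for the "plateau of equivalent scores").
best_idx = mean_scores.argmax()
overlap = (mean_scores + std_scores) >= (mean_scores[best_idx] - std_scores[best_idx])
print("Plateau candidates (n_features):", n_features_grid[overlap])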

dev/_downloads/scikit-learn-docs.zip

-1.13 KB
Binary file not shown.
