
Commit d8b7a22

Pushing the docs to dev/ for branch: master, commit 9014a6f02d0ce801ac6da0cdd7dc304f30fb5a65
1 parent 8187798 commit d8b7a22

File tree

1,184 files changed (+3886, -3809 lines)


dev/_downloads/171a3f824958ccf6a73f531421087204/plot_feature_selection.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
   },
   "outputs": [],
   "source": [
-
"print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets, svm\nfrom sklearn.feature_selection import SelectPercentile, f_classif\n\n# #############################################################################\n# Import some data to play with\n\n# The iris dataset\niris = datasets.load_iris()\n\n# Some noisy data not correlated\nE = np.random.uniform(0, 0.1, size=(len(iris.data), 20))\n\n# Add the noisy data to the informative features\nX = np.hstack((iris.data, E))\ny = iris.target\n\nplt.figure(1)\nplt.clf()\n\nX_indices = np.arange(X.shape[-1])\n\n# #############################################################################\n# Univariate feature selection with F-test for feature scoring\n# We use the default selection function: the 10% most significant features\nselector = SelectPercentile(f_classif, percentile=10)\nselector.fit(X, y)\nscores = -np.log10(selector.pvalues_)\nscores /= scores.max()\nplt.bar(X_indices - .45, scores, width=.2,\n label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',\n edgecolor='black')\n\n# #############################################################################\n# Compare to the weights of an SVM\nclf = svm.SVC(kernel='linear')\nclf.fit(X, y)\n\nsvm_weights = (clf.coef_ ** 2).sum(axis=0)\nsvm_weights /= svm_weights.max()\n\nplt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight',\n color='navy', edgecolor='black')\n\nclf_selected = svm.SVC(kernel='linear')\nclf_selected.fit(selector.transform(X), y)\n\nsvm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)\nsvm_weights_selected /= svm_weights_selected.max()\n\nplt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,\n width=.2, label='SVM weights after selection', color='c',\n edgecolor='black')\n\n\nplt.title(\"Comparing feature selection\")\nplt.xlabel('Feature number')\nplt.yticks(())\nplt.axis('tight')\nplt.legend(loc='upper right')\nplt.show()"
+
"print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_iris\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.svm import LinearSVC\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.feature_selection import SelectKBest, f_classif\n\n# #############################################################################\n# Import some data to play with\n\n# The iris dataset\nX, y = load_iris(return_X_y=True)\n\n# Some noisy data not correlated\nE = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))\n\n# Add the noisy data to the informative features\nX = np.hstack((X, E))\n\n# Split dataset to select feature and evaluate the classifier\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, stratify=y, random_state=0\n)\n\nplt.figure(1)\nplt.clf()\n\nX_indices = np.arange(X.shape[-1])\n\n# #############################################################################\n# Univariate feature selection with F-test for feature scoring\n# We use the default selection function to select the four\n# most significant features\nselector = SelectKBest(f_classif, k=4)\nselector.fit(X_train, y_train)\nscores = -np.log10(selector.pvalues_)\nscores /= scores.max()\nplt.bar(X_indices - .45, scores, width=.2,\n label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',\n edgecolor='black')\n\n# #############################################################################\n# Compare to the weights of an SVM\nclf = make_pipeline(MinMaxScaler(), LinearSVC())\nclf.fit(X_train, y_train)\nprint('Classification accuracy without selecting features: {:.3f}'\n .format(clf.score(X_test, y_test)))\n\nsvm_weights = np.abs(clf[-1].coef_).sum(axis=0)\nsvm_weights /= svm_weights.sum()\n\nplt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight',\n color='navy', edgecolor='black')\n\nclf_selected = make_pipeline(\n SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC()\n)\nclf_selected.fit(X_train, y_train)\nprint('Classification accuracy after univariate feature selection: {:.3f}'\n .format(clf_selected.score(X_test, y_test)))\n\nsvm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)\nsvm_weights_selected /= svm_weights_selected.sum()\n\nplt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,\n width=.2, label='SVM weights after selection', color='c',\n edgecolor='black')\n\n\nplt.title(\"Comparing feature selection\")\nplt.xlabel('Feature number')\nplt.yticks(())\nplt.axis('tight')\nplt.legend(loc='upper right')\nplt.show()"
   ]
  }
 ],
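
The cell's new source is hard to read as escaped JSON; unpacked, the updated example holds out a test split and fits the univariate selector and the SVM on the training data only. A minimal, roughly equivalent sketch (the estimator names, k=4 and the random seeds are taken from the new cell source; pipeline indexing such as clf[-1] assumes scikit-learn 0.21 or later):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

# Iris features plus 20 uninformative noise columns, as in the example.
X, y = load_iris(return_X_y=True)
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))
X = np.hstack((X, E))

# Hold out a test set so that feature selection never sees the evaluation data.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Selection, scaling and the linear SVM are fitted together on the training split.
clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())
clf_selected.fit(X_train, y_train)
print('Accuracy after univariate feature selection: {:.3f}'.format(
    clf_selected.score(X_test, y_test)))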

dev/_downloads/d533f86417afef3237ab99bfcb87321c/plot_feature_selection.py

Lines changed: 34 additions & 19 deletions
@@ -1,7 +1,7 @@
 """
-===============================
+============================
 Univariate Feature Selection
-===============================
+============================
 
 An example showing univariate feature selection.
 
@@ -24,21 +24,29 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-from sklearn import datasets, svm
-from sklearn.feature_selection import SelectPercentile, f_classif
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.svm import LinearSVC
+from sklearn.pipeline import make_pipeline
+from sklearn.feature_selection import SelectKBest, f_classif
 
 # #############################################################################
 # Import some data to play with
 
 # The iris dataset
-iris = datasets.load_iris()
+X, y = load_iris(return_X_y=True)
 
 # Some noisy data not correlated
-E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))
+E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))
 
 # Add the noisy data to the informative features
-X = np.hstack((iris.data, E))
-y = iris.target
+X = np.hstack((X, E))
+
+# Split dataset to select feature and evaluate the classifier
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, stratify=y, random_state=0
+)
 
 plt.figure(1)
 plt.clf()
@@ -47,9 +55,10 @@
 
 # #############################################################################
 # Univariate feature selection with F-test for feature scoring
-# We use the default selection function: the 10% most significant features
-selector = SelectPercentile(f_classif, percentile=10)
-selector.fit(X, y)
+# We use the default selection function to select the four
+# most significant features
+selector = SelectKBest(f_classif, k=4)
+selector.fit(X_train, y_train)
 scores = -np.log10(selector.pvalues_)
 scores /= scores.max()
 plt.bar(X_indices - .45, scores, width=.2,
@@ -58,20 +67,26 @@
 
 # #############################################################################
 # Compare to the weights of an SVM
-clf = svm.SVC(kernel='linear')
-clf.fit(X, y)
+clf = make_pipeline(MinMaxScaler(), LinearSVC())
+clf.fit(X_train, y_train)
+print('Classification accuracy without selecting features: {:.3f}'
+      .format(clf.score(X_test, y_test)))
 
-svm_weights = (clf.coef_ ** 2).sum(axis=0)
-svm_weights /= svm_weights.max()
+svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
+svm_weights /= svm_weights.sum()
 
 plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight',
         color='navy', edgecolor='black')
 
-clf_selected = svm.SVC(kernel='linear')
-clf_selected.fit(selector.transform(X), y)
+clf_selected = make_pipeline(
+        SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC()
+)
+clf_selected.fit(X_train, y_train)
+print('Classification accuracy after univariate feature selection: {:.3f}'
+      .format(clf_selected.score(X_test, y_test)))
 
-svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
-svm_weights_selected /= svm_weights_selected.max()
+svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
+svm_weights_selected /= svm_weights_selected.sum()
 
 plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
         width=.2, label='SVM weights after selection', color='c',
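
Once the pipeline from the new script is fitted, the quantities the example plots can be read back from its steps: get_support() on the SelectKBest step gives the surviving column indices, and the bar heights are the absolute LinearSVC coefficients summed over classes and normalized. A small sketch, assuming clf_selected is the fitted pipeline from the code above (integer step indexing requires scikit-learn 0.21 or later):

import numpy as np

# Columns of the stacked (iris + noise) matrix kept by SelectKBest, the first pipeline step.
kept = clf_selected[0].get_support(indices=True)
print('Selected feature indices:', kept)

# Weights plotted as 'SVM weights after selection': absolute coefficients of the
# final LinearSVC, summed over the one-vs-rest classifiers and normalized to sum to 1.
svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()
print('Normalized SVM weights:', svm_weights_selected)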

dev/_downloads/scikit-learn-docs.pdf (26.8 KB)
Binary file not shown.

dev/_images/iris.png (0 Bytes)
