
Commit d8b7a22

Pushing the docs to dev/ for branch: master, commit 9014a6f02d0ce801ac6da0cdd7dc304f30fb5a65
1 parent 8187798 commit d8b7a22

File tree

1,184 files changed (+3886, -3809 lines)


dev/_downloads/171a3f824958ccf6a73f531421087204/plot_feature_selection.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
   },
   "outputs": [],
   "source": [
-
"print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets, svm\nfrom sklearn.feature_selection import SelectPercentile, f_classif\n\n# #############################################################################\n# Import some data to play with\n\n# The iris dataset\niris = datasets.load_iris()\n\n# Some noisy data not correlated\nE = np.random.uniform(0, 0.1, size=(len(iris.data), 20))\n\n# Add the noisy data to the informative features\nX = np.hstack((iris.data, E))\ny = iris.target\n\nplt.figure(1)\nplt.clf()\n\nX_indices = np.arange(X.shape[-1])\n\n# #############################################################################\n# Univariate feature selection with F-test for feature scoring\n# We use the default selection function: the 10% most significant features\nselector = SelectPercentile(f_classif, percentile=10)\nselector.fit(X, y)\nscores = -np.log10(selector.pvalues_)\nscores /= scores.max()\nplt.bar(X_indices - .45, scores, width=.2,\n label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',\n edgecolor='black')\n\n# #############################################################################\n# Compare to the weights of an SVM\nclf = svm.SVC(kernel='linear')\nclf.fit(X, y)\n\nsvm_weights = (clf.coef_ ** 2).sum(axis=0)\nsvm_weights /= svm_weights.max()\n\nplt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight',\n color='navy', edgecolor='black')\n\nclf_selected = svm.SVC(kernel='linear')\nclf_selected.fit(selector.transform(X), y)\n\nsvm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)\nsvm_weights_selected /= svm_weights_selected.max()\n\nplt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,\n width=.2, label='SVM weights after selection', color='c',\n edgecolor='black')\n\n\nplt.title(\"Comparing feature selection\")\nplt.xlabel('Feature number')\nplt.yticks(())\nplt.axis('tight')\nplt.legend(loc='upper right')\nplt.show()"
+
"print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import load_iris\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.svm import LinearSVC\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.feature_selection import SelectKBest, f_classif\n\n# #############################################################################\n# Import some data to play with\n\n# The iris dataset\nX, y = load_iris(return_X_y=True)\n\n# Some noisy data not correlated\nE = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))\n\n# Add the noisy data to the informative features\nX = np.hstack((X, E))\n\n# Split dataset to select feature and evaluate the classifier\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, stratify=y, random_state=0\n)\n\nplt.figure(1)\nplt.clf()\n\nX_indices = np.arange(X.shape[-1])\n\n# #############################################################################\n# Univariate feature selection with F-test for feature scoring\n# We use the default selection function to select the four\n# most significant features\nselector = SelectKBest(f_classif, k=4)\nselector.fit(X_train, y_train)\nscores = -np.log10(selector.pvalues_)\nscores /= scores.max()\nplt.bar(X_indices - .45, scores, width=.2,\n label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',\n edgecolor='black')\n\n# #############################################################################\n# Compare to the weights of an SVM\nclf = make_pipeline(MinMaxScaler(), LinearSVC())\nclf.fit(X_train, y_train)\nprint('Classification accuracy without selecting features: {:.3f}'\n .format(clf.score(X_test, y_test)))\n\nsvm_weights = np.abs(clf[-1].coef_).sum(axis=0)\nsvm_weights /= svm_weights.sum()\n\nplt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight',\n color='navy', edgecolor='black')\n\nclf_selected = make_pipeline(\n SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC()\n)\nclf_selected.fit(X_train, y_train)\nprint('Classification accuracy after univariate feature selection: {:.3f}'\n .format(clf_selected.score(X_test, y_test)))\n\nsvm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)\nsvm_weights_selected /= svm_weights_selected.sum()\n\nplt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,\n width=.2, label='SVM weights after selection', color='c',\n edgecolor='black')\n\n\nplt.title(\"Comparing feature selection\")\nplt.xlabel('Feature number')\nplt.yticks(())\nplt.axis('tight')\nplt.legend(loc='upper right')\nplt.show()"
   ]
  }
 ],
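
The cell's new source is hard to read as escaped JSON; unpacked, the updated example holds out a test split and fits the univariate selector and the SVM on the training data only. A minimal, roughly equivalent sketch (the estimator names, k=4 and the random seeds are taken from the new cell source; pipeline indexing such as clf[-1] assumes scikit-learn 0.21 or later):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

# Iris features plus 20 uninformative noise columns, as in the example.
X, y = load_iris(return_X_y=True)
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))
X = np.hstack((X, E))

# Hold out a test set so that feature selection never sees the evaluation data.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Selection, scaling and the linear SVM are fitted together on the training split.
clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())
clf_selected.fit(X_train, y_train)
print('Accuracy after univariate feature selection: {:.3f}'.format(
    clf_selected.score(X_test, y_test)))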

dev/_downloads/d533f86417afef3237ab99bfcb87321c/plot_feature_selection.py

Lines changed: 34 additions & 19 deletions
@@ -1,7 +1,7 @@
 """
-===============================
+============================
 Univariate Feature Selection
-===============================
+============================
 
 An example showing univariate feature selection.
 
@@ -24,21 +24,29 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-from sklearn import datasets, svm
-from sklearn.feature_selection import SelectPercentile, f_classif
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.svm import LinearSVC
+from sklearn.pipeline import make_pipeline
+from sklearn.feature_selection import SelectKBest, f_classif
 
 # #############################################################################
 # Import some data to play with
 
 # The iris dataset
-iris = datasets.load_iris()
+X, y = load_iris(return_X_y=True)
 
 # Some noisy data not correlated
-E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))
+E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))
 
 # Add the noisy data to the informative features
-X = np.hstack((iris.data, E))
-y = iris.target
+X = np.hstack((X, E))
+
+# Split dataset to select feature and evaluate the classifier
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, stratify=y, random_state=0
+)
 
 plt.figure(1)
 plt.clf()
@@ -47,9 +55,10 @@
 
 # #############################################################################
 # Univariate feature selection with F-test for feature scoring
-# We use the default selection function: the 10% most significant features
-selector = SelectPercentile(f_classif, percentile=10)
-selector.fit(X, y)
+# We use the default selection function to select the four
+# most significant features
+selector = SelectKBest(f_classif, k=4)
+selector.fit(X_train, y_train)
 scores = -np.log10(selector.pvalues_)
 scores /= scores.max()
 plt.bar(X_indices - .45, scores, width=.2,
@@ -58,20 +67,26 @@
 
 # #############################################################################
 # Compare to the weights of an SVM
-clf = svm.SVC(kernel='linear')
-clf.fit(X, y)
+clf = make_pipeline(MinMaxScaler(), LinearSVC())
+clf.fit(X_train, y_train)
+print('Classification accuracy without selecting features: {:.3f}'
+      .format(clf.score(X_test, y_test)))
 
-svm_weights = (clf.coef_ ** 2).sum(axis=0)
-svm_weights /= svm_weights.max()
+svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
+svm_weights /= svm_weights.sum()
 
 plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight',
         color='navy', edgecolor='black')
 
-clf_selected = svm.SVC(kernel='linear')
-clf_selected.fit(selector.transform(X), y)
+clf_selected = make_pipeline(
+        SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC()
+)
+clf_selected.fit(X_train, y_train)
+print('Classification accuracy after univariate feature selection: {:.3f}'
+      .format(clf_selected.score(X_test, y_test)))
 
-svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
-svm_weights_selected /= svm_weights_selected.max()
+svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
+svm_weights_selected /= svm_weights_selected.sum()
 
 plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
         width=.2, label='SVM weights after selection', color='c',
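
Once the pipeline from the new script is fitted, the quantities the example plots can be read back from its steps: get_support() on the SelectKBest step gives the surviving column indices, and the bar heights are the absolute LinearSVC coefficients summed over classes and normalized. A small sketch, assuming clf_selected is the fitted pipeline from the code above (integer step indexing requires scikit-learn 0.21 or later):

import numpy as np

# Columns of the stacked (iris + noise) matrix kept by SelectKBest, the first pipeline step.
kept = clf_selected[0].get_support(indices=True)
print('Selected feature indices:', kept)

# Weights plotted as 'SVM weights after selection': absolute coefficients of the
# final LinearSVC, summed over the one-vs-rest classifiers and normalized to sum to 1.
svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()
print('Normalized SVM weights:', svm_weights_selected)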

dev/_downloads/scikit-learn-docs.pdf (26.8 KB)
Binary file not shown.

dev/_images/iris.png (0 Bytes)
