scikit-learn
diff --git a/‎dev/_downloads/auto_examples_jupyter.zip
124 Bytes b/‎dev/_downloads/auto_examples_jupyter.zip
124 Bytes
diff --git a/‎dev/_downloads/auto_examples_python.zip
128 Bytes b/‎dev/_downloads/auto_examples_python.zip
128 Bytes
diff --git a/‎dev/_downloads/plot_svm_anova.ipynb
Lines changed: 2 additions & 2 deletions b/‎dev/_downloads/plot_svm_anova.ipynb
Lines changed: 2 additions & 2 deletions
diff --git a/‎dev/_downloads/plot_svm_anova.py
Lines changed: 16 additions & 20 deletions b/‎dev/_downloads/plot_svm_anova.py
Lines changed: 16 additions & 20 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.pdf
7.98 KB b/‎dev/_downloads/scikit-learn-docs.pdf
7.98 KB
diff --git a/‎dev/_images/iris.png
0 Bytes b/‎dev/_images/iris.png
0 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-187 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-187 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
-187 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
-187 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
133 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
133 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0031.png
133 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0031.png
133 Bytes
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n=================================================\nSVM-Anova: SVM with univariate feature selection\n=================================================\n\nThis example shows how to perform univariate feature selection before running a\nSVC (support vector classifier) to improve the classification scores.\n\n"
+        "\n=================================================\nSVM-Anova: SVM with univariate feature selection\n=================================================\n\nThis example shows how to perform univariate feature selection before running a\nSVC (support vector classifier) to improve the classification scores. We use\nthe iris dataset (4 features) and add 36 non-informative features. We can find\nthat our model achieves best performance when we select around 10% of features.\n\n"
       ]
     },
     {
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.feature_selection import SelectPercentile, chi2\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import SVC\n\n\n# #############################################################################\n# Import some data to play with\nX, y = load_digits(return_X_y=True)\n# Throw away data, to be in the curse of dimension settings\nX = X[:200]\ny = y[:200]\nn_samples = len(y)\nX = X.reshape((n_samples, -1))\n# add 200 non-informative features\nX = np.hstack((X, 2 * np.random.random((n_samples, 200))))\n\n# #############################################################################\n# Create a feature-selection transform and an instance of SVM that we\n# combine together to have an full-blown estimator\n\ntransform = SelectPercentile(chi2)\n\nclf = Pipeline([('anova', transform), ('svc', SVC(gamma=\"auto\"))])\n\n# #############################################################################\n# Plot the cross-validation score as a function of percentile of features\nscore_means = list()\nscore_stds = list()\npercentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)\n\nfor percentile in percentiles:\n    clf.set_params(anova__percentile=percentile)\n    # Compute cross-validation score using 1 CPU\n    this_scores = cross_val_score(clf, X, y, cv=5, n_jobs=1)\n    score_means.append(this_scores.mean())\n    score_stds.append(this_scores.std())\n\nplt.errorbar(percentiles, score_means, np.array(score_stds))\n\nplt.title(\n    'Performance of the SVM-Anova varying the percentile of features selected')\nplt.xlabel('Percentile')\nplt.ylabel('Prediction rate')\n\nplt.axis('tight')\nplt.show()"
+        "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_iris\nfrom sklearn.feature_selection import SelectPercentile, chi2\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.svm import SVC\n\n\n# #############################################################################\n# Import some data to play with\nX, y = load_iris(return_X_y=True)\n# Add non-informative features\nnp.random.seed(0)\nX = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))\n\n# #############################################################################\n# Create a feature-selection transform, a scaler and an instance of SVM that we\n# combine together to have an full-blown estimator\nclf = Pipeline([('anova', SelectPercentile(chi2)),\n                ('scaler', StandardScaler()),\n                ('svc', SVC(gamma=\"auto\"))])\n\n# #############################################################################\n# Plot the cross-validation score as a function of percentile of features\nscore_means = list()\nscore_stds = list()\npercentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)\n\nfor percentile in percentiles:\n    clf.set_params(anova__percentile=percentile)\n    this_scores = cross_val_score(clf, X, y, cv=5)\n    score_means.append(this_scores.mean())\n    score_stds.append(this_scores.std())\n\nplt.errorbar(percentiles, score_means, np.array(score_stds))\nplt.title(\n    'Performance of the SVM-Anova varying the percentile of features selected')\nplt.xticks(np.linspace(0, 100, 11, endpoint=True))\nplt.xlabel('Percentile')\nplt.ylabel('Accuracy Score')\nplt.axis('tight')\nplt.show()"
       ]
     }
   ],
 
@@ -4,37 +4,35 @@
 =================================================
 
 This example shows how to perform univariate feature selection before running a
-SVC (support vector classifier) to improve the classification scores.
+SVC (support vector classifier) to improve the classification scores. We use
+the iris dataset (4 features) and add 36 non-informative features. The model
+achieves its best performance when around 10% of the features are selected.
 """
 print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
-from sklearn.datasets import load_digits
+from sklearn.datasets import load_iris
 from sklearn.feature_selection import SelectPercentile, chi2
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 
 
 # #############################################################################
 # Import some data to play with
-X, y = load_digits(return_X_y=True)
-# Throw away data, to be in the curse of dimension settings
-X = X[:200]
-y = y[:200]
-n_samples = len(y)
-X = X.reshape((n_samples, -1))
-# add 200 non-informative features
-X = np.hstack((X, 2 * np.random.random((n_samples, 200))))
+X, y = load_iris(return_X_y=True)
+# Add non-informative features
+np.random.seed(0)
+X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))
 
 # #############################################################################
-# Create a feature-selection transform and an instance of SVM that we
+# Create a feature-selection transform, a scaler and an instance of SVM that we
 # combine together to form a full-blown estimator
-
-transform = SelectPercentile(chi2)
-
-clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])
+clf = Pipeline([('anova', SelectPercentile(chi2)),
+                ('scaler', StandardScaler()),
+                ('svc', SVC(gamma="auto"))])
 
 # #############################################################################
 # Plot the cross-validation score as a function of percentile of features
@@ -44,17 +42,15 @@
 
 for percentile in percentiles:
     clf.set_params(anova__percentile=percentile)
-    # Compute cross-validation score using 1 CPU
-    this_scores = cross_val_score(clf, X, y, cv=5, n_jobs=1)
+    this_scores = cross_val_score(clf, X, y, cv=5)
     score_means.append(this_scores.mean())
     score_stds.append(this_scores.std())
 
 plt.errorbar(percentiles, score_means, np.array(score_stds))
-
 plt.title(
     'Performance of the SVM-Anova varying the percentile of features selected')
+plt.xticks(np.linspace(0, 100, 11, endpoint=True))
 plt.xlabel('Percentile')
-plt.ylabel('Prediction rate')
-
+plt.ylabel('Accuracy Score')
 plt.axis('tight')
 plt.show()
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`"cell_type": "markdown",`
`16`	`16`	`"metadata": {},`
`17`	`17`	`"source": [`
`18`		`- "\n=================================================\nSVM-Anova: SVM with univariate feature selection\n=================================================\n\nThis example shows how to perform univariate feature selection before running a\nSVC (support vector classifier) to improve the classification scores.\n\n"`
	`18`	`+ "\n=================================================\nSVM-Anova: SVM with univariate feature selection\n=================================================\n\nThis example shows how to perform univariate feature selection before running a\nSVC (support vector classifier) to improve the classification scores. We use\nthe iris dataset (4 features) and add 36 non-informative features. We can find\nthat our model achieves best performance when we select around 10% of features.\n\n"`
`19`	`19`	`]`
`20`	`20`	`},`
`21`	`21`	`{`
`@@ -26,7 +26,7 @@`
`26`	`26`	`},`
`27`	`27`	`"outputs": [],`
`28`	`28`	`"source": [`
`29`		- "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_digits\nfrom sklearn.feature_selection import SelectPercentile, chi2\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.svm import SVC\n\n\n# #############################################################################\n# Import some data to play with\nX, y = load_digits(return_X_y=True)\n# Throw away data, to be in the curse of dimension settings\nX = X[:200]\ny = y[:200]\nn_samples = len(y)\nX = X.reshape((n_samples, -1))\n# add 200 non-informative features\nX = np.hstack((X, 2 * np.random.random((n_samples, 200))))\n\n# #############################################################################\n# Create a feature-selection transform and an instance of SVM that we\n# combine together to have an full-blown estimator\n\ntransform = SelectPercentile(chi2)\n\nclf = Pipeline([('anova', transform), ('svc', SVC(gamma=\"auto\"))])\n\n# #############################################################################\n# Plot the cross-validation score as a function of percentile of features\nscore_means = list()\nscore_stds = list()\npercentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)\n\nfor percentile in percentiles:\n clf.set_params(anova__percentile=percentile)\n # Compute cross-validation score using 1 CPU\n this_scores = cross_val_score(clf, X, y, cv=5, n_jobs=1)\n score_means.append(this_scores.mean())\n score_stds.append(this_scores.std())\n\nplt.errorbar(percentiles, score_means, np.array(score_stds))\n\nplt.title(\n 'Performance of the SVM-Anova varying the percentile of features selected')\nplt.xlabel('Percentile')\nplt.ylabel('Prediction rate')\n\nplt.axis('tight')\nplt.show()"
	`29`	+ "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_iris\nfrom sklearn.feature_selection import SelectPercentile, chi2\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.svm import SVC\n\n\n# #############################################################################\n# Import some data to play with\nX, y = load_iris(return_X_y=True)\n# Add non-informative features\nnp.random.seed(0)\nX = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))\n\n# #############################################################################\n# Create a feature-selection transform, a scaler and an instance of SVM that we\n# combine together to have an full-blown estimator\nclf = Pipeline([('anova', SelectPercentile(chi2)),\n ('scaler', StandardScaler()),\n ('svc', SVC(gamma=\"auto\"))])\n\n# #############################################################################\n# Plot the cross-validation score as a function of percentile of features\nscore_means = list()\nscore_stds = list()\npercentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)\n\nfor percentile in percentiles:\n clf.set_params(anova__percentile=percentile)\n this_scores = cross_val_score(clf, X, y, cv=5)\n score_means.append(this_scores.mean())\n score_stds.append(this_scores.std())\n\nplt.errorbar(percentiles, score_means, np.array(score_stds))\nplt.title(\n 'Performance of the SVM-Anova varying the percentile of features selected')\nplt.xticks(np.linspace(0, 100, 11, endpoint=True))\nplt.xlabel('Percentile')\nplt.ylabel('Accuracy Score')\nplt.axis('tight')\nplt.show()"
`30`	`30`	`]`
`31`	`31`	`}`
`32`	`32`	`],`