
Commit 058afb9

Pushing the docs to dev/ for branch: main, commit 57e3523888326e0223ae49c12fce3ce4d3b2d906
1 parent c8ce22b commit 058afb9

1,219 files changed: +5047 −4524 lines


dev/_downloads/010337852815f8103ac6cca38a812b3c/plot_roc_crossval.py

Lines changed: 12 additions & 9 deletions
@@ -30,16 +30,12 @@
 
 """
 
+# %%
+# Data IO and generation
+# ----------------------
 import numpy as np
-import matplotlib.pyplot as plt
-
-from sklearn import svm, datasets
-from sklearn.metrics import auc
-from sklearn.metrics import RocCurveDisplay
-from sklearn.model_selection import StratifiedKFold
 
-# #############################################################################
-# Data IO and generation
+from sklearn import datasets
 
 # Import some data to play with
 iris = datasets.load_iris()
@@ -52,8 +48,15 @@
 random_state = np.random.RandomState(0)
 X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
 
-# #############################################################################
+# %%
 # Classification and ROC analysis
+# -------------------------------
+import matplotlib.pyplot as plt
+
+from sklearn import svm
+from sklearn.metrics import auc
+from sklearn.metrics import RocCurveDisplay
+from sklearn.model_selection import StratifiedKFold
 
 # Run classifier with cross-validation and plot ROC curves
 cv = StratifiedKFold(n_splits=6)

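The change above sets the pattern repeated across the files below: the old "# ###..." banner comments become sphinx-gallery notebook cells, where a "# %%" marker plus an underlined title opens a section and each import moves into the first section that uses it. A condensed sketch of the resulting script layout, assembled from the added lines of this diff:

# %%
# Data IO and generation
# ----------------------
import numpy as np

from sklearn import datasets

iris = datasets.load_iris()

# %%
# Classification and ROC analysis
# -------------------------------
import matplotlib.pyplot as plt

from sklearn import svm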
dev/_downloads/055e8313e28f2f3b5fd508054dfe5fe0/plot_roc_crossval.ipynb

Lines changed: 26 additions & 1 deletion
@@ -18,6 +18,31 @@
 "\n# Receiver Operating Characteristic (ROC) with cross validation\n\nExample of Receiver Operating Characteristic (ROC) metric to evaluate\nclassifier output quality using cross-validation.\n\nROC curves typically feature true positive rate on the Y axis, and false\npositive rate on the X axis. This means that the top left corner of the plot is\nthe \"ideal\" point - a false positive rate of zero, and a true positive rate of\none. This is not very realistic, but it does mean that a larger area under the\ncurve (AUC) is usually better.\n\nThe \"steepness\" of ROC curves is also important, since it is ideal to maximize\nthe true positive rate while minimizing the false positive rate.\n\nThis example shows the ROC response of different datasets, created from K-fold\ncross-validation. Taking all of these curves, it is possible to calculate the\nmean area under curve, and see the variance of the curve when the\ntraining set is split into different subsets. This roughly shows how the\nclassifier output is affected by changes in the training data, and how\ndifferent the splits generated by K-fold cross-validation are from one another.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>See also :func:`sklearn.metrics.roc_auc_score`,\n :func:`sklearn.model_selection.cross_val_score`,\n `sphx_glr_auto_examples_model_selection_plot_roc.py`,</p></div>\n"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Data IO and generation\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import numpy as np\n\nfrom sklearn import datasets\n\n# Import some data to play with\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\nX, y = X[y != 2], y[y != 2]\nn_samples, n_features = X.shape\n\n# Add noisy features\nrandom_state = np.random.RandomState(0)\nX = np.c_[X, random_state.randn(n_samples, 200 * n_features)]"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Classification and ROC analysis\n\n"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -26,7 +51,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import svm, datasets\nfrom sklearn.metrics import auc\nfrom sklearn.metrics import RocCurveDisplay\nfrom sklearn.model_selection import StratifiedKFold\n\n# #############################################################################\n# Data IO and generation\n\n# Import some data to play with\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\nX, y = X[y != 2], y[y != 2]\nn_samples, n_features = X.shape\n\n# Add noisy features\nrandom_state = np.random.RandomState(0)\nX = np.c_[X, random_state.randn(n_samples, 200 * n_features)]\n\n# #############################################################################\n# Classification and ROC analysis\n\n# Run classifier with cross-validation and plot ROC curves\ncv = StratifiedKFold(n_splits=6)\nclassifier = svm.SVC(kernel=\"linear\", probability=True, random_state=random_state)\n\ntprs = []\naucs = []\nmean_fpr = np.linspace(0, 1, 100)\n\nfig, ax = plt.subplots()\nfor i, (train, test) in enumerate(cv.split(X, y)):\n classifier.fit(X[train], y[train])\n viz = RocCurveDisplay.from_estimator(\n classifier,\n X[test],\n y[test],\n name=\"ROC fold {}\".format(i),\n alpha=0.3,\n lw=1,\n ax=ax,\n )\n interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n interp_tpr[0] = 0.0\n tprs.append(interp_tpr)\n aucs.append(viz.roc_auc)\n\nax.plot([0, 1], [0, 1], linestyle=\"--\", lw=2, color=\"r\", label=\"Chance\", alpha=0.8)\n\nmean_tpr = np.mean(tprs, axis=0)\nmean_tpr[-1] = 1.0\nmean_auc = auc(mean_fpr, mean_tpr)\nstd_auc = np.std(aucs)\nax.plot(\n mean_fpr,\n mean_tpr,\n color=\"b\",\n label=r\"Mean ROC (AUC = %0.2f $\\pm$ %0.2f)\" % (mean_auc, std_auc),\n lw=2,\n alpha=0.8,\n)\n\nstd_tpr = np.std(tprs, axis=0)\ntprs_upper = np.minimum(mean_tpr + std_tpr, 1)\ntprs_lower = np.maximum(mean_tpr - std_tpr, 0)\nax.fill_between(\n mean_fpr,\n tprs_lower,\n tprs_upper,\n color=\"grey\",\n alpha=0.2,\n label=r\"$\\pm$ 1 std. dev.\",\n)\n\nax.set(\n xlim=[-0.05, 1.05],\n ylim=[-0.05, 1.05],\n title=\"Receiver operating characteristic example\",\n)\nax.legend(loc=\"lower right\")\nplt.show()"
+"import matplotlib.pyplot as plt\n\nfrom sklearn import svm\nfrom sklearn.metrics import auc\nfrom sklearn.metrics import RocCurveDisplay\nfrom sklearn.model_selection import StratifiedKFold\n\n# Run classifier with cross-validation and plot ROC curves\ncv = StratifiedKFold(n_splits=6)\nclassifier = svm.SVC(kernel=\"linear\", probability=True, random_state=random_state)\n\ntprs = []\naucs = []\nmean_fpr = np.linspace(0, 1, 100)\n\nfig, ax = plt.subplots()\nfor i, (train, test) in enumerate(cv.split(X, y)):\n classifier.fit(X[train], y[train])\n viz = RocCurveDisplay.from_estimator(\n classifier,\n X[test],\n y[test],\n name=\"ROC fold {}\".format(i),\n alpha=0.3,\n lw=1,\n ax=ax,\n )\n interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n interp_tpr[0] = 0.0\n tprs.append(interp_tpr)\n aucs.append(viz.roc_auc)\n\nax.plot([0, 1], [0, 1], linestyle=\"--\", lw=2, color=\"r\", label=\"Chance\", alpha=0.8)\n\nmean_tpr = np.mean(tprs, axis=0)\nmean_tpr[-1] = 1.0\nmean_auc = auc(mean_fpr, mean_tpr)\nstd_auc = np.std(aucs)\nax.plot(\n mean_fpr,\n mean_tpr,\n color=\"b\",\n label=r\"Mean ROC (AUC = %0.2f $\\pm$ %0.2f)\" % (mean_auc, std_auc),\n lw=2,\n alpha=0.8,\n)\n\nstd_tpr = np.std(tprs, axis=0)\ntprs_upper = np.minimum(mean_tpr + std_tpr, 1)\ntprs_lower = np.maximum(mean_tpr - std_tpr, 0)\nax.fill_between(\n mean_fpr,\n tprs_lower,\n tprs_upper,\n color=\"grey\",\n alpha=0.2,\n label=r\"$\\pm$ 1 std. dev.\",\n)\n\nax.set(\n xlim=[-0.05, 1.05],\n ylim=[-0.05, 1.05],\n title=\"Receiver operating characteristic example\",\n)\nax.legend(loc=\"lower right\")\nplt.show()"
 ]
 }
 ],

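The .ipynb diff above is the generated counterpart of the .py change: sphinx-gallery turns each "# %%" block of the script into a markdown cell (the rST title demoted to a "##" heading) followed by a code cell. A hand-written Python sketch of that correspondence, using the first cell added above; the mapping shown is illustrative, not sphinx-gallery's actual conversion code:

# The "# %%" block in the .py example script...
py_block = """\
# %%
# Data IO and generation
# ----------------------
import numpy as np
"""

# ...appears in the generated .ipynb as a markdown cell plus a code cell.
ipynb_cells = [
    {"cell_type": "markdown", "metadata": {},
     "source": ["## Data IO and generation\n\n"]},
    {"cell_type": "code", "execution_count": None,
     "metadata": {"collapsed": False}, "outputs": [],
     "source": ["import numpy as np"]},
]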
dev/_downloads/41973816d3932cd07b75d8825fd2c13d/plot_svm_anova.py

Lines changed: 20 additions & 13 deletions
@@ -10,26 +10,29 @@
 
 """
 
+# %%
+# Load some data to play with
+# ---------------------------
 import numpy as np
-import matplotlib.pyplot as plt
 from sklearn.datasets import load_iris
-from sklearn.feature_selection import SelectPercentile, chi2
-from sklearn.model_selection import cross_val_score
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
-from sklearn.svm import SVC
-
 
-# #############################################################################
-# Import some data to play with
 X, y = load_iris(return_X_y=True)
+
 # Add non-informative features
-np.random.seed(0)
-X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))
+rng = np.random.RandomState(0)
+X = np.hstack((X, 2 * rng.random((X.shape[0], 36))))
+
+# %%
+# Create the pipeline
+# -------------------
+from sklearn.pipeline import Pipeline
+from sklearn.feature_selection import SelectPercentile, chi2
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC
 
-# #############################################################################
 # Create a feature-selection transform, a scaler and an instance of SVM that we
 # combine together to have a full-blown estimator
+
 clf = Pipeline(
     [
         ("anova", SelectPercentile(chi2)),
@@ -38,8 +41,12 @@
     ]
 )
 
-# #############################################################################
+# %%
 # Plot the cross-validation score as a function of percentile of features
+# -----------------------------------------------------------------------
+import matplotlib.pyplot as plt
+from sklearn.model_selection import cross_val_score
+
 score_means = list()
 score_stds = list()
 percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

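Beyond the cell markers, this diff makes one behavioral cleanup: the global np.random.seed(0) becomes a local rng = np.random.RandomState(0). A minimal sketch of the difference; only the rng lines come from the diff, the rest is illustrative:

import numpy as np

# Global seeding mutates process-wide state: any intervening call that
# draws from np.random shifts what the next line returns.
np.random.seed(0)
a = np.random.random(3)

# A dedicated generator owns its stream, so results stay reproducible
# no matter what other code does with np.random.
rng = np.random.RandomState(0)
b = rng.random(3)

assert np.allclose(a, b)  # same seed, same draws from the same stream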
dev/_downloads/6f4a6a0d8063b616c4aa4db2865de57c/plot_svm_anova.ipynb

Lines changed: 44 additions & 1 deletion
@@ -18,6 +18,49 @@
 "\n# SVM-Anova: SVM with univariate feature selection\n\nThis example shows how to perform univariate feature selection before running a\nSVC (support vector classifier) to improve the classification scores. We use\nthe iris dataset (4 features) and add 36 non-informative features. We can find\nthat our model achieves best performance when we select around 10% of features.\n"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Load some data to play with\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import numpy as np\nfrom sklearn.datasets import load_iris\n\nX, y = load_iris(return_X_y=True)\n\n# Add non-informative features\nrng = np.random.RandomState(0)\nX = np.hstack((X, 2 * rng.random((X.shape[0], 36))))"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Create the pipeline\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.pipeline import Pipeline\nfrom sklearn.feature_selection import SelectPercentile, chi2\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.svm import SVC\n\n# Create a feature-selection transform, a scaler and an instance of SVM that we\n# combine together to have a full-blown estimator\n\nclf = Pipeline(\n    [\n        (\"anova\", SelectPercentile(chi2)),\n        (\"scaler\", StandardScaler()),\n        (\"svc\", SVC(gamma=\"auto\")),\n    ]\n)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Plot the cross-validation score as a function of percentile of features\n\n"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -26,7 +69,7 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import load_iris\nfrom sklearn.feature_selection import SelectPercentile, chi2\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.svm import SVC\n\n\n# #############################################################################\n# Import some data to play with\nX, y = load_iris(return_X_y=True)\n# Add non-informative features\nnp.random.seed(0)\nX = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))\n\n# #############################################################################\n# Create a feature-selection transform, a scaler and an instance of SVM that we\n# combine together to have a full-blown estimator\nclf = Pipeline(\n    [\n        (\"anova\", SelectPercentile(chi2)),\n        (\"scaler\", StandardScaler()),\n        (\"svc\", SVC(gamma=\"auto\")),\n    ]\n)\n\n# #############################################################################\n# Plot the cross-validation score as a function of percentile of features\nscore_means = list()\nscore_stds = list()\npercentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)\n\nfor percentile in percentiles:\n    clf.set_params(anova__percentile=percentile)\n    this_scores = cross_val_score(clf, X, y)\n    score_means.append(this_scores.mean())\n    score_stds.append(this_scores.std())\n\nplt.errorbar(percentiles, score_means, np.array(score_stds))\nplt.title(\"Performance of the SVM-Anova varying the percentile of features selected\")\nplt.xticks(np.linspace(0, 100, 11, endpoint=True))\nplt.xlabel(\"Percentile\")\nplt.ylabel(\"Accuracy Score\")\nplt.axis(\"tight\")\nplt.show()"
+"import matplotlib.pyplot as plt\nfrom sklearn.model_selection import cross_val_score\n\nscore_means = list()\nscore_stds = list()\npercentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)\n\nfor percentile in percentiles:\n    clf.set_params(anova__percentile=percentile)\n    this_scores = cross_val_score(clf, X, y)\n    score_means.append(this_scores.mean())\n    score_stds.append(this_scores.std())\n\nplt.errorbar(percentiles, score_means, np.array(score_stds))\nplt.title(\"Performance of the SVM-Anova varying the percentile of features selected\")\nplt.xticks(np.linspace(0, 100, 11, endpoint=True))\nplt.xlabel(\"Percentile\")\nplt.ylabel(\"Accuracy Score\")\nplt.axis(\"tight\")\nplt.show()"
 ]
 }
 ],

dev/_downloads/788b8c55a85f84a55e652c6048c4f623/plot_bayesian_ridge.py

Lines changed: 29 additions & 15 deletions
@@ -24,14 +24,12 @@
 
 """
 
+# %%
+# Generate simulated data with Gaussian weights
+# ---------------------------------------------
 import numpy as np
-import matplotlib.pyplot as plt
 from scipy import stats
 
-from sklearn.linear_model import BayesianRidge, LinearRegression
-
-# #############################################################################
-# Generating simulated data with Gaussian weights
 np.random.seed(0)
 n_samples, n_features = 100, 100
 X = np.random.randn(n_samples, n_features)  # Create Gaussian data
@@ -40,6 +38,7 @@
 w = np.zeros(n_features)
 # Only keep 10 weights of interest
 relevant_features = np.random.randint(0, n_features, 10)
+
 for i in relevant_features:
     w[i] = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(lambda_))
 # Create noise with a precision alpha of 50.
@@ -48,17 +47,22 @@
 # Create the target
 y = np.dot(X, w) + noise
 
-# #############################################################################
+# %%
 # Fit the Bayesian Ridge Regression and an OLS for comparison
+# -----------------------------------------------------------
+from sklearn.linear_model import BayesianRidge, LinearRegression
+
 clf = BayesianRidge(compute_score=True)
 clf.fit(X, y)
 
 ols = LinearRegression()
 ols.fit(X, y)
 
-# #############################################################################
-# Plot true weights, estimated weights, histogram of the weights, and
-# predictions with standard deviations
+# %%
+# Plot true weights and estimated weights
+# ---------------------------------------
+import matplotlib.pyplot as plt
+
 lw = 2
 plt.figure(figsize=(6, 5))
 plt.title("Weights of the model")
@@ -67,7 +71,11 @@
 plt.plot(ols.coef_, color="navy", linestyle="--", label="OLS estimate")
 plt.xlabel("Features")
 plt.ylabel("Values of the weights")
-plt.legend(loc="best", prop=dict(size=12))
+_ = plt.legend(loc="best", prop=dict(size=12))
+
+# %%
+# Plot histogram of the weights
+# -----------------------------
 
 plt.figure(figsize=(6, 5))
 plt.title("Histogram of the weights")
@@ -80,16 +88,23 @@
 )
 plt.ylabel("Features")
 plt.xlabel("Values of the weights")
-plt.legend(loc="upper left")
+_ = plt.legend(loc="upper left")
+
+# %%
+# Plot marginal log-likelihood
+# ----------------------------
 
 plt.figure(figsize=(6, 5))
 plt.title("Marginal log-likelihood")
 plt.plot(clf.scores_, color="navy", linewidth=lw)
 plt.ylabel("Score")
-plt.xlabel("Iterations")
+_ = plt.xlabel("Iterations")
+
+# %%
+# Plot some predictions for polynomial regression with standard deviations
+# ------------------------------------------------------------------------
 
 
-# Plotting some predictions for polynomial regression
 def f(x, noise_amount):
     y = np.sqrt(x) * np.sin(x)
     noise = np.random.normal(0, 1, len(x))
@@ -117,5 +132,4 @@ def f(x, noise_amount):
 plt.plot(X_plot, y_plot, color="gold", linewidth=lw, label="Ground Truth")
 plt.ylabel("Output y")
 plt.xlabel("Feature X")
-plt.legend(loc="lower left")
-plt.show()
+_ = plt.legend(loc="lower left")

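A recurring edit in this file binds plotting calls to a throwaway name, e.g. _ = plt.legend(...). When the last expression of a notebook cell is a bare call, the rendered output echoes the return value's repr (such as a matplotlib Legend object); assigning it to "_" suppresses that. A small illustrative sketch of the idiom (the figure here is made up for the example):

import matplotlib.pyplot as plt

plt.figure(figsize=(6, 5))
plt.plot([0, 1], [1, 0], label="example")

# As the final expression of a cell, a bare plt.legend(...) would print
# something like <matplotlib.legend.Legend object at 0x...> into the
# rendered page; binding the result to "_" discards that repr.
_ = plt.legend(loc="lower left")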