|
15 | 15 | "cell_type": "markdown",
|
16 | 16 | "metadata": {},
|
17 | 17 | "source": [
|
18 |
| - "\n# Restricted Boltzmann Machine features for digit classification\n\nFor greyscale image data where pixel values can be interpreted as degrees of\nblackness on a white background, like handwritten digit recognition, the\nBernoulli Restricted Boltzmann machine model (:class:`BernoulliRBM\n<sklearn.neural_network.BernoulliRBM>`) can perform effective non-linear\nfeature extraction.\n\nIn order to learn good latent representations from a small dataset, we\nartificially generate more labeled data by perturbing the training data with\nlinear shifts of 1 pixel in each direction.\n\nThis example shows how to build a classification pipeline with a BernoulliRBM\nfeature extractor and a :class:`LogisticRegression\n<sklearn.linear_model.LogisticRegression>` classifier. The hyperparameters\nof the entire model (learning rate, hidden layer size, regularization)\nwere optimized by grid search, but the search is not reproduced here because\nof runtime constraints.\n\nLogistic regression on raw pixel values is presented for comparison. The\nexample shows that the features extracted by the BernoulliRBM help improve the\nclassification accuracy.\n" |
| 18 | + "\n# Restricted Boltzmann Machine features for digit classification\n\nFor greyscale image data where pixel values can be interpreted as degrees of\nblackness on a white background, like handwritten digit recognition, the\nBernoulli Restricted Boltzmann machine model (:class:`BernoulliRBM\n<sklearn.neural_network.BernoulliRBM>`) can perform effective non-linear\nfeature extraction.\n" |
19 | 19 | ]
|
20 | 20 | },
|
21 | 21 | {
|
|
26 | 26 | },
|
27 | 27 | "outputs": [],
|
28 | 28 | "source": [
|
29 |
| - "# Authors: Yann N. Dauphin, Vlad Niculae, Gabriel Synnaeve\n# License: BSD\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom scipy.ndimage import convolve\nfrom sklearn import linear_model, datasets, metrics\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.neural_network import BernoulliRBM\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import minmax_scale\nfrom sklearn.base import clone\n\n\n# #############################################################################\n# Setting up\n\n\ndef nudge_dataset(X, Y):\n \"\"\"\n This produces a dataset 5 times bigger than the original one,\n by moving the 8x8 images in X around by 1px to left, right, down, up\n \"\"\"\n direction_vectors = [\n [[0, 1, 0], [0, 0, 0], [0, 0, 0]],\n [[0, 0, 0], [1, 0, 0], [0, 0, 0]],\n [[0, 0, 0], [0, 0, 1], [0, 0, 0]],\n [[0, 0, 0], [0, 0, 0], [0, 1, 0]],\n ]\n\n def shift(x, w):\n return convolve(x.reshape((8, 8)), mode=\"constant\", weights=w).ravel()\n\n X = np.concatenate(\n [X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors]\n )\n Y = np.concatenate([Y for _ in range(5)], axis=0)\n return X, Y\n\n\n# Load Data\nX, y = datasets.load_digits(return_X_y=True)\nX = np.asarray(X, \"float32\")\nX, Y = nudge_dataset(X, y)\nX = minmax_scale(X, feature_range=(0, 1)) # 0-1 scaling\n\nX_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)\n\n# Models we will use\nlogistic = linear_model.LogisticRegression(solver=\"newton-cg\", tol=1)\nrbm = BernoulliRBM(random_state=0, verbose=True)\n\nrbm_features_classifier = Pipeline(steps=[(\"rbm\", rbm), (\"logistic\", logistic)])\n\n# #############################################################################\n# Training\n\n# Hyper-parameters. These were set by cross-validation,\n# using a GridSearchCV. Here we are not performing cross-validation to\n# save time.\nrbm.learning_rate = 0.06\nrbm.n_iter = 10\n# More components tend to give better prediction performance, but larger\n# fitting time\nrbm.n_components = 100\nlogistic.C = 6000\n\n# Training RBM-Logistic Pipeline\nrbm_features_classifier.fit(X_train, Y_train)\n\n# Training the Logistic regression classifier directly on the pixel\nraw_pixel_classifier = clone(logistic)\nraw_pixel_classifier.C = 100.0\nraw_pixel_classifier.fit(X_train, Y_train)\n\n# #############################################################################\n# Evaluation\n\nY_pred = rbm_features_classifier.predict(X_test)\nprint(\n \"Logistic regression using RBM features:\\n%s\\n\"\n % (metrics.classification_report(Y_test, Y_pred))\n)\n\nY_pred = raw_pixel_classifier.predict(X_test)\nprint(\n \"Logistic regression using raw pixel features:\\n%s\\n\"\n % (metrics.classification_report(Y_test, Y_pred))\n)\n\n# #############################################################################\n# Plotting\n\nplt.figure(figsize=(4.2, 4))\nfor i, comp in enumerate(rbm.components_):\n plt.subplot(10, 10, i + 1)\n plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r, interpolation=\"nearest\")\n plt.xticks(())\n plt.yticks(())\nplt.suptitle(\"100 components extracted by RBM\", fontsize=16)\nplt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)\n\nplt.show()" |
| 29 | + "# Authors: Yann N. Dauphin, Vlad Niculae, Gabriel Synnaeve\n# License: BSD" |
| 30 | + ] |
| 31 | + }, |
| 32 | + { |
| 33 | + "cell_type": "markdown", |
| 34 | + "metadata": {}, |
| 35 | + "source": [ |
| 36 | + "## Generate data\n\nIn order to learn good latent representations from a small dataset, we\nartificially generate more labeled data by perturbing the training data with\nlinear shifts of 1 pixel in each direction.\n\n" |
| 37 | + ] |
| 38 | + }, |
| 39 | + { |
| 40 | + "cell_type": "code", |
| 41 | + "execution_count": null, |
| 42 | + "metadata": { |
| 43 | + "collapsed": false |
| 44 | + }, |
| 45 | + "outputs": [], |
| 46 | + "source": [ |
| 47 | + "import numpy as np\n\nfrom scipy.ndimage import convolve\n\nfrom sklearn import datasets\nfrom sklearn.preprocessing import minmax_scale\n\nfrom sklearn.model_selection import train_test_split\n\n\ndef nudge_dataset(X, Y):\n \"\"\"\n This produces a dataset 5 times bigger than the original one,\n by moving the 8x8 images in X around by 1px to left, right, down, up\n \"\"\"\n direction_vectors = [\n [[0, 1, 0], [0, 0, 0], [0, 0, 0]],\n [[0, 0, 0], [1, 0, 0], [0, 0, 0]],\n [[0, 0, 0], [0, 0, 1], [0, 0, 0]],\n [[0, 0, 0], [0, 0, 0], [0, 1, 0]],\n ]\n\n def shift(x, w):\n return convolve(x.reshape((8, 8)), mode=\"constant\", weights=w).ravel()\n\n X = np.concatenate(\n [X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors]\n )\n Y = np.concatenate([Y for _ in range(5)], axis=0)\n return X, Y\n\n\nX, y = datasets.load_digits(return_X_y=True)\nX = np.asarray(X, \"float32\")\nX, Y = nudge_dataset(X, y)\nX = minmax_scale(X, feature_range=(0, 1)) # 0-1 scaling\n\nX_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)" |
| 48 | + ] |
| 49 | + }, |
| 50 | + { |
| 51 | + "cell_type": "markdown", |
| 52 | + "metadata": {}, |
| 53 | + "source": [ |
| 54 | + "## Models definition\n\nWe build a classification pipeline with a BernoulliRBM feature extractor and\na :class:`LogisticRegression <sklearn.linear_model.LogisticRegression>`\nclassifier.\n\n" |
| 55 | + ] |
| 56 | + }, |
| 57 | + { |
| 58 | + "cell_type": "code", |
| 59 | + "execution_count": null, |
| 60 | + "metadata": { |
| 61 | + "collapsed": false |
| 62 | + }, |
| 63 | + "outputs": [], |
| 64 | + "source": [ |
| 65 | + "from sklearn import linear_model\nfrom sklearn.neural_network import BernoulliRBM\nfrom sklearn.pipeline import Pipeline\n\nlogistic = linear_model.LogisticRegression(solver=\"newton-cg\", tol=1)\nrbm = BernoulliRBM(random_state=0, verbose=True)\n\nrbm_features_classifier = Pipeline(steps=[(\"rbm\", rbm), (\"logistic\", logistic)])" |
| 66 | + ] |
| 67 | + }, |
| 68 | + { |
| 69 | + "cell_type": "markdown", |
| 70 | + "metadata": {}, |
| 71 | + "source": [ |
| 72 | + "## Training\n\nThe hyperparameters of the entire model (learning rate, hidden layer size,\nregularization) were optimized by grid search, but the search is not\nreproduced here because of runtime constraints.\n\n" |
| 73 | + ] |
| 74 | + }, |
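| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "As a rough sketch only, such a search could look like the cell below. The\nparameter ranges are hypothetical placeholders, not the grid that produced\nthe published values.\n\n" |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "metadata": { |
| | + "collapsed": false |
| | + }, |
| | + "outputs": [], |
| | + "source": [ |
| | + "from sklearn.model_selection import GridSearchCV\n\n# Hypothetical search space, for illustration only; not the grid that was\n# actually used to select the hyperparameters below.\nparam_grid = {\n    \"rbm__learning_rate\": [0.01, 0.06, 0.1],\n    \"rbm__n_components\": [50, 100, 200],\n    \"logistic__C\": [100, 1000, 6000],\n}\nsearch = GridSearchCV(rbm_features_classifier, param_grid, n_jobs=-1)\n# The full fit is expensive, so it is left commented out:\n# search.fit(X_train, Y_train)\n# print(search.best_params_)" |
| | + ] |
| | + }, |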
| 75 | + { |
| 76 | + "cell_type": "code", |
| 77 | + "execution_count": null, |
| 78 | + "metadata": { |
| 79 | + "collapsed": false |
| 80 | + }, |
| 81 | + "outputs": [], |
| 82 | + "source": [ |
| 83 | + "from sklearn.base import clone\n\n# Hyper-parameters. These were set by cross-validation,\n# using a GridSearchCV. Here we are not performing cross-validation to\n# save time.\nrbm.learning_rate = 0.06\nrbm.n_iter = 10\n\n# More components tend to give better prediction performance, but larger\n# fitting time\nrbm.n_components = 100\nlogistic.C = 6000\n\n# Training RBM-Logistic Pipeline\nrbm_features_classifier.fit(X_train, Y_train)\n\n# Training the Logistic regression classifier directly on the pixel\nraw_pixel_classifier = clone(logistic)\nraw_pixel_classifier.C = 100.0\nraw_pixel_classifier.fit(X_train, Y_train)" |
| 84 | + ] |
| 85 | + }, |
| 86 | + { |
| 87 | + "cell_type": "markdown", |
| 88 | + "metadata": {}, |
| 89 | + "source": [ |
| 90 | + "## Evaluation\n\n" |
| 91 | + ] |
| 92 | + }, |
| 93 | + { |
| 94 | + "cell_type": "code", |
| 95 | + "execution_count": null, |
| 96 | + "metadata": { |
| 97 | + "collapsed": false |
| 98 | + }, |
| 99 | + "outputs": [], |
| 100 | + "source": [ |
| 101 | + "from sklearn import metrics\n\nY_pred = rbm_features_classifier.predict(X_test)\nprint(\n \"Logistic regression using RBM features:\\n%s\\n\"\n % (metrics.classification_report(Y_test, Y_pred))\n)" |
| 102 | + ] |
| 103 | + }, |
| 104 | + { |
| 105 | + "cell_type": "code", |
| 106 | + "execution_count": null, |
| 107 | + "metadata": { |
| 108 | + "collapsed": false |
| 109 | + }, |
| 110 | + "outputs": [], |
| 111 | + "source": [ |
| 112 | + "Y_pred = raw_pixel_classifier.predict(X_test)\nprint(\n \"Logistic regression using raw pixel features:\\n%s\\n\"\n % (metrics.classification_report(Y_test, Y_pred))\n)" |
| 113 | + ] |
| 114 | + }, |
| 115 | + { |
| 116 | + "cell_type": "markdown", |
| 117 | + "metadata": {}, |
| 118 | + "source": [ |
| 119 | + "The features extracted by the BernoulliRBM help improve the classification\naccuracy with respect to the logistic regression on raw pixels.\n\n" |
| 120 | + ] |
| 121 | + }, |
| 122 | + { |
| 123 | + "cell_type": "markdown", |
| 124 | + "metadata": {}, |
| 125 | + "source": [ |
| 126 | + "## Plotting\n\n" |
| 127 | + ] |
| 128 | + }, |
| 129 | + { |
| 130 | + "cell_type": "code", |
| 131 | + "execution_count": null, |
| 132 | + "metadata": { |
| 133 | + "collapsed": false |
| 134 | + }, |
| 135 | + "outputs": [], |
| 136 | + "source": [ |
| 137 | + "import matplotlib.pyplot as plt\n\nplt.figure(figsize=(4.2, 4))\nfor i, comp in enumerate(rbm.components_):\n plt.subplot(10, 10, i + 1)\n plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r, interpolation=\"nearest\")\n plt.xticks(())\n plt.yticks(())\nplt.suptitle(\"100 components extracted by RBM\", fontsize=16)\nplt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)\n\nplt.show()" |
30 | 138 | ]
|
31 | 139 | }
|
32 | 140 | ],
|
|