scikit-learn
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
-790 Bytes b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
-790 Bytes
diff --git a/‎dev/_downloads/23e3d7fa2388aef4e9a60c4a6caf166d/plot_face_recognition.ipynb
Lines changed: 153 additions & 2 deletions b/‎dev/_downloads/23e3d7fa2388aef4e9a60c4a6caf166d/plot_face_recognition.ipynb
Lines changed: 153 additions & 2 deletions
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
1.45 KB b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
1.45 KB
diff --git a/‎dev/_downloads/b3a994b2ad66fe78bcedaf151ab78b07/plot_face_recognition.py
Lines changed: 34 additions & 37 deletions b/‎dev/_downloads/b3a994b2ad66fe78bcedaf151ab78b07/plot_face_recognition.py
Lines changed: 34 additions & 37 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.zip
20.6 KB b/‎dev/_downloads/scikit-learn-docs.zip
20.6 KB
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
7 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
7 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
8 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
8 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
21 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
21 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
-241 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
-241 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
-4 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
-4 Bytes
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# Faces recognition example using eigenfaces and SVMs\n\nThe dataset used in this example is a preprocessed excerpt of the\n\"Labeled Faces in the Wild\", aka LFW_:\n\n  http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz (233MB)\n\n\nExpected results for the top 5 most represented people in the dataset:\n\n================== ============ ======= ========== =======\n                   precision    recall  f1-score   support\n================== ============ ======= ========== =======\n     Ariel Sharon       0.67      0.92      0.77        13\n     Colin Powell       0.75      0.78      0.76        60\n  Donald Rumsfeld       0.78      0.67      0.72        27\n    George W Bush       0.86      0.86      0.86       146\nGerhard Schroeder       0.76      0.76      0.76        25\n      Hugo Chavez       0.67      0.67      0.67        15\n       Tony Blair       0.81      0.69      0.75        36\n\n      avg / total       0.80      0.80      0.80       322\n================== ============ ======= ========== =======\n"
+        "\n# Faces recognition example using eigenfaces and SVMs\n\nThe dataset used in this example is a preprocessed excerpt of the\n\"Labeled Faces in the Wild\", aka LFW_:\n\n  http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz (233MB)\n\n"
       ]
     },
     {
@@ -26,7 +26,158 @@
       },
       "outputs": [],
       "source": [
-        "from time import time\nimport logging\nimport matplotlib.pyplot as plt\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.datasets import fetch_lfw_people\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.decomposition import PCA\nfrom sklearn.svm import SVC\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(message)s\")\n\n\n# #############################################################################\n# Download the data, if not already on disk and load it as numpy arrays\n\nlfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)\n\n# introspect the images arrays to find the shapes (for plotting)\nn_samples, h, w = lfw_people.images.shape\n\n# for machine learning we use the 2 data directly (as relative pixel\n# positions info is ignored by this model)\nX = lfw_people.data\nn_features = X.shape[1]\n\n# the label to predict is the id of the person\ny = lfw_people.target\ntarget_names = lfw_people.target_names\nn_classes = target_names.shape[0]\n\nprint(\"Total dataset size:\")\nprint(\"n_samples: %d\" % n_samples)\nprint(\"n_features: %d\" % n_features)\nprint(\"n_classes: %d\" % n_classes)\n\n\n# #############################################################################\n# Split into a training set and a test set using a stratified k fold\n\n# split into a training and testing set\nX_train, X_test, y_train, y_test = train_test_split(\n    X, y, test_size=0.25, random_state=42\n)\n\n\n# #############################################################################\n# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled\n# dataset): unsupervised feature extraction / dimensionality reduction\nn_components = 150\n\nprint(\n    \"Extracting the top %d eigenfaces from %d faces\" % (n_components, X_train.shape[0])\n)\nt0 = time()\npca = 
PCA(n_components=n_components, svd_solver=\"randomized\", whiten=True).fit(X_train)\nprint(\"done in %0.3fs\" % (time() - t0))\n\neigenfaces = pca.components_.reshape((n_components, h, w))\n\nprint(\"Projecting the input data on the eigenfaces orthonormal basis\")\nt0 = time()\nX_train_pca = pca.transform(X_train)\nX_test_pca = pca.transform(X_test)\nprint(\"done in %0.3fs\" % (time() - t0))\n\n\n# #############################################################################\n# Train a SVM classification model\n\nprint(\"Fitting the classifier to the training set\")\nt0 = time()\nparam_grid = {\n    \"C\": [1e3, 5e3, 1e4, 5e4, 1e5],\n    \"gamma\": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],\n}\nclf = GridSearchCV(SVC(kernel=\"rbf\", class_weight=\"balanced\"), param_grid)\nclf = clf.fit(X_train_pca, y_train)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint(\"Best estimator found by grid search:\")\nprint(clf.best_estimator_)\n\n\n# #############################################################################\n# Quantitative evaluation of the model quality on the test set\n\nprint(\"Predicting people's names on the test set\")\nt0 = time()\ny_pred = clf.predict(X_test_pca)\nprint(\"done in %0.3fs\" % (time() - t0))\n\nprint(classification_report(y_test, y_pred, target_names=target_names))\nprint(confusion_matrix(y_test, y_pred, labels=range(n_classes)))\n\n\n# #############################################################################\n# Qualitative evaluation of the predictions using matplotlib\n\n\ndef plot_gallery(images, titles, h, w, n_row=3, n_col=4):\n    \"\"\"Helper function to plot a gallery of portraits\"\"\"\n    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))\n    plt.subplots_adjust(bottom=0, left=0.01, right=0.99, top=0.90, hspace=0.35)\n    for i in range(n_row * n_col):\n        plt.subplot(n_row, n_col, i + 1)\n        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)\n        plt.title(titles[i], size=12)\n        plt.xticks(())\n        
plt.yticks(())\n\n\n# plot the result of the prediction on a portion of the test set\n\n\ndef title(y_pred, y_test, target_names, i):\n    pred_name = target_names[y_pred[i]].rsplit(\" \", 1)[-1]\n    true_name = target_names[y_test[i]].rsplit(\" \", 1)[-1]\n    return \"predicted: %s\\ntrue:      %s\" % (pred_name, true_name)\n\n\nprediction_titles = [\n    title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0])\n]\n\nplot_gallery(X_test, prediction_titles, h, w)\n\n# plot the gallery of the most significative eigenfaces\n\neigenface_titles = [\"eigenface %d\" % i for i in range(eigenfaces.shape[0])]\nplot_gallery(eigenfaces, eigenface_titles, h, w)\n\nplt.show()"
+        "from time import time\nimport matplotlib.pyplot as plt\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.datasets import fetch_lfw_people\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import ConfusionMatrixDisplay\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.decomposition import PCA\nfrom sklearn.svm import SVC\nfrom sklearn.utils.fixes import loguniform"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Download the data, if not already on disk, and load it as numpy arrays\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)\n\n# introspect the images arrays to find the shapes (for plotting)\nn_samples, h, w = lfw_people.images.shape\n\n# for machine learning we use the 2 data directly (as relative pixel\n# positions info is ignored by this model)\nX = lfw_people.data\nn_features = X.shape[1]\n\n# the label to predict is the id of the person\ny = lfw_people.target\ntarget_names = lfw_people.target_names\nn_classes = target_names.shape[0]\n\nprint(\"Total dataset size:\")\nprint(\"n_samples: %d\" % n_samples)\nprint(\"n_features: %d\" % n_features)\nprint(\"n_classes: %d\" % n_classes)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Split into a training set and a test set, keeping 25% of the data for testing.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "X_train, X_test, y_train, y_test = train_test_split(\n    X, y, test_size=0.25, random_state=42\n)\n\nscaler = StandardScaler()\nX_train = scaler.fit_transform(X_train)\nX_test = scaler.transform(X_test)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled\ndataset): unsupervised feature extraction / dimensionality reduction\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "n_components = 150\n\nprint(\n    \"Extracting the top %d eigenfaces from %d faces\" % (n_components, X_train.shape[0])\n)\nt0 = time()\npca = PCA(n_components=n_components, svd_solver=\"randomized\", whiten=True).fit(X_train)\nprint(\"done in %0.3fs\" % (time() - t0))\n\neigenfaces = pca.components_.reshape((n_components, h, w))\n\nprint(\"Projecting the input data on the eigenfaces orthonormal basis\")\nt0 = time()\nX_train_pca = pca.transform(X_train)\nX_test_pca = pca.transform(X_test)\nprint(\"done in %0.3fs\" % (time() - t0))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Train a SVM classification model\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(\"Fitting the classifier to the training set\")\nt0 = time()\nparam_grid = {\n    \"C\": loguniform(1e3, 1e5),\n    \"gamma\": loguniform(1e-4, 1e-1),\n}\nclf = RandomizedSearchCV(\n    SVC(kernel=\"rbf\", class_weight=\"balanced\"), param_grid, n_iter=10\n)\nclf = clf.fit(X_train_pca, y_train)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint(\"Best estimator found by grid search:\")\nprint(clf.best_estimator_)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Quantitative evaluation of the model quality on the test set\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(\"Predicting people's names on the test set\")\nt0 = time()\ny_pred = clf.predict(X_test_pca)\nprint(\"done in %0.3fs\" % (time() - t0))\n\nprint(classification_report(y_test, y_pred, target_names=target_names))\nConfusionMatrixDisplay.from_estimator(\n    clf, X_test_pca, y_test, display_labels=target_names, xticks_rotation=\"vertical\"\n)\nplt.tight_layout()\nplt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Qualitative evaluation of the predictions using matplotlib\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def plot_gallery(images, titles, h, w, n_row=3, n_col=4):\n    \"\"\"Helper function to plot a gallery of portraits\"\"\"\n    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))\n    plt.subplots_adjust(bottom=0, left=0.01, right=0.99, top=0.90, hspace=0.35)\n    for i in range(n_row * n_col):\n        plt.subplot(n_row, n_col, i + 1)\n        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)\n        plt.title(titles[i], size=12)\n        plt.xticks(())\n        plt.yticks(())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Plot the result of the prediction on a portion of the test set\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def title(y_pred, y_test, target_names, i):\n    pred_name = target_names[y_pred[i]].rsplit(\" \", 1)[-1]\n    true_name = target_names[y_test[i]].rsplit(\" \", 1)[-1]\n    return \"predicted: %s\\ntrue:      %s\" % (pred_name, true_name)\n\n\nprediction_titles = [\n    title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0])\n]\n\nplot_gallery(X_test, prediction_titles, h, w)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Plot the gallery of the most significant eigenfaces\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "eigenface_titles = [\"eigenface %d\" % i for i in range(eigenfaces.shape[0])]\nplot_gallery(eigenfaces, eigenface_titles, h, w)\n\nplt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The face recognition problem would be much more effectively solved by training\nconvolutional neural networks, but this family of models is outside the scope of\nthe scikit-learn library. Interested readers should instead try to use PyTorch or\nTensorFlow to implement such models.\n\n"
       ]
     }
   ],
 
@@ -10,42 +10,23 @@
 
 .. _LFW: http://vis-www.cs.umass.edu/lfw/
 
-Expected results for the top 5 most represented people in the dataset:
-
-================== ============ ======= ========== =======
-                   precision    recall  f1-score   support
-================== ============ ======= ========== =======
-     Ariel Sharon       0.67      0.92      0.77        13
-     Colin Powell       0.75      0.78      0.76        60
-  Donald Rumsfeld       0.78      0.67      0.72        27
-    George W Bush       0.86      0.86      0.86       146
-Gerhard Schroeder       0.76      0.76      0.76        25
-      Hugo Chavez       0.67      0.67      0.67        15
-       Tony Blair       0.81      0.69      0.75        36
-
-      avg / total       0.80      0.80      0.80       322
-================== ============ ======= ========== =======
-
 """
-
+# %%
 from time import time
-import logging
 import matplotlib.pyplot as plt
 
 from sklearn.model_selection import train_test_split
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import RandomizedSearchCV
 from sklearn.datasets import fetch_lfw_people
 from sklearn.metrics import classification_report
-from sklearn.metrics import confusion_matrix
+from sklearn.metrics import ConfusionMatrixDisplay
+from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
 from sklearn.svm import SVC
+from sklearn.utils.fixes import loguniform
 
 
-# Display progress logs on stdout
-logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
-
-
-# #############################################################################
+# %%
 # Download the data, if not already on disk and load it as numpy arrays
 
 lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
@@ -69,18 +50,21 @@
 print("n_classes: %d" % n_classes)
 
 
-# #############################################################################
-# Split into a training set and a test set using a stratified k fold
+# %%
+# Split into a training set and a test set, keeping 25% of the data for testing.
 
-# split into a training and testing set
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=42
 )
 
+scaler = StandardScaler()
+X_train = scaler.fit_transform(X_train)
+X_test = scaler.transform(X_test)
 
-# #############################################################################
+# %%
 # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
 # dataset): unsupervised feature extraction / dimensionality reduction
+
 n_components = 150
 
 print(
@@ -99,23 +83,25 @@
 print("done in %0.3fs" % (time() - t0))
 
 
-# #############################################################################
+# %%
 # Train a SVM classification model
 
 print("Fitting the classifier to the training set")
 t0 = time()
 param_grid = {
-    "C": [1e3, 5e3, 1e4, 5e4, 1e5],
-    "gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
+    "C": loguniform(1e3, 1e5),
+    "gamma": loguniform(1e-4, 1e-1),
 }
-clf = GridSearchCV(SVC(kernel="rbf", class_weight="balanced"), param_grid)
+clf = RandomizedSearchCV(
+    SVC(kernel="rbf", class_weight="balanced"), param_grid, n_iter=10
+)
 clf = clf.fit(X_train_pca, y_train)
 print("done in %0.3fs" % (time() - t0))
 print("Best estimator found by grid search:")
 print(clf.best_estimator_)
 
 
-# #############################################################################
+# %%
 # Quantitative evaluation of the model quality on the test set
 
 print("Predicting people's names on the test set")
@@ -124,10 +110,14 @@
 print("done in %0.3fs" % (time() - t0))
 
 print(classification_report(y_test, y_pred, target_names=target_names))
-print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
+ConfusionMatrixDisplay.from_estimator(
+    clf, X_test_pca, y_test, display_labels=target_names, xticks_rotation="vertical"
+)
+plt.tight_layout()
+plt.show()
 
 
-# #############################################################################
+# %%
 # Qualitative evaluation of the predictions using matplotlib
 
 
@@ -143,6 +133,7 @@ def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
         plt.yticks(())
 
 
+# %%
 # plot the result of the prediction on a portion of the test set
 
 
@@ -157,10 +148,16 @@ def title(y_pred, y_test, target_names, i):
 ]
 
 plot_gallery(X_test, prediction_titles, h, w)
-
+# %%
 # plot the gallery of the most significant eigenfaces
 
 eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
 plot_gallery(eigenfaces, eigenface_titles, h, w)
 
 plt.show()
+
+# %%
+# The face recognition problem would be much more effectively solved by training
+# convolutional neural networks, but this family of models is outside the scope of
+# the scikit-learn library. Interested readers should instead try to use PyTorch or
+# TensorFlow to implement such models.