Commit 0f831d1

Pushing the docs to dev/ for branch: main, commit d09e1d72399a7ed558cf7ced1f5a26caf674e3fc
1 parent 1142b02 commit 0f831d1

1,236 files changed: +4540 / -4521 lines changed


dev/_downloads/3ed102fa8211c8d36f2331f0c5e1dcef/plot_model_complexity_influence.ipynb

Lines changed: 4 additions & 4 deletions
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Authors: Eustache Diemert <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n# Guillaume Lemaitre <[email protected]>\n# License: BSD 3 clause\n\nimport time\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import NuSVR\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.metrics import hamming_loss\n\n\n# Initialize random generator\nnp.random.seed(0)"
+"# Authors: Eustache Diemert <[email protected]>\n# Maria Telenczuk <https://github.com/maikia>\n# Guillaume Lemaitre <[email protected]>\n# License: BSD 3 clause\n\nimport time\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import NuSVR\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.metrics import hamming_loss\n\n# Initialize random generator\nnp.random.seed(0)"
 ]
 },
 {
@@ -44,7 +44,7 @@
 },
 "outputs": [],
 "source": [
-"def generate_data(case):\n \"\"\"Generate regression/classification data.\"\"\"\n if case == \"regression\":\n X, y = datasets.load_diabetes(return_X_y=True)\n elif case == \"classification\":\n X, y = datasets.fetch_20newsgroups_vectorized(subset=\"all\", return_X_y=True)\n X, y = shuffle(X, y)\n offset = int(X.shape[0] * 0.8)\n X_train, y_train = X[:offset], y[:offset]\n X_test, y_test = X[offset:], y[offset:]\n\n data = {\"X_train\": X_train, \"X_test\": X_test, \"y_train\": y_train, \"y_test\": y_test}\n return data\n\n\nregression_data = generate_data(\"regression\")\nclassification_data = generate_data(\"classification\")"
+"def generate_data(case):\n \"\"\"Generate regression/classification data.\"\"\"\n if case == \"regression\":\n X, y = datasets.load_diabetes(return_X_y=True)\n train_size = 0.8\n elif case == \"classification\":\n X, y = datasets.fetch_20newsgroups_vectorized(subset=\"all\", return_X_y=True)\n train_size = 0.4 # to make the example run faster\n\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=train_size, random_state=0\n )\n\n data = {\"X_train\": X_train, \"X_test\": X_test, \"y_train\": y_train, \"y_test\": y_test}\n return data\n\n\nregression_data = generate_data(\"regression\")\nclassification_data = generate_data(\"classification\")"
 ]
 },
 {
@@ -80,7 +80,7 @@
 },
 "outputs": [],
 "source": [
-"def _count_nonzero_coefficients(estimator):\n a = estimator.coef_.toarray()\n return np.count_nonzero(a)\n\n\nconfigurations = [\n {\n \"estimator\": SGDClassifier,\n \"tuned_params\": {\n \"penalty\": \"elasticnet\",\n \"alpha\": 0.001,\n \"loss\": \"modified_huber\",\n \"fit_intercept\": True,\n \"tol\": 1e-3,\n },\n \"changing_param\": \"l1_ratio\",\n \"changing_param_values\": [0.25, 0.5, 0.75, 0.9],\n \"complexity_label\": \"non_zero coefficients\",\n \"complexity_computer\": _count_nonzero_coefficients,\n \"prediction_performance_computer\": hamming_loss,\n \"prediction_performance_label\": \"Hamming Loss (Misclassification Ratio)\",\n \"postfit_hook\": lambda x: x.sparsify(),\n \"data\": classification_data,\n \"n_samples\": 30,\n },\n {\n \"estimator\": NuSVR,\n \"tuned_params\": {\"C\": 1e3, \"gamma\": 2 ** -15},\n \"changing_param\": \"nu\",\n \"changing_param_values\": [0.1, 0.25, 0.5, 0.75, 0.9],\n \"complexity_label\": \"n_support_vectors\",\n \"complexity_computer\": lambda x: len(x.support_vectors_),\n \"data\": regression_data,\n \"postfit_hook\": lambda x: x,\n \"prediction_performance_computer\": mean_squared_error,\n \"prediction_performance_label\": \"MSE\",\n \"n_samples\": 30,\n },\n {\n \"estimator\": GradientBoostingRegressor,\n \"tuned_params\": {\"loss\": \"squared_error\"},\n \"changing_param\": \"n_estimators\",\n \"changing_param_values\": [10, 50, 100, 200, 500],\n \"complexity_label\": \"n_trees\",\n \"complexity_computer\": lambda x: x.n_estimators,\n \"data\": regression_data,\n \"postfit_hook\": lambda x: x,\n \"prediction_performance_computer\": mean_squared_error,\n \"prediction_performance_label\": \"MSE\",\n \"n_samples\": 30,\n },\n]"
+"def _count_nonzero_coefficients(estimator):\n a = estimator.coef_.toarray()\n return np.count_nonzero(a)\n\n\nconfigurations = [\n {\n \"estimator\": SGDClassifier,\n \"tuned_params\": {\n \"penalty\": \"elasticnet\",\n \"alpha\": 0.001,\n \"loss\": \"modified_huber\",\n \"fit_intercept\": True,\n \"tol\": 1e-3,\n },\n \"changing_param\": \"l1_ratio\",\n \"changing_param_values\": [0.25, 0.5, 0.75, 0.9],\n \"complexity_label\": \"non_zero coefficients\",\n \"complexity_computer\": _count_nonzero_coefficients,\n \"prediction_performance_computer\": hamming_loss,\n \"prediction_performance_label\": \"Hamming Loss (Misclassification Ratio)\",\n \"postfit_hook\": lambda x: x.sparsify(),\n \"data\": classification_data,\n \"n_samples\": 5,\n },\n {\n \"estimator\": NuSVR,\n \"tuned_params\": {\"C\": 1e3, \"gamma\": 2 ** -15},\n \"changing_param\": \"nu\",\n \"changing_param_values\": [0.05, 0.1, 0.2, 0.35, 0.5],\n \"complexity_label\": \"n_support_vectors\",\n \"complexity_computer\": lambda x: len(x.support_vectors_),\n \"data\": regression_data,\n \"postfit_hook\": lambda x: x,\n \"prediction_performance_computer\": mean_squared_error,\n \"prediction_performance_label\": \"MSE\",\n \"n_samples\": 15,\n },\n {\n \"estimator\": GradientBoostingRegressor,\n \"tuned_params\": {\n \"loss\": \"squared_error\",\n \"learning_rate\": 0.05,\n \"max_depth\": 2,\n },\n \"changing_param\": \"n_estimators\",\n \"changing_param_values\": [10, 25, 50, 75, 100],\n \"complexity_label\": \"n_trees\",\n \"complexity_computer\": lambda x: x.n_estimators,\n \"data\": regression_data,\n \"postfit_hook\": lambda x: x,\n \"prediction_performance_computer\": mean_squared_error,\n \"prediction_performance_label\": \"MSE\",\n \"n_samples\": 15,\n },\n]"
 ]
 },
 {
@@ -98,7 +98,7 @@
 },
 "outputs": [],
 "source": [
-"def plot_influence(conf, mse_values, prediction_times, complexities):\n \"\"\"\n Plot influence of model complexity on both accuracy and latency.\n \"\"\"\n\n fig = plt.figure()\n fig.subplots_adjust(right=0.75)\n\n # first axes (prediction error)\n ax1 = fig.add_subplot(111)\n line1 = ax1.plot(complexities, mse_values, c=\"tab:blue\", ls=\"-\")[0]\n ax1.set_xlabel(\"Model Complexity (%s)\" % conf[\"complexity_label\"])\n y1_label = conf[\"prediction_performance_label\"]\n ax1.set_ylabel(y1_label)\n\n ax1.spines[\"left\"].set_color(line1.get_color())\n ax1.yaxis.label.set_color(line1.get_color())\n ax1.tick_params(axis=\"y\", colors=line1.get_color())\n\n # second axes (latency)\n ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)\n line2 = ax2.plot(complexities, prediction_times, c=\"tab:orange\", ls=\"-\")[0]\n ax2.yaxis.tick_right()\n ax2.yaxis.set_label_position(\"right\")\n y2_label = \"Time (s)\"\n ax2.set_ylabel(y2_label)\n ax1.spines[\"right\"].set_color(line2.get_color())\n ax2.yaxis.label.set_color(line2.get_color())\n ax2.tick_params(axis=\"y\", colors=line2.get_color())\n\n plt.legend((line1, line2), (\"prediction error\", \"latency\"), loc=\"upper right\")\n\n plt.title(\n \"Influence of varying '%s' on %s\"\n % (conf[\"changing_param\"], conf[\"estimator\"].__name__)\n )\n\n\nfor conf in configurations:\n prediction_performances, prediction_times, complexities = benchmark_influence(conf)\n plot_influence(conf, prediction_performances, prediction_times, complexities)\nplt.show()"
+"def plot_influence(conf, mse_values, prediction_times, complexities):\n \"\"\"\n Plot influence of model complexity on both accuracy and latency.\n \"\"\"\n\n fig = plt.figure()\n fig.subplots_adjust(right=0.75)\n\n # first axes (prediction error)\n ax1 = fig.add_subplot(111)\n line1 = ax1.plot(complexities, mse_values, c=\"tab:blue\", ls=\"-\")[0]\n ax1.set_xlabel(\"Model Complexity (%s)\" % conf[\"complexity_label\"])\n y1_label = conf[\"prediction_performance_label\"]\n ax1.set_ylabel(y1_label)\n\n ax1.spines[\"left\"].set_color(line1.get_color())\n ax1.yaxis.label.set_color(line1.get_color())\n ax1.tick_params(axis=\"y\", colors=line1.get_color())\n\n # second axes (latency)\n ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)\n line2 = ax2.plot(complexities, prediction_times, c=\"tab:orange\", ls=\"-\")[0]\n ax2.yaxis.tick_right()\n ax2.yaxis.set_label_position(\"right\")\n y2_label = \"Time (s)\"\n ax2.set_ylabel(y2_label)\n ax1.spines[\"right\"].set_color(line2.get_color())\n ax2.yaxis.label.set_color(line2.get_color())\n ax2.tick_params(axis=\"y\", colors=line2.get_color())\n\n plt.legend(\n (line1, line2), (\"prediction error\", \"prediction latency\"), loc=\"upper right\"\n )\n\n plt.title(\n \"Influence of varying '%s' on %s\"\n % (conf[\"changing_param\"], conf[\"estimator\"].__name__)\n )\n\n\nfor conf in configurations:\n prediction_performances, prediction_times, complexities = benchmark_influence(conf)\n plot_influence(conf, prediction_performances, prediction_times, complexities)\nplt.show()"
 ]
 },
 {

dev/_downloads/54c7ea3b3671861fbfb2161a6f0ab6d0/plot_nca_classification.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis\nfrom sklearn.pipeline import Pipeline\n\n\nn_neighbors = 1\n\ndataset = datasets.load_iris()\nX, y = dataset.data, dataset.target\n\n# we only take two features. We could avoid this ugly\n# slicing by using a two-dim dataset\nX = X[:, [0, 2]]\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, stratify=y, test_size=0.7, random_state=42\n)\n\nh = 0.01 # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap([\"#FFAAAA\", \"#AAFFAA\", \"#AAAAFF\"])\ncmap_bold = ListedColormap([\"#FF0000\", \"#00FF00\", \"#0000FF\"])\n\nnames = [\"KNN\", \"NCA, KNN\"]\n\nclassifiers = [\n Pipeline(\n [\n (\"scaler\", StandardScaler()),\n (\"knn\", KNeighborsClassifier(n_neighbors=n_neighbors)),\n ]\n ),\n Pipeline(\n [\n (\"scaler\", StandardScaler()),\n (\"nca\", NeighborhoodComponentsAnalysis()),\n (\"knn\", KNeighborsClassifier(n_neighbors=n_neighbors)),\n ]\n ),\n]\n\nx_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\ny_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\nxx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n\nfor name, clf in zip(names, classifiers):\n\n clf.fit(X_train, y_train)\n score = clf.score(X_test, y_test)\n\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.figure()\n plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=0.8)\n\n # Plot also the training and testing points\n plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor=\"k\", s=20)\n plt.xlim(xx.min(), xx.max())\n plt.ylim(yy.min(), yy.max())\n plt.title(\"{} (k = {})\".format(name, n_neighbors))\n plt.text(\n 0.9,\n 0.1,\n \"{:.2f}\".format(score),\n size=15,\n ha=\"center\",\n va=\"center\",\n transform=plt.gca().transAxes,\n )\n\nplt.show()"
+"# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis\nfrom sklearn.pipeline import Pipeline\n\n\nn_neighbors = 1\n\ndataset = datasets.load_iris()\nX, y = dataset.data, dataset.target\n\n# we only take two features. We could avoid this ugly\n# slicing by using a two-dim dataset\nX = X[:, [0, 2]]\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, stratify=y, test_size=0.7, random_state=42\n)\n\nh = 0.05 # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap([\"#FFAAAA\", \"#AAFFAA\", \"#AAAAFF\"])\ncmap_bold = ListedColormap([\"#FF0000\", \"#00FF00\", \"#0000FF\"])\n\nnames = [\"KNN\", \"NCA, KNN\"]\n\nclassifiers = [\n Pipeline(\n [\n (\"scaler\", StandardScaler()),\n (\"knn\", KNeighborsClassifier(n_neighbors=n_neighbors)),\n ]\n ),\n Pipeline(\n [\n (\"scaler\", StandardScaler()),\n (\"nca\", NeighborhoodComponentsAnalysis()),\n (\"knn\", KNeighborsClassifier(n_neighbors=n_neighbors)),\n ]\n ),\n]\n\nx_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\ny_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\nxx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n\nfor name, clf in zip(names, classifiers):\n\n clf.fit(X_train, y_train)\n score = clf.score(X_test, y_test)\n\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.figure()\n plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=0.8)\n\n # Plot also the training and testing points\n plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor=\"k\", s=20)\n plt.xlim(xx.min(), xx.max())\n plt.ylim(yy.min(), yy.max())\n plt.title(\"{} (k = {})\".format(name, n_neighbors))\n plt.text(\n 0.9,\n 0.1,\n \"{:.2f}\".format(score),\n size=15,\n ha=\"center\",\n va=\"center\",\n transform=plt.gca().transAxes,\n )\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/b7792f6c26a74369f67bbe6f9ac41edf/plot_nca_classification.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
     X, y, stratify=y, test_size=0.7, random_state=42
 )

-h = 0.01  # step size in the mesh
+h = 0.05  # step size in the mesh

 # Create color maps
 cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
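
A quick note on this change: h sets the spacing of the mesh on which the decision boundary is evaluated, so a coarser step means far fewer clf.predict calls per plot. A minimal sketch with hypothetical axis limits (not the actual iris values) illustrates the scale of the saving:

import numpy as np

# Hypothetical plotting window; the real limits come from the two iris features.
x_min, x_max = 3.0, 9.0
y_min, y_max = 0.0, 8.0

for h in (0.01, 0.05):
    # Same meshgrid construction as in the example, on the assumed window.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    print("h=%.2f: %d grid points to classify" % (h, xx.size))

# Going from h=0.01 to h=0.05 shrinks the mesh by roughly a factor of 25,
# so the decision-boundary plot renders much faster with little visual loss.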

dev/_downloads/ddd79923ba48c7f71fb17697baa1a22b/plot_model_complexity_influence.py

Lines changed: 20 additions & 14 deletions
@@ -42,14 +42,13 @@
 import matplotlib.pyplot as plt

 from sklearn import datasets
-from sklearn.utils import shuffle
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error
 from sklearn.svm import NuSVR
 from sklearn.ensemble import GradientBoostingRegressor
 from sklearn.linear_model import SGDClassifier
 from sklearn.metrics import hamming_loss

-
 # Initialize random generator
 np.random.seed(0)

@@ -72,12 +71,14 @@ def generate_data(case):
     """Generate regression/classification data."""
     if case == "regression":
         X, y = datasets.load_diabetes(return_X_y=True)
+        train_size = 0.8
     elif case == "classification":
         X, y = datasets.fetch_20newsgroups_vectorized(subset="all", return_X_y=True)
-        X, y = shuffle(X, y)
-        offset = int(X.shape[0] * 0.8)
-        X_train, y_train = X[:offset], y[:offset]
-        X_test, y_test = X[offset:], y[offset:]
+        train_size = 0.4  # to make the example run faster
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, train_size=train_size, random_state=0
+    )

     data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
     return data
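
As an aside on the hunk above: train_test_split shuffles and splits in a single call and takes the train fraction as an explicit parameter, which is why the per-case train_size values (0.8 and 0.4) replace the removed shuffle-then-slice code. A minimal sketch of the two patterns on toy arrays (nothing below is taken from the example's datasets):

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

# Old pattern: shuffle, then slice off the first 80% as the training set.
X_s, y_s = shuffle(X, y, random_state=0)
offset = int(X_s.shape[0] * 0.8)
X_train_old, y_train_old = X_s[:offset], y_s[:offset]
X_test_old, y_test_old = X_s[offset:], y_s[offset:]

# New pattern: one call that shuffles and splits with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0
)

print(X_train.shape, X_test.shape)  # (8, 2) (2, 2)

Both patterns give the same 80/20 proportions, though not necessarily the identical partition, since the two shuffles draw from different random streams.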
@@ -174,33 +175,37 @@ def _count_nonzero_coefficients(estimator):
         "prediction_performance_label": "Hamming Loss (Misclassification Ratio)",
         "postfit_hook": lambda x: x.sparsify(),
         "data": classification_data,
-        "n_samples": 30,
+        "n_samples": 5,
     },
     {
         "estimator": NuSVR,
         "tuned_params": {"C": 1e3, "gamma": 2 ** -15},
         "changing_param": "nu",
-        "changing_param_values": [0.1, 0.25, 0.5, 0.75, 0.9],
+        "changing_param_values": [0.05, 0.1, 0.2, 0.35, 0.5],
         "complexity_label": "n_support_vectors",
         "complexity_computer": lambda x: len(x.support_vectors_),
         "data": regression_data,
         "postfit_hook": lambda x: x,
         "prediction_performance_computer": mean_squared_error,
         "prediction_performance_label": "MSE",
-        "n_samples": 30,
+        "n_samples": 15,
     },
     {
         "estimator": GradientBoostingRegressor,
-        "tuned_params": {"loss": "squared_error"},
+        "tuned_params": {
+            "loss": "squared_error",
+            "learning_rate": 0.05,
+            "max_depth": 2,
+        },
         "changing_param": "n_estimators",
-        "changing_param_values": [10, 50, 100, 200, 500],
+        "changing_param_values": [10, 25, 50, 75, 100],
         "complexity_label": "n_trees",
         "complexity_computer": lambda x: x.n_estimators,
         "data": regression_data,
         "postfit_hook": lambda x: x,
         "prediction_performance_computer": mean_squared_error,
         "prediction_performance_label": "MSE",
-        "n_samples": 30,
+        "n_samples": 15,
     },
 ]
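
On the smaller n_samples values above: judging from the name and from the latency curves the example plots, n_samples appears to set how many repeated predict() calls the benchmark helper (defined elsewhere in the file, not shown in this diff) averages when timing prediction, so dropping it from 30 to 5 or 15 mainly shortens that timing loop. A rough, self-contained sketch of such an averaged latency measurement on toy data (the SGDClassifier settings mirror the configuration above; the data and n_samples value are made up for illustration):

import time
import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
X_train, y_train = rng.rand(200, 50), rng.randint(0, 2, 200)
X_test = rng.rand(50, 50)

est = SGDClassifier(penalty="elasticnet", alpha=0.001, l1_ratio=0.5, tol=1e-3)
est.fit(X_train, y_train)

n_samples = 5  # number of timed prediction passes to average
start = time.time()
for _ in range(n_samples):
    est.predict(X_test)
latency = (time.time() - start) / n_samples
print("mean prediction latency over %d passes: %.6f s" % (n_samples, latency))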

@@ -255,7 +260,9 @@ def plot_influence(conf, mse_values, prediction_times, complexities):
     ax2.yaxis.label.set_color(line2.get_color())
     ax2.tick_params(axis="y", colors=line2.get_color())

-    plt.legend((line1, line2), ("prediction error", "latency"), loc="upper right")
+    plt.legend(
+        (line1, line2), ("prediction error", "prediction latency"), loc="upper right"
+    )

     plt.title(
         "Influence of varying '%s' on %s"
@@ -268,7 +275,6 @@ def plot_influence(conf, mse_values, prediction_times, complexities):
     plot_influence(conf, prediction_performances, prediction_times, complexities)
 plt.show()

-
 ##############################################################################
 # Conclusion
 # ----------

dev/_downloads/scikit-learn-docs.zip

2.74 KB
Binary file not shown.
