Commit 93244f6

Pushing the docs to dev/ for branch: main, commit 8912619e0ddf1989556ea2948392f67233d2620e
1 parent 2c7ee27 commit 93244f6

1,315 files changed: +6692 additions, -6351 deletions


dev/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 1247837738e72319b7ec0038f32914c1
+config: 8a2948de0fee28e95ddaa7ff03489af3
 tags: 645f666f9bcd5a90fca523b33c5a78b7

dev/_downloads/07960f9087d379e9d0da6350d6ee3f41/plot_classification_probability.ipynb

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
},
"outputs": [],
"source": [
-
"# Author: Alexandre Gramfort <[email protected]>\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.gaussian_process.kernels import RBF\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.svm import SVC\n\niris = datasets.load_iris()\nX = iris.data[:, 0:2] # we only take the first two features for visualization\ny = iris.target\n\nn_features = X.shape[1]\n\nC = 10\nkernel = 1.0 * RBF([1.0, 1.0]) # for GPC\n\n# Create different classifiers.\nclassifiers = {\n \"L1 logistic\": LogisticRegression(\n C=C, penalty=\"l1\", solver=\"saga\", multi_class=\"multinomial\", max_iter=10000\n ),\n \"L2 logistic (Multinomial)\": LogisticRegression(\n C=C, penalty=\"l2\", solver=\"saga\", multi_class=\"multinomial\", max_iter=10000\n ),\n \"L2 logistic (OvR)\": LogisticRegression(\n C=C, penalty=\"l2\", solver=\"saga\", multi_class=\"ovr\", max_iter=10000\n ),\n \"Linear SVC\": SVC(kernel=\"linear\", C=C, probability=True, random_state=0),\n \"GPC\": GaussianProcessClassifier(kernel),\n}\n\nn_classifiers = len(classifiers)\n\nplt.figure(figsize=(3 * 2, n_classifiers * 2))\nplt.subplots_adjust(bottom=0.2, top=0.95)\n\nxx = np.linspace(3, 9, 100)\nyy = np.linspace(1, 5, 100).T\nxx, yy = np.meshgrid(xx, yy)\nXfull = np.c_[xx.ravel(), yy.ravel()]\n\nfor index, (name, classifier) in enumerate(classifiers.items()):\n classifier.fit(X, y)\n\n y_pred = classifier.predict(X)\n accuracy = accuracy_score(y, y_pred)\n print(\"Accuracy (train) for %s: %0.1f%% \" % (name, accuracy * 100))\n\n # View probabilities:\n probas = classifier.predict_proba(Xfull)\n n_classes = np.unique(y_pred).size\n for k in range(n_classes):\n plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)\n plt.title(\"Class %d\" % k)\n if k == 0:\n plt.ylabel(name)\n imshow_handle = plt.imshow(\n probas[:, k].reshape((100, 100)), extent=(3, 9, 1, 5), origin=\"lower\"\n )\n plt.xticks(())\n plt.yticks(())\n idx = y_pred == k\n if idx.any():\n plt.scatter(X[idx, 0], X[idx, 1], marker=\"o\", c=\"w\", edgecolor=\"k\")\n\nax = plt.axes([0.15, 0.04, 0.7, 0.05])\nplt.title(\"Probability\")\nplt.colorbar(imshow_handle, cax=ax, orientation=\"horizontal\")\n\nplt.show()"
+
"# Author: Alexandre Gramfort <[email protected]>\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom matplotlib import cm\n\nfrom sklearn import datasets\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.gaussian_process.kernels import RBF\nfrom sklearn.inspection import DecisionBoundaryDisplay\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.svm import SVC\n\niris = datasets.load_iris()\nX = iris.data[:, 0:2] # we only take the first two features for visualization\ny = iris.target\n\nn_features = X.shape[1]\n\nC = 10\nkernel = 1.0 * RBF([1.0, 1.0]) # for GPC\n\n# Create different classifiers.\nclassifiers = {\n \"L1 logistic\": LogisticRegression(\n C=C, penalty=\"l1\", solver=\"saga\", multi_class=\"multinomial\", max_iter=10000\n ),\n \"L2 logistic (Multinomial)\": LogisticRegression(\n C=C, penalty=\"l2\", solver=\"saga\", multi_class=\"multinomial\", max_iter=10000\n ),\n \"L2 logistic (OvR)\": LogisticRegression(\n C=C, penalty=\"l2\", solver=\"saga\", multi_class=\"ovr\", max_iter=10000\n ),\n \"Linear SVC\": SVC(kernel=\"linear\", C=C, probability=True, random_state=0),\n \"GPC\": GaussianProcessClassifier(kernel),\n}\n\nn_classifiers = len(classifiers)\n\nfig, axes = plt.subplots(\n nrows=n_classifiers,\n ncols=len(iris.target_names),\n figsize=(3 * 2, n_classifiers * 2),\n)\nfor classifier_idx, (name, classifier) in enumerate(classifiers.items()):\n y_pred = classifier.fit(X, y).predict(X)\n accuracy = accuracy_score(y, y_pred)\n print(f\"Accuracy (train) for {name}: {accuracy:0.1%}\")\n for label in np.unique(y):\n # plot the probability estimate provided by the classifier\n disp = DecisionBoundaryDisplay.from_estimator(\n classifier,\n X,\n response_method=\"predict_proba\",\n class_of_interest=label,\n ax=axes[classifier_idx, label],\n vmin=0,\n vmax=1,\n )\n axes[classifier_idx, label].set_title(f\"Class {label}\")\n # plot data predicted to belong to given class\n mask_y_pred = y_pred == label\n axes[classifier_idx, label].scatter(\n X[mask_y_pred, 0], X[mask_y_pred, 1], marker=\"o\", c=\"w\", edgecolor=\"k\"\n )\n axes[classifier_idx, label].set(xticks=(), yticks=())\n axes[classifier_idx, 0].set_ylabel(name)\n\nax = plt.axes([0.15, 0.04, 0.7, 0.02])\nplt.title(\"Probability\")\n_ = plt.colorbar(\n cm.ScalarMappable(norm=None, cmap=\"viridis\"), cax=ax, orientation=\"horizontal\"\n)\n\nplt.show()"
]
}
],
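
Note on the change above: the rewritten example drops the manual `plt.imshow` probability grids in favour of `DecisionBoundaryDisplay.from_estimator(..., response_method="predict_proba", class_of_interest=...)`. A minimal standalone sketch of that API, assuming a scikit-learn version with `class_of_interest` support (as used by the new example); the classifier and figure handling here are chosen only for illustration:

```python
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.linear_model import LogisticRegression

# Illustrative sketch: draw the predicted probability of one class over a
# 2D feature grid, which is what each subplot of the updated example shows.
iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target

clf = LogisticRegression(max_iter=1000).fit(X, y)

fig, ax = plt.subplots()
DecisionBoundaryDisplay.from_estimator(
    clf,
    X,
    response_method="predict_proba",  # plot probabilities, not hard labels
    class_of_interest=2,              # probability surface for class 2 only
    ax=ax,
    vmin=0,
    vmax=1,
)
ax.scatter(X[y == 2, 0], X[y == 2, 1], marker="o", c="w", edgecolor="k")
ax.set_title("P(class 2)")
plt.show()
```
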
(2 binary files changed; contents not shown)

dev/_downloads/7012baed63b9a27f121bae611b8285c2/plot_cyclical_feature_engineering.ipynb

Lines changed: 7 additions & 7 deletions
@@ -65,7 +65,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-
"Let us rescale the target variable (number of hourly bike rentals) to predict\na relative demand so that the mean absolute error is more easily interpreted\nas a fraction of the maximum demand.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>The fit method of the models used in this notebook all minimize the\n mean squared error to estimate the conditional mean instead of the mean\n absolute error that would fit an estimator of the conditional median.\n\n When reporting performance measure on the test set in the discussion, we\n instead choose to focus on the mean absolute error that is more\n intuitive than the (root) mean squared error. Note, however, that the\n best models for one metric are also the best for the other in this\n study.</p></div>\n\n"
+
"Let us rescale the target variable (number of hourly bike rentals) to predict\na relative demand so that the mean absolute error is more easily interpreted\nas a fraction of the maximum demand.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>The fit method of the models used in this notebook all minimize the\n mean squared error to estimate the conditional mean.\n The absolute error, however, would estimate the conditional median.\n\n Nevertheless, when reporting performance measures on the test set in\n the discussion, we choose to focus on the mean absolute error instead\n of the (root) mean squared error because it is more intuitive to\n interpret. Note, however, that in this study the best models for one\n metric are also the best ones in terms of the other metric.</p></div>\n\n"
]
},
{
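
The reworded note above contrasts the conditional mean (what squared-error fitting estimates) with the conditional median (what absolute-error fitting estimates). A small numerical illustration of that point, using a made-up skewed sample and a brute-force grid of candidate constants (not part of the notebook):

```python
import numpy as np

# For a skewed sample, the constant minimizing the mean squared error is the
# sample mean, while the constant minimizing the mean absolute error is the
# sample median -- the two losses estimate different quantities.
rng = np.random.default_rng(0)
y = rng.exponential(scale=1.0, size=5_000)

candidates = np.linspace(0.0, 3.0, 601)
mse = ((y[:, None] - candidates) ** 2).mean(axis=0)
mae = np.abs(y[:, None] - candidates).mean(axis=0)

print(f"MSE minimizer ~ {candidates[mse.argmin()]:.2f}, sample mean   = {y.mean():.2f}")
print(f"MAE minimizer ~ {candidates[mae.argmin()]:.2f}, sample median = {np.median(y):.2f}")
```
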
@@ -275,7 +275,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-
"All is well. We are now ready to do some predictive modeling!\n\n## Gradient Boosting\n\nGradient Boosting Regression with decision trees is often flexible enough to\nefficiently handle heteorogenous tabular data with a mix of categorical and\nnumerical features as long as the number of samples is large enough.\n\nHere, we do minimal ordinal encoding for the categorical variables and then\nlet the model know that it should treat those as categorical variables by\nusing a dedicated tree splitting rule. Since we use an ordinal encoder, we\npass the list of categorical values explicitly to use a logical order when\nencoding the categories as integers instead of the lexicographical order.\nThis also has the added benefit of preventing any issue with unknown\ncategories when using cross-validation.\n\nThe numerical variables need no preprocessing and, for the sake of simplicity,\nwe only try the default hyper-parameters for this model:\n\n"
+
"All is well. We are now ready to do some predictive modeling!\n\n## Gradient Boosting\n\nGradient Boosting Regression with decision trees is often flexible enough to\nefficiently handle heteorogenous tabular data with a mix of categorical and\nnumerical features as long as the number of samples is large enough.\n\nHere, we use the modern\n:class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support\nfor categorical features. Therefore, we only do minimal ordinal encoding for\nthe categorical variables and then\nlet the model know that it should treat those as categorical variables by\nusing a dedicated tree splitting rule. Since we use an ordinal encoder, we\npass the list of categorical values explicitly to use a logical order when\nencoding the categories as integers instead of the lexicographical order.\nThis also has the added benefit of preventing any issue with unknown\ncategories when using cross-validation.\n\nThe numerical variables need no preprocessing and, for the sake of simplicity,\nwe only try the default hyper-parameters for this model:\n\n"
]
},
{
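
The updated paragraph notes that `HistGradientBoostingRegressor` splits on categorical features natively, so the pipeline only needs an ordinal encoding, with the category lists given explicitly to obtain a logical rather than lexicographic order. A minimal sketch of that encoding detail; the toy `season` column is invented for illustration:

```python
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder

# Toy column for illustration only.
df = pd.DataFrame({"season": ["spring", "summer", "fall", "winter"]})

# Default: categories are sorted lexicographically (fall=0, spring=1, ...).
lexicographic = OrdinalEncoder().fit_transform(df).ravel()

# Explicit `categories`: the integer codes follow the seasonal order instead.
logical = OrdinalEncoder(
    categories=[["spring", "summer", "fall", "winter"]]
).fit_transform(df).ravel()

print("lexicographic:", lexicographic)  # [1. 2. 0. 3.]
print("logical:      ", logical)        # [0. 1. 2. 3.]
```
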
@@ -286,7 +286,7 @@
},
"outputs": [],
"source": [
-
"from sklearn.compose import ColumnTransformer\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.model_selection import cross_validate\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import OrdinalEncoder\n\ncategorical_columns = [\n \"weather\",\n \"season\",\n \"holiday\",\n \"workingday\",\n]\ncategories = [\n [\"clear\", \"misty\", \"rain\"],\n [\"spring\", \"summer\", \"fall\", \"winter\"],\n [\"False\", \"True\"],\n [\"False\", \"True\"],\n]\nordinal_encoder = OrdinalEncoder(categories=categories)\n\n\ngbrt_pipeline = make_pipeline(\n ColumnTransformer(\n transformers=[\n (\"categorical\", ordinal_encoder, categorical_columns),\n ],\n remainder=\"passthrough\",\n # Use short feature names to make it easier to specify the categorical\n # variables in the HistGradientBoostingRegressor in the next\n # step of the pipeline.\n verbose_feature_names_out=False,\n ),\n HistGradientBoostingRegressor(\n categorical_features=categorical_columns,\n random_state=42,\n ),\n).set_output(transform=\"pandas\")"
+
"from sklearn.compose import ColumnTransformer\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.model_selection import cross_validate\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import OrdinalEncoder\n\ncategorical_columns = [\n \"weather\",\n \"season\",\n \"holiday\",\n \"workingday\",\n]\ncategories = [\n [\"clear\", \"misty\", \"rain\"],\n [\"spring\", \"summer\", \"fall\", \"winter\"],\n [\"False\", \"True\"],\n [\"False\", \"True\"],\n]\nordinal_encoder = OrdinalEncoder(categories=categories)\n\n\ngbrt_pipeline = make_pipeline(\n ColumnTransformer(\n transformers=[\n (\"categorical\", ordinal_encoder, categorical_columns),\n ],\n remainder=\"passthrough\",\n # Use short feature names to make it easier to specify the categorical\n # variables in the HistGradientBoostingRegressor in the next\n # step of the pipeline.\n verbose_feature_names_out=False,\n ),\n HistGradientBoostingRegressor(\n max_iter=300,\n early_stopping=True,\n validation_fraction=0.1,\n categorical_features=categorical_columns,\n random_state=42,\n ),\n).set_output(transform=\"pandas\")"
]
},
{
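
The pipeline change above turns on early stopping (`max_iter=300`, `early_stopping=True`, `validation_fraction=0.1`); how many boosting iterations actually ran is inspected later through the fitted `n_iter_` attribute. A short sketch of the mechanism on synthetic data; only the estimator parameters mirror the commit:

```python
from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor

# Synthetic regression data for illustration only.
X, y = make_regression(n_samples=5_000, n_features=10, noise=10.0, random_state=0)

gbrt = HistGradientBoostingRegressor(
    max_iter=300,             # upper bound on boosting iterations
    early_stopping=True,      # hold out part of the training data ...
    validation_fraction=0.1,  # ... and stop once its score stops improving
    random_state=42,
).fit(X, y)

# With early stopping, the fitted number of iterations can be well below max_iter.
print(f"boosting stopped after {gbrt.n_iter_} iterations (max_iter=300)")
```
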
@@ -304,14 +304,14 @@
},
"outputs": [],
"source": [
-
"def evaluate(model, X, y, cv):\n cv_results = cross_validate(\n model,\n X,\n y,\n cv=cv,\n scoring=[\"neg_mean_absolute_error\", \"neg_root_mean_squared_error\"],\n )\n mae = -cv_results[\"test_neg_mean_absolute_error\"]\n rmse = -cv_results[\"test_neg_root_mean_squared_error\"]\n print(\n f\"Mean Absolute Error: {mae.mean():.3f} +/- {mae.std():.3f}\\n\"\n f\"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}\"\n )\n\n\nevaluate(gbrt_pipeline, X, y, cv=ts_cv)"
+
"import numpy as np\n\n\ndef evaluate(model, X, y, cv, model_prop=None, model_step=None):\n cv_results = cross_validate(\n model,\n X,\n y,\n cv=cv,\n scoring=[\"neg_mean_absolute_error\", \"neg_root_mean_squared_error\"],\n return_estimator=model_prop is not None,\n )\n if model_prop is not None:\n if model_step is not None:\n values = [\n getattr(m[model_step], model_prop) for m in cv_results[\"estimator\"]\n ]\n else:\n values = [getattr(m, model_prop) for m in cv_results[\"estimator\"]]\n print(f\"Mean model.{model_prop} = {np.mean(values)}\")\n mae = -cv_results[\"test_neg_mean_absolute_error\"]\n rmse = -cv_results[\"test_neg_root_mean_squared_error\"]\n print(\n f\"Mean Absolute Error: {mae.mean():.3f} +/- {mae.std():.3f}\\n\"\n f\"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}\"\n )\n\n\nevaluate(\n gbrt_pipeline,\n X,\n y,\n cv=ts_cv,\n model_prop=\"n_iter_\",\n model_step=\"histgradientboostingregressor\",\n)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
-
"This model has an average error around 4 to 5% of the maximum demand. This is\nquite good for a first trial without any hyper-parameter tuning! We just had\nto make the categorical variables explicit. Note that the time related\nfeatures are passed as is, i.e. without processing them. But this is not much\nof a problem for tree-based models as they can learn a non-monotonic\nrelationship between ordinal input features and the target.\n\nThis is not the case for linear regression models as we will see in the\nfollowing.\n\n## Naive linear regression\n\nAs usual for linear models, categorical variables need to be one-hot encoded.\nFor consistency, we scale the numerical features to the same 0-1 range using\nclass:`sklearn.preprocessing.MinMaxScaler`, although in this case it does not\nimpact the results much because they are already on comparable scales:\n\n"
+
"We see that we set `max_iter` large enough such that early stopping took place.\n\nThis model has an average error around 4 to 5% of the maximum demand. This is\nquite good for a first trial without any hyper-parameter tuning! We just had\nto make the categorical variables explicit. Note that the time related\nfeatures are passed as is, i.e. without processing them. But this is not much\nof a problem for tree-based models as they can learn a non-monotonic\nrelationship between ordinal input features and the target.\n\nThis is not the case for linear regression models as we will see in the\nfollowing.\n\n## Naive linear regression\n\nAs usual for linear models, categorical variables need to be one-hot encoded.\nFor consistency, we scale the numerical features to the same 0-1 range using\n:class:`~sklearn.preprocessing.MinMaxScaler`, although in this case it does not\nimpact the results much because they are already on comparable scales:\n\n"
]
},
{
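
The extended `evaluate` helper above relies on `cross_validate(..., return_estimator=True)` to read a fitted attribute from each fold's estimator (here `n_iter_` of the boosting step). A reduced sketch of the same pattern outside the pipeline, with an illustrative dataset and estimator:

```python
import numpy as np

from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate

# Synthetic data for illustration only.
X, y = make_regression(n_samples=2_000, n_features=8, noise=5.0, random_state=0)

cv_results = cross_validate(
    HistGradientBoostingRegressor(early_stopping=True, random_state=42),
    X,
    y,
    cv=5,
    scoring="neg_mean_absolute_error",
    return_estimator=True,  # keep each fold's fitted model for inspection
)

# Inspect a fitted attribute per fold, as evaluate() does via model_prop.
n_iters = [est.n_iter_ for est in cv_results["estimator"]]
print("n_iter_ per fold:", n_iters, "mean:", np.mean(n_iters))
print("MAE per fold:", -cv_results["test_score"])
```
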
@@ -322,14 +322,14 @@
},
"outputs": [],
"source": [
-
"import numpy as np\n\nfrom sklearn.linear_model import RidgeCV\nfrom sklearn.preprocessing import MinMaxScaler, OneHotEncoder\n\none_hot_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)\nalphas = np.logspace(-6, 6, 25)\nnaive_linear_pipeline = make_pipeline(\n ColumnTransformer(\n transformers=[\n (\"categorical\", one_hot_encoder, categorical_columns),\n ],\n remainder=MinMaxScaler(),\n ),\n RidgeCV(alphas=alphas),\n)\n\n\nevaluate(naive_linear_pipeline, X, y, cv=ts_cv)"
+
"from sklearn.linear_model import RidgeCV\nfrom sklearn.preprocessing import MinMaxScaler, OneHotEncoder\n\none_hot_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)\nalphas = np.logspace(-6, 6, 25)\nnaive_linear_pipeline = make_pipeline(\n ColumnTransformer(\n transformers=[\n (\"categorical\", one_hot_encoder, categorical_columns),\n ],\n remainder=MinMaxScaler(),\n ),\n RidgeCV(alphas=alphas),\n)\n\n\nevaluate(\n naive_linear_pipeline, X, y, cv=ts_cv, model_prop=\"alpha_\", model_step=\"ridgecv\"\n)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
-
"The performance is not good: the average error is around 14% of the maximum\ndemand. This is more than three times higher than the average error of the\ngradient boosting model. We can suspect that the naive original encoding\n(merely min-max scaled) of the periodic time-related features might prevent\nthe linear regression model to properly leverage the time information: linear\nregression does not automatically model non-monotonic relationships between\nthe input features and the target. Non-linear terms have to be engineered in\nthe input.\n\nFor example, the raw numerical encoding of the `\"hour\"` feature prevents the\nlinear model from recognizing that an increase of hour in the morning from 6\nto 8 should have a strong positive impact on the number of bike rentals while\nan increase of similar magnitude in the evening from 18 to 20 should have a\nstrong negative impact on the predicted number of bike rentals.\n\n## Time-steps as categories\n\nSince the time features are encoded in a discrete manner using integers (24\nunique values in the \"hours\" feature), we could decide to treat those as\ncategorical variables using a one-hot encoding and thereby ignore any\nassumption implied by the ordering of the hour values.\n\nUsing one-hot encoding for the time features gives the linear model a lot\nmore flexibility as we introduce one additional feature per discrete time\nlevel.\n\n"
+
"It is affirmative to see that the selected `alpha_` is in our specified\nrange.\n\nThe performance is not good: the average error is around 14% of the maximum\ndemand. This is more than three times higher than the average error of the\ngradient boosting model. We can suspect that the naive original encoding\n(merely min-max scaled) of the periodic time-related features might prevent\nthe linear regression model to properly leverage the time information: linear\nregression does not automatically model non-monotonic relationships between\nthe input features and the target. Non-linear terms have to be engineered in\nthe input.\n\nFor example, the raw numerical encoding of the `\"hour\"` feature prevents the\nlinear model from recognizing that an increase of hour in the morning from 6\nto 8 should have a strong positive impact on the number of bike rentals while\nan increase of similar magnitude in the evening from 18 to 20 should have a\nstrong negative impact on the predicted number of bike rentals.\n\n## Time-steps as categories\n\nSince the time features are encoded in a discrete manner using integers (24\nunique values in the \"hours\" feature), we could decide to treat those as\ncategorical variables using a one-hot encoding and thereby ignore any\nassumption implied by the ordering of the hour values.\n\nUsing one-hot encoding for the time features gives the linear model a lot\nmore flexibility as we introduce one additional feature per discrete time\nlevel.\n\n"
]
},
{
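
The final change reports the `alpha_` selected by `RidgeCV` so one can confirm it lies inside the searched grid rather than at an endpoint. A compact sketch of that check; the synthetic data below is illustrative and only the `alphas` grid matches the example:

```python
import numpy as np

from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV

# Synthetic data for illustration only.
X, y = make_regression(n_samples=500, n_features=20, noise=30.0, random_state=0)

alphas = np.logspace(-6, 6, 25)  # same grid as in the notebook
ridge = RidgeCV(alphas=alphas).fit(X, y)

# If alpha_ ends up at either edge of the grid, the searched range was too
# narrow and should be widened before trusting the result.
print(f"selected alpha_: {ridge.alpha_:.3g} (grid spans {alphas[0]:.0e} to {alphas[-1]:.0e})")
```
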
