scikit-learn
diff --git a/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion b/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/0ca65f327d0d82be7fdda748f857d5b4/plot_poisson_regression_non_normal_loss.ipynb
Lines changed: 2 additions & 2 deletions b/‎dev/_downloads/0ca65f327d0d82be7fdda748f857d5b4/plot_poisson_regression_non_normal_loss.ipynb
Lines changed: 2 additions & 2 deletions
diff --git a/‎dev/_downloads/3409d9766d352cc9f9b169d4a799a87a/auto_examples_python.zip
-6 Bytes b/‎dev/_downloads/3409d9766d352cc9f9b169d4a799a87a/auto_examples_python.zip
-6 Bytes
diff --git a/‎dev/_downloads/d34667f097c619f8afda4bc936e7af21/auto_examples_jupyter.zip
-6 Bytes b/‎dev/_downloads/d34667f097c619f8afda4bc936e7af21/auto_examples_jupyter.zip
-6 Bytes
diff --git a/‎dev/_downloads/f686bae9e47a0517ddbf86ced97151b6/plot_poisson_regression_non_normal_loss.py
Lines changed: 4 additions & 4 deletions b/‎dev/_downloads/f686bae9e47a0517ddbf86ced97151b6/plot_poisson_regression_non_normal_loss.py
Lines changed: 4 additions & 4 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.pdf
-19.2 KB b/‎dev/_downloads/scikit-learn-docs.pdf
-19.2 KB
diff --git a/‎dev/_images/iris.png
0 Bytes b/‎dev/_images/iris.png
0 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
101 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
101 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
101 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
101 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
549 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
549 Bytes
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 7bef3f4cbec57a7ed13fd6acc3d0e3d6
+config: 346f7da1d8766fdb4e3779d11f156832
 tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -213,7 +213,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "The ``Ridge`` regression model can predict very low expected frequencies\nthat do not match the data. It can therefore severly under-estimate the risk\nfor some policyholders.\n\n``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency\nbetween predicted and observed targets, especially for low predicted target\nvalues.\n\nHowever, for some business applications, we are not necessarily interested\nin the ability of the model to predict the expected frequency value, but\ninstead to predict which policyholder groups are the riskiest and which are\nthe safest. In this case, the model evaluation would cast the problem as a\nranking problem rather than a regression problem.\n\nTo compare the 3 models within this perspective, one can plot the fraction of\nthe number of claims vs the fraction of exposure for test samples ordered by\nthe model predictions, from riskiest to safest according to each model:\n\n"
+        "The ``Ridge`` regression model can predict very low expected frequencies\nthat do not match the data. It can therefore severely under-estimate the risk\nfor some policyholders.\n\n``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency\nbetween predicted and observed targets, especially for low predicted target\nvalues.\n\nHowever, for some business applications, we are not necessarily interested\nin the ability of the model to predict the expected frequency value, but\ninstead to predict which policyholder groups are the riskiest and which are\nthe safest. In this case, the model evaluation would cast the problem as a\nranking problem rather than a regression problem.\n\nTo compare the 3 models within this perspective, one can plot the fraction of\nthe number of claims vs the fraction of exposure for test samples ordered by\nthe model predictions, from safest to riskiest according to each model:\n\n"
       ]
     },
     {
@@ -224,7 +224,7 @@
       },
       "outputs": [],
       "source": [
-        "def _cumulated_claims(y_true, y_pred, exposure):\n    idx_sort = np.argsort(y_pred)[::-1]  # from riskiest to safest\n    sorted_exposure = exposure[idx_sort]\n    sorted_frequencies = y_true[idx_sort]\n    cumulated_exposure = np.cumsum(sorted_exposure)\n    cumulated_exposure /= cumulated_exposure[-1]\n    cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies)\n    cumulated_claims /= cumulated_claims[-1]\n    return cumulated_exposure, cumulated_claims\n\n\nfig, ax = plt.subplots(figsize=(8, 8))\n\nfor model in [ridge, poisson, rf]:\n    y_pred = model.predict(df_test)\n    cum_exposure, cum_claims = _cumulated_claims(\n        df_test[\"Frequency\"].values,\n        y_pred,\n        df_test[\"Exposure\"].values)\n    area = auc(cum_exposure, cum_claims)\n    label = \"{} (area under curve: {:.3f})\".format(\n        model[-1].__class__.__name__, area)\n    ax.plot(cum_exposure, cum_claims, linestyle=\"-\", label=label)\n\n# Oracle model: y_pred == y_test\ncum_exposure, cum_claims = _cumulated_claims(\n    df_test[\"Frequency\"].values,\n    df_test[\"Frequency\"].values,\n    df_test[\"Exposure\"].values)\narea = auc(cum_exposure, cum_claims)\nlabel = \"Oracle (area under curve: {:.3f})\".format(area)\nax.plot(cum_exposure, cum_claims, linestyle=\"-.\", color=\"gray\", label=label)\n\n# Random Baseline\nax.plot([0, 1], [0, 1], linestyle=\"--\", color=\"black\",\n        label=\"Random baseline\")\nax.set(\n    title=\"Cumulated number of claims by model\",\n    xlabel='Fraction of exposure (from riskiest to safest)',\n    ylabel='Fraction of number of claims'\n)\nax.legend(loc=\"lower right\")"
+        "def _cumulated_claims(y_true, y_pred, exposure):\n    idx_sort = np.argsort(y_pred)  # from safest to riskiest\n    sorted_exposure = exposure[idx_sort]\n    sorted_frequencies = y_true[idx_sort]\n    cumulated_exposure = np.cumsum(sorted_exposure)\n    cumulated_exposure /= cumulated_exposure[-1]\n    cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies)\n    cumulated_claims /= cumulated_claims[-1]\n    return cumulated_exposure, cumulated_claims\n\n\nfig, ax = plt.subplots(figsize=(8, 8))\n\nfor model in [ridge, poisson, rf]:\n    y_pred = model.predict(df_test)\n    cum_exposure, cum_claims = _cumulated_claims(\n        df_test[\"Frequency\"].values,\n        y_pred,\n        df_test[\"Exposure\"].values)\n    area = auc(cum_exposure, cum_claims)\n    label = \"{} (area under curve: {:.3f})\".format(\n        model[-1].__class__.__name__, area)\n    ax.plot(cum_exposure, cum_claims, linestyle=\"-\", label=label)\n\n# Oracle model: y_pred == y_test\ncum_exposure, cum_claims = _cumulated_claims(\n    df_test[\"Frequency\"].values,\n    df_test[\"Frequency\"].values,\n    df_test[\"Exposure\"].values)\narea = auc(cum_exposure, cum_claims)\nlabel = \"Oracle (area under curve: {:.3f})\".format(area)\nax.plot(cum_exposure, cum_claims, linestyle=\"-.\", color=\"gray\", label=label)\n\n# Random Baseline\nax.plot([0, 1], [0, 1], linestyle=\"--\", color=\"black\",\n        label=\"Random baseline\")\nax.set(\n    title=\"Cumulated number of claims by model\",\n    xlabel='Fraction of exposure (from safest to riskiest)',\n    ylabel='Fraction of number of claims'\n)\nax.legend(loc=\"upper left\")"
       ]
     },
     {
 
@@ -393,11 +393,11 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
 #
 # To compare the 3 models within this perspective, one can plot the fraction of
 # the number of claims vs the fraction of exposure for test samples ordered by
-# the model predictions, from riskiest to safest according to each model:
+# the model predictions, from safest to riskiest according to each model:
 
 
 def _cumulated_claims(y_true, y_pred, exposure):
-    idx_sort = np.argsort(y_pred)[::-1]  # from riskiest to safest
+    idx_sort = np.argsort(y_pred)  # from safest to riskiest
     sorted_exposure = exposure[idx_sort]
     sorted_frequencies = y_true[idx_sort]
     cumulated_exposure = np.cumsum(sorted_exposure)
@@ -434,10 +434,10 @@ def _cumulated_claims(y_true, y_pred, exposure):
         label="Random baseline")
 ax.set(
     title="Cumulated number of claims by model",
-    xlabel='Fraction of exposure (from riskiest to safest)',
+    xlabel='Fraction of exposure (from safest to riskiest)',
     ylabel='Fraction of number of claims'
 )
-ax.legend(loc="lower right")
+ax.legend(loc="upper left")
 
 ##############################################################################
 # This plot reveals that the random forest model is slightly better at ranking
Original file line number	Diff line number	Diff line change
`@@ -213,7 +213,7 @@`
`213`	`213`	`"cell_type": "markdown",`
`214`	`214`	`"metadata": {},`
`215`	`215`	`"source": [`
`216`		- "The ``Ridge`` regression model can predict very low expected frequencies\nthat do not match the data. It can therefore severly under-estimate the risk\nfor some policyholders.\n\n``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency\nbetween predicted and observed targets, especially for low predicted target\nvalues.\n\nHowever, for some business applications, we are not necessarily interested\nin the ability of the model to predict the expected frequency value, but\ninstead to predict which policyholder groups are the riskiest and which are\nthe safest. In this case, the model evaluation would cast the problem as a\nranking problem rather than a regression problem.\n\nTo compare the 3 models within this perspective, one can plot the fraction of\nthe number of claims vs the fraction of exposure for test samples ordered by\nthe model predictions, from riskiest to safest according to each model:\n\n"
	`216`	+ "The ``Ridge`` regression model can predict very low expected frequencies\nthat do not match the data. It can therefore severely under-estimate the risk\nfor some policyholders.\n\n``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency\nbetween predicted and observed targets, especially for low predicted target\nvalues.\n\nHowever, for some business applications, we are not necessarily interested\nin the ability of the model to predict the expected frequency value, but\ninstead to predict which policyholder groups are the riskiest and which are\nthe safest. In this case, the model evaluation would cast the problem as a\nranking problem rather than a regression problem.\n\nTo compare the 3 models within this perspective, one can plot the fraction of\nthe number of claims vs the fraction of exposure for test samples ordered by\nthe model predictions, from safest to riskiest according to each model:\n\n"
`217`	`217`	`]`
`218`	`218`	`},`
`219`	`219`	`{`
`@@ -224,7 +224,7 @@`
`224`	`224`	`},`
`225`	`225`	`"outputs": [],`
`226`	`226`	`"source": [`
`227`		- "def _cumulated_claims(y_true, y_pred, exposure):\n idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest\n sorted_exposure = exposure[idx_sort]\n sorted_frequencies = y_true[idx_sort]\n cumulated_exposure = np.cumsum(sorted_exposure)\n cumulated_exposure /= cumulated_exposure[-1]\n cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies)\n cumulated_claims /= cumulated_claims[-1]\n return cumulated_exposure, cumulated_claims\n\n\nfig, ax = plt.subplots(figsize=(8, 8))\n\nfor model in [ridge, poisson, rf]:\n y_pred = model.predict(df_test)\n cum_exposure, cum_claims = _cumulated_claims(\n df_test[\"Frequency\"].values,\n y_pred,\n df_test[\"Exposure\"].values)\n area = auc(cum_exposure, cum_claims)\n label = \"{} (area under curve: {:.3f})\".format(\n model[-1].__class__.__name__, area)\n ax.plot(cum_exposure, cum_claims, linestyle=\"-\", label=label)\n\n# Oracle model: y_pred == y_test\ncum_exposure, cum_claims = _cumulated_claims(\n df_test[\"Frequency\"].values,\n df_test[\"Frequency\"].values,\n df_test[\"Exposure\"].values)\narea = auc(cum_exposure, cum_claims)\nlabel = \"Oracle (area under curve: {:.3f})\".format(area)\nax.plot(cum_exposure, cum_claims, linestyle=\"-.\", color=\"gray\", label=label)\n\n# Random Baseline\nax.plot([0, 1], [0, 1], linestyle=\"--\", color=\"black\",\n label=\"Random baseline\")\nax.set(\n title=\"Cumulated number of claims by model\",\n xlabel='Fraction of exposure (from riskiest to safest)',\n ylabel='Fraction of number of claims'\n)\nax.legend(loc=\"lower right\")"
	`227`	+ "def _cumulated_claims(y_true, y_pred, exposure):\n idx_sort = np.argsort(y_pred) # from safest to riskiest\n sorted_exposure = exposure[idx_sort]\n sorted_frequencies = y_true[idx_sort]\n cumulated_exposure = np.cumsum(sorted_exposure)\n cumulated_exposure /= cumulated_exposure[-1]\n cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies)\n cumulated_claims /= cumulated_claims[-1]\n return cumulated_exposure, cumulated_claims\n\n\nfig, ax = plt.subplots(figsize=(8, 8))\n\nfor model in [ridge, poisson, rf]:\n y_pred = model.predict(df_test)\n cum_exposure, cum_claims = _cumulated_claims(\n df_test[\"Frequency\"].values,\n y_pred,\n df_test[\"Exposure\"].values)\n area = auc(cum_exposure, cum_claims)\n label = \"{} (area under curve: {:.3f})\".format(\n model[-1].__class__.__name__, area)\n ax.plot(cum_exposure, cum_claims, linestyle=\"-\", label=label)\n\n# Oracle model: y_pred == y_test\ncum_exposure, cum_claims = _cumulated_claims(\n df_test[\"Frequency\"].values,\n df_test[\"Frequency\"].values,\n df_test[\"Exposure\"].values)\narea = auc(cum_exposure, cum_claims)\nlabel = \"Oracle (area under curve: {:.3f})\".format(area)\nax.plot(cum_exposure, cum_claims, linestyle=\"-.\", color=\"gray\", label=label)\n\n# Random Baseline\nax.plot([0, 1], [0, 1], linestyle=\"--\", color=\"black\",\n label=\"Random baseline\")\nax.set(\n title=\"Cumulated number of claims by model\",\n xlabel='Fraction of exposure (from safest to riskiest)',\n ylabel='Fraction of number of claims'\n)\nax.legend(loc=\"upper left\")"
`228`	`228`	`]`
`229`	`229`	`},`
`230`	`230`	`{`