Skip to content

Commit 2e26067

Browse files
committed
Pushing the docs to dev/ for branch: master, commit 018c6dc57d21c89c7d1278c686c7d5d62f32ee48
1 parent 2143511 commit 2e26067

File tree

1,228 files changed

+4289
-5025
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,228 files changed

+4289
-5025
lines changed

dev/.buildinfo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: 7bef3f4cbec57a7ed13fd6acc3d0e3d6
3+
config: 346f7da1d8766fdb4e3779d11f156832
44
tags: 645f666f9bcd5a90fca523b33c5a78b7

dev/_downloads/0ca65f327d0d82be7fdda748f857d5b4/plot_poisson_regression_non_normal_loss.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@
213213
"cell_type": "markdown",
214214
"metadata": {},
215215
"source": [
216-
"The ``Ridge`` regression model can predict very low expected frequencies\nthat do not match the data. It can therefore severly under-estimate the risk\nfor some policyholders.\n\n``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency\nbetween predicted and observed targets, especially for low predicted target\nvalues.\n\nHowever, for some business applications, we are not necessarily interested\nin the ability of the model to predict the expected frequency value, but\ninstead to predict which policyholder groups are the riskiest and which are\nthe safest. In this case, the model evaluation would cast the problem as a\nranking problem rather than a regression problem.\n\nTo compare the 3 models within this perspective, one can plot the fraction of\nthe number of claims vs the fraction of exposure for test samples ordered by\nthe model predictions, from riskiest to safest according to each model:\n\n"
216+
"The ``Ridge`` regression model can predict very low expected frequencies\nthat do not match the data. It can therefore severly under-estimate the risk\nfor some policyholders.\n\n``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency\nbetween predicted and observed targets, especially for low predicted target\nvalues.\n\nHowever, for some business applications, we are not necessarily interested\nin the ability of the model to predict the expected frequency value, but\ninstead to predict which policyholder groups are the riskiest and which are\nthe safest. In this case, the model evaluation would cast the problem as a\nranking problem rather than a regression problem.\n\nTo compare the 3 models within this perspective, one can plot the fraction of\nthe number of claims vs the fraction of exposure for test samples ordered by\nthe model predictions, from safest to riskiest according to each model:\n\n"
217217
]
218218
},
219219
{
@@ -224,7 +224,7 @@
224224
},
225225
"outputs": [],
226226
"source": [
227-
"def _cumulated_claims(y_true, y_pred, exposure):\n idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest\n sorted_exposure = exposure[idx_sort]\n sorted_frequencies = y_true[idx_sort]\n cumulated_exposure = np.cumsum(sorted_exposure)\n cumulated_exposure /= cumulated_exposure[-1]\n cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies)\n cumulated_claims /= cumulated_claims[-1]\n return cumulated_exposure, cumulated_claims\n\n\nfig, ax = plt.subplots(figsize=(8, 8))\n\nfor model in [ridge, poisson, rf]:\n y_pred = model.predict(df_test)\n cum_exposure, cum_claims = _cumulated_claims(\n df_test[\"Frequency\"].values,\n y_pred,\n df_test[\"Exposure\"].values)\n area = auc(cum_exposure, cum_claims)\n label = \"{} (area under curve: {:.3f})\".format(\n model[-1].__class__.__name__, area)\n ax.plot(cum_exposure, cum_claims, linestyle=\"-\", label=label)\n\n# Oracle model: y_pred == y_test\ncum_exposure, cum_claims = _cumulated_claims(\n df_test[\"Frequency\"].values,\n df_test[\"Frequency\"].values,\n df_test[\"Exposure\"].values)\narea = auc(cum_exposure, cum_claims)\nlabel = \"Oracle (area under curve: {:.3f})\".format(area)\nax.plot(cum_exposure, cum_claims, linestyle=\"-.\", color=\"gray\", label=label)\n\n# Random Baseline\nax.plot([0, 1], [0, 1], linestyle=\"--\", color=\"black\",\n label=\"Random baseline\")\nax.set(\n title=\"Cumulated number of claims by model\",\n xlabel='Fraction of exposure (from riskiest to safest)',\n ylabel='Fraction of number of claims'\n)\nax.legend(loc=\"lower right\")"
227+
"def _cumulated_claims(y_true, y_pred, exposure):\n idx_sort = np.argsort(y_pred) # from safest to riskiest\n sorted_exposure = exposure[idx_sort]\n sorted_frequencies = y_true[idx_sort]\n cumulated_exposure = np.cumsum(sorted_exposure)\n cumulated_exposure /= cumulated_exposure[-1]\n cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies)\n cumulated_claims /= cumulated_claims[-1]\n return cumulated_exposure, cumulated_claims\n\n\nfig, ax = plt.subplots(figsize=(8, 8))\n\nfor model in [ridge, poisson, rf]:\n y_pred = model.predict(df_test)\n cum_exposure, cum_claims = _cumulated_claims(\n df_test[\"Frequency\"].values,\n y_pred,\n df_test[\"Exposure\"].values)\n area = auc(cum_exposure, cum_claims)\n label = \"{} (area under curve: {:.3f})\".format(\n model[-1].__class__.__name__, area)\n ax.plot(cum_exposure, cum_claims, linestyle=\"-\", label=label)\n\n# Oracle model: y_pred == y_test\ncum_exposure, cum_claims = _cumulated_claims(\n df_test[\"Frequency\"].values,\n df_test[\"Frequency\"].values,\n df_test[\"Exposure\"].values)\narea = auc(cum_exposure, cum_claims)\nlabel = \"Oracle (area under curve: {:.3f})\".format(area)\nax.plot(cum_exposure, cum_claims, linestyle=\"-.\", color=\"gray\", label=label)\n\n# Random Baseline\nax.plot([0, 1], [0, 1], linestyle=\"--\", color=\"black\",\n label=\"Random baseline\")\nax.set(\n title=\"Cumulated number of claims by model\",\n xlabel='Fraction of exposure (from safest to riskiest)',\n ylabel='Fraction of number of claims'\n)\nax.legend(loc=\"upper left\")"
228228
]
229229
},
230230
{
Binary file not shown.
Binary file not shown.

dev/_downloads/f686bae9e47a0517ddbf86ced97151b6/plot_poisson_regression_non_normal_loss.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -393,11 +393,11 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
393393
#
394394
# To compare the 3 models within this perspective, one can plot the fraction of
395395
# the number of claims vs the fraction of exposure for test samples ordered by
396-
# the model predictions, from riskiest to safest according to each model:
396+
# the model predictions, from safest to riskiest according to each model:
397397

398398

399399
def _cumulated_claims(y_true, y_pred, exposure):
400-
idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest
400+
idx_sort = np.argsort(y_pred) # from safest to riskiest
401401
sorted_exposure = exposure[idx_sort]
402402
sorted_frequencies = y_true[idx_sort]
403403
cumulated_exposure = np.cumsum(sorted_exposure)
@@ -434,10 +434,10 @@ def _cumulated_claims(y_true, y_pred, exposure):
434434
label="Random baseline")
435435
ax.set(
436436
title="Cumulated number of claims by model",
437-
xlabel='Fraction of exposure (from riskiest to safest)',
437+
xlabel='Fraction of exposure (from safest to riskiest)',
438438
ylabel='Fraction of number of claims'
439439
)
440-
ax.legend(loc="lower right")
440+
ax.legend(loc="upper left")
441441

442442
##############################################################################
443443
# This plot reveals that the random forest model is slightly better at ranking

dev/_downloads/scikit-learn-docs.pdf

-19.2 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes
101 Bytes
101 Bytes
549 Bytes

0 commit comments

Comments
 (0)