@@ -37,7 +37,7 @@
 <http://dx.doi.org/10.2139/ssrn.3164764>`_
 
 """
-
+# %%
 # Authors: Christian Lorentzen <[email protected]>
 #          Roman Yurchak <[email protected]>
 #          Olivier Grisel <[email protected]>
@@ -50,16 +50,9 @@
 import pandas as pd
 
 from sklearn.datasets import fetch_openml
-from sklearn.compose import ColumnTransformer
-from sklearn.linear_model import PoissonRegressor, GammaRegressor
-from sklearn.linear_model import TweedieRegressor
 from sklearn.metrics import mean_tweedie_deviance
-from sklearn.model_selection import train_test_split
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
-from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
-
-from sklearn.metrics import mean_absolute_error, mean_squared_error, auc
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import mean_squared_error
 
 
 def load_mtpl2(n_samples=100000):
@@ -215,6 +208,11 @@ def score_estimator(
 # containing the number of claims (``ClaimNb``), with the freMTPL2sev table,
 # containing the claim amount (``ClaimAmount``) for the same policy ids
 # (``IDpol``).
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
+from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
+from sklearn.compose import ColumnTransformer
+
 
 df = load_mtpl2(n_samples=60000)
 
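The join this hunk's comment describes can be sketched as follows. This is only an illustration of the idea behind the example's own load_mtpl2 helper, assuming the OpenML dataset ids 41214 (freMTPL2freq) and 41215 (freMTPL2sev) published on openml.org for these tables:

    from sklearn.datasets import fetch_openml

    # Frequency table: one row per policy, indexed by policy id.
    df_freq = fetch_openml(data_id=41214, as_frame=True).data
    df_freq = df_freq.set_index("IDpol")

    # Severity table: one row per claim; sum the amounts per policy before
    # joining, and treat policies without any claim as a zero total amount.
    df_sev = fetch_openml(data_id=41215, as_frame=True).data
    df = df_freq.join(df_sev.groupby("IDpol").sum(), how="left")
    df["ClaimAmount"] = df["ClaimAmount"].fillna(0)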
@@ -271,6 +269,9 @@ def score_estimator(
 # constant rate in a given time interval (``Exposure``, in units of years).
 # Here we model the frequency ``y = ClaimNb / Exposure``, which is still a
 # (scaled) Poisson distribution, and use ``Exposure`` as `sample_weight`.
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import PoissonRegressor
+
 
 df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)
 
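A minimal sketch of the frequency fit this section builds toward, assuming the df_train and X_train objects created above (the regularization strength alpha is illustrative, not the value tuned in the example):

    from sklearn.linear_model import PoissonRegressor

    # Claim frequency is a rate: claims observed per unit of exposure time.
    df_train["Frequency"] = df_train["ClaimNb"] / df_train["Exposure"]

    # Weighting by Exposure tells the Poisson GLM how long each policy was
    # observed, so short policies count for less than full policy-years.
    glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
    glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"])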
@@ -366,6 +367,8 @@ def score_estimator(
 # on :math:`(0, \infty)`, not :math:`[0, \infty)`.
 # - We use ``ClaimNb`` as `sample_weight` to account for policies that contain
 #   more than one claim.
+from sklearn.linear_model import GammaRegressor
+
 
 mask_train = df_train["ClaimAmount"] > 0
 mask_test = df_test["ClaimAmount"] > 0
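The severity fit can be sketched like this, given the masks above: the target is the average amount per claim, each row is weighted by its number of claims, and only rows with a positive claim amount are used (alpha again illustrative):

    import numpy as np
    from sklearn.linear_model import GammaRegressor

    # Average cost per claim; np.fmax avoids dividing by zero for rows that
    # will be masked out anyway.
    df_train["AvgClaimAmount"] = df_train["ClaimAmount"] / np.fmax(
        df_train["ClaimNb"], 1
    )

    # Fit only on policies with at least one claim: the Gamma distribution
    # has support on (0, inf), so zero amounts must be excluded.
    glm_sev = GammaRegressor(alpha=10.0, max_iter=300)
    glm_sev.fit(
        X_train[mask_train.values],
        df_train.loc[mask_train, "AvgClaimAmount"],
        sample_weight=df_train.loc[mask_train, "ClaimNb"],
    )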
@@ -474,6 +477,8 @@ def score_estimator(
 # models side by side, i.e. we compare them at identical values of `power`.
 # Ideally, we hope that one model will be consistently better than the other,
 # regardless of `power`.
+from sklearn.linear_model import TweedieRegressor
+
 
 glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, max_iter=10000)
 glm_pure_premium.fit(
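The side-by-side comparison described above can be sketched as follows, assuming a PurePremium column (ClaimAmount / Exposure) and the fitted glm_freq, glm_sev and glm_pure_premium estimators:

    from sklearn.metrics import mean_tweedie_deviance

    # The product model predicts the pure premium as frequency times severity.
    y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)
    y_pred_tweedie = glm_pure_premium.predict(X_test)

    # Score both predictions with the same Tweedie deviance at several powers,
    # since each model would otherwise be favored by its own training criterion.
    for power in [1.5, 1.7, 1.8, 1.9, 1.99]:
        for name, y_pred in [("product", y_pred_product), ("tweedie", y_pred_tweedie)]:
            dev = mean_tweedie_deviance(
                df_test["PurePremium"],
                y_pred,
                sample_weight=df_test["Exposure"],
                power=power,
            )
            print(f"power={power} {name}: mean Tweedie deviance={dev:.4g}")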
@@ -545,29 +550,37 @@ def score_estimator(
 print(pd.DataFrame(res).set_index("subset").T)
 
 # %%
+#
 # Finally, we can compare the two models using a plot of cumulated claims: for
-# each model, the policyholders are ranked from safest to riskiest and the
-# fraction of observed total cumulated claims is plotted on the y axis. This
-# plot is often called the ordered Lorenz curve of the model.
+# each model, the policyholders are ranked from safest to riskiest based on the
+# model predictions and the fraction of observed total cumulated claims is
+# plotted on the y axis. This plot is often called the ordered Lorenz curve of
+# the model.
 #
-# The Gini coefficient (based on the area under the curve) can be used as a
-# model selection metric to quantify the ability of the model to rank
-# policyholders. Note that this metric does not reflect the ability of the
-# models to make accurate predictions in terms of absolute value of total
-# claim amounts but only in terms of relative amounts as a ranking metric.
+# The Gini coefficient (based on the area between the curve and the diagonal)
+# can be used as a model selection metric to quantify the ability of the model
+# to rank policyholders. Note that this metric does not reflect the ability of
+# the models to make accurate predictions in terms of absolute value of total
+# claim amounts but only in terms of relative amounts as a ranking metric. The
+# Gini coefficient is upper bounded by 1.0 but even an oracle model that ranks
+# the policyholders by the observed claim amounts cannot reach a score of 1.0.
 #
-# Both models are able to rank policyholders by risky-ness significantly
-# better than chance although they are also both far from perfect due to the
-# natural difficulty of the prediction problem from few features.
+# We observe that both models are able to rank policyholders by risky-ness
+# significantly better than chance although they are also both far from the
+# oracle model due to the natural difficulty of the prediction problem from a
+# few features: most accidents are not predictable and can be caused by
+# environmental circumstances that are not described at all by the input
+# features of the models.
 #
-# Note that the Gini index only characterize the ranking performance of the
-# model but not its calibration: any monotonic transformation of the
-# predictions leaves the Gini index of the model unchanged.
+# Note that the Gini index only characterizes the ranking performance of the
+# model but not its calibration: any monotonic transformation of the predictions
+# leaves the Gini index of the model unchanged.
 #
-# Finally one should highlight that the Compound Poisson Gamma model that
-# is directly fit on the pure premium is operationally simpler to develop and
-# maintain as it consists in a single scikit-learn estimator instead of a
-# pair of models, each with its own set of hyperparameters.
+# Finally one should highlight that the Compound Poisson Gamma model that is
+# directly fit on the pure premium is operationally simpler to develop and
+# maintain as it consists of a single scikit-learn estimator instead of a pair
+# of models, each with its own set of hyperparameters.
+from sklearn.metrics import auc
 
 
 def lorenz_curve(y_true, y_pred, exposure):
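The Gini coefficient described above can be computed from the ordered Lorenz curve with sklearn.metrics.auc. A minimal sketch, assuming lorenz_curve (whose body continues below this excerpt) returns the cumulated exposure share on the x axis and the cumulated claims share on the y axis:

    from sklearn.metrics import auc

    # Rank policyholders by one model's predicted risk, then measure how the
    # observed claims accumulate along that ranking.
    y_pred = glm_pure_premium.predict(X_test)
    cum_exposure, cum_claims = lorenz_curve(
        df_test["PurePremium"], y_pred, df_test["Exposure"]
    )

    # Area-based Gini: 0.0 for a random ranking, upper bounded by 1.0.
    gini = 1 - 2 * auc(cum_exposure, cum_claims)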