Skip to content

Commit 1ff69df

Browse files
committed
Pushing the docs to dev/ for branch: main, commit 85787fcb56cf6b04115cb8fa5f48ba85a8bd6e06
1 parent b6632e0 commit 1ff69df

File tree

1,227 files changed

+4434
-4374
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,227 files changed

+4434
-4374
lines changed
Binary file not shown.
Binary file not shown.

dev/_downloads/86c888008757148890daaf43d664fa71/plot_tweedie_regression_insurance_claims.py

Lines changed: 41 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
<http://dx.doi.org/10.2139/ssrn.3164764>`_
3838
3939
"""
40-
40+
# %%
4141
# Authors: Christian Lorentzen <[email protected]>
4242
# Roman Yurchak <[email protected]>
4343
# Olivier Grisel <[email protected]>
@@ -50,16 +50,9 @@
5050
import pandas as pd
5151

5252
from sklearn.datasets import fetch_openml
53-
from sklearn.compose import ColumnTransformer
54-
from sklearn.linear_model import PoissonRegressor, GammaRegressor
55-
from sklearn.linear_model import TweedieRegressor
5653
from sklearn.metrics import mean_tweedie_deviance
57-
from sklearn.model_selection import train_test_split
58-
from sklearn.pipeline import make_pipeline
59-
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
60-
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
61-
62-
from sklearn.metrics import mean_absolute_error, mean_squared_error, auc
54+
from sklearn.metrics import mean_absolute_error
55+
from sklearn.metrics import mean_squared_error
6356

6457

6558
def load_mtpl2(n_samples=100000):
@@ -215,6 +208,11 @@ def score_estimator(
215208
# containing the number of claims (``ClaimNb``), with the freMTPL2sev table,
216209
# containing the claim amount (``ClaimAmount``) for the same policy ids
217210
# (``IDpol``).
211+
from sklearn.pipeline import make_pipeline
212+
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
213+
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
214+
from sklearn.compose import ColumnTransformer
215+
218216

219217
df = load_mtpl2(n_samples=60000)
220218

@@ -271,6 +269,9 @@ def score_estimator(
271269
# constant rate in a given time interval (``Exposure``, in units of years).
272270
# Here we model the frequency ``y = ClaimNb / Exposure``, which is still a
273271
# (scaled) Poisson distribution, and use ``Exposure`` as `sample_weight`.
272+
from sklearn.model_selection import train_test_split
273+
from sklearn.linear_model import PoissonRegressor
274+
274275

275276
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)
276277

@@ -366,6 +367,8 @@ def score_estimator(
366367
# on :math:`(0, \infty)`, not :math:`[0, \infty)`.
367368
# - We use ``ClaimNb`` as `sample_weight` to account for policies that contain
368369
# more than one claim.
370+
from sklearn.linear_model import GammaRegressor
371+
369372

370373
mask_train = df_train["ClaimAmount"] > 0
371374
mask_test = df_test["ClaimAmount"] > 0
@@ -474,6 +477,8 @@ def score_estimator(
474477
# models side by side, i.e. we compare them at identical values of `power`.
475478
# Ideally, we hope that one model will be consistently better than the other,
476479
# regardless of `power`.
480+
from sklearn.linear_model import TweedieRegressor
481+
477482

478483
glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, max_iter=10000)
479484
glm_pure_premium.fit(
@@ -545,29 +550,37 @@ def score_estimator(
545550
print(pd.DataFrame(res).set_index("subset").T)
546551

547552
# %%
553+
#
548554
# Finally, we can compare the two models using a plot of cumulated claims: for
549-
# each model, the policyholders are ranked from safest to riskiest and the
550-
# fraction of observed total cumulated claims is plotted on the y axis. This
551-
# plot is often called the ordered Lorenz curve of the model.
555+
# each model, the policyholders are ranked from safest to riskiest based on the
556+
# model predictions and the fraction of observed total cumulated claims is
557+
# plotted on the y axis. This plot is often called the ordered Lorenz curve of
558+
# the model.
552559
#
553-
# The Gini coefficient (based on the area under the curve) can be used as a
554-
# model selection metric to quantify the ability of the model to rank
555-
# policyholders. Note that this metric does not reflect the ability of the
556-
# models to make accurate predictions in terms of absolute value of total
557-
# claim amounts but only in terms of relative amounts as a ranking metric.
560+
# The Gini coefficient (based on the area between the curve and the diagonal)
561+
# can be used as a model selection metric to quantify the ability of the model
562+
# to rank policyholders. Note that this metric does not reflect the ability of
563+
# the models to make accurate predictions in terms of absolute value of total
564+
# claim amounts but only in terms of relative amounts as a ranking metric. The
565+
# Gini coefficient is upper bounded by 1.0 but even an oracle model that ranks
566+
# the policyholders by the observed claim amounts cannot reach a score of 1.0.
558567
#
559-
# Both models are able to rank policyholders by risky-ness significantly
560-
# better than chance although they are also both far from perfect due to the
561-
# natural difficulty of the prediction problem from few features.
568+
# We observe that both models are able to rank policyholders by riskiness
569+
# significantly better than chance although they are also both far from the
570+
# oracle model due to the natural difficulty of the prediction problem from a
571+
# few features: most accidents are not predictable and can be caused by
572+
# environmental circumstances that are not described at all by the input
573+
# features of the models.
562574
#
563-
# Note that the Gini index only characterize the ranking performance of the
564-
# model but not its calibration: any monotonic transformation of the
565-
# predictions leaves the Gini index of the model unchanged.
575+
# Note that the Gini index only characterizes the ranking performance of the
576+
# model but not its calibration: any monotonic transformation of the predictions
577+
# leaves the Gini index of the model unchanged.
566578
#
567-
# Finally one should highlight that the Compound Poisson Gamma model that
568-
# is directly fit on the pure premium is operationally simpler to develop and
569-
# maintain as it consists in a single scikit-learn estimator instead of a
570-
# pair of models, each with its own set of hyperparameters.
579+
# Finally, one should highlight that the Compound Poisson Gamma model that is
580+
# directly fit on the pure premium is operationally simpler to develop and
581+
# maintain as it consists of a single scikit-learn estimator instead of a pair
582+
# of models, each with its own set of hyperparameters.
583+
from sklearn.metrics import auc
571584

572585

573586
def lorenz_curve(y_true, y_pred, exposure):

dev/_downloads/a97bf662e52d471b04e1ab480c0ad7f2/plot_tweedie_regression_insurance_claims.ipynb

Lines changed: 14 additions & 7 deletions
Large diffs are not rendered by default.

dev/_downloads/scikit-learn-docs.zip

-5.69 KB
Binary file not shown.
36 Bytes
-6 Bytes
162 Bytes
-2 Bytes
-1.17 KB

0 commit comments

Comments
 (0)