scikit-learn
diff --git a/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion b/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
111 Bytes b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
111 Bytes
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
112 Bytes b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
112 Bytes
diff --git a/‎dev/_downloads/86c888008757148890daaf43d664fa71/plot_tweedie_regression_insurance_claims.py
Lines changed: 4 additions & 3 deletions b/‎dev/_downloads/86c888008757148890daaf43d664fa71/plot_tweedie_regression_insurance_claims.py
Lines changed: 4 additions & 3 deletions
diff --git a/‎dev/_downloads/a97bf662e52d471b04e1ab480c0ad7f2/plot_tweedie_regression_insurance_claims.ipynb
Lines changed: 1 addition & 1 deletion b/‎dev/_downloads/a97bf662e52d471b04e1ab480c0ad7f2/plot_tweedie_regression_insurance_claims.ipynb
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: b8552c1840e582d8069bb272f008b1c5
+config: a8992306144a58818a4d64d1a9e413cf
 tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -222,15 +222,16 @@ def score_estimator(
 
 df = load_mtpl2()
 
-# Note: filter out claims with zero amount, as the severity model
-# requires strictly positive target values.
-df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0
 
 # Correct for unreasonable observations (that might be data error)
 # and a few exceptionally large claim amounts
 df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
 df["Exposure"] = df["Exposure"].clip(upper=1)
 df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)
+# If the claim amount is 0, then we do not count it as a claim. The loss function
+# used by the severity model needs strictly positive claim amounts. This way
+# frequency and severity are more consistent with each other.
+df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0
 
 log_scale_transformer = make_pipeline(
     FunctionTransformer(func=np.log), StandardScaler()
 
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "from sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import (\n    FunctionTransformer,\n    KBinsDiscretizer,\n    OneHotEncoder,\n    StandardScaler,\n)\n\ndf = load_mtpl2()\n\n# Note: filter out claims with zero amount, as the severity model\n# requires strictly positive target values.\ndf.loc[(df[\"ClaimAmount\"] == 0) & (df[\"ClaimNb\"] >= 1), \"ClaimNb\"] = 0\n\n# Correct for unreasonable observations (that might be data error)\n# and a few exceptionally large claim amounts\ndf[\"ClaimNb\"] = df[\"ClaimNb\"].clip(upper=4)\ndf[\"Exposure\"] = df[\"Exposure\"].clip(upper=1)\ndf[\"ClaimAmount\"] = df[\"ClaimAmount\"].clip(upper=200000)\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(func=np.log), StandardScaler()\n)\n\ncolumn_trans = ColumnTransformer(\n    [\n        (\n            \"binned_numeric\",\n            KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),\n            [\"VehAge\", \"DrivAge\"],\n        ),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n    ],\n    remainder=\"drop\",\n)\nX = column_trans.fit_transform(df)\n\n# Insurances companies are interested in modeling the Pure Premium, that is\n# the expected total claim amount per unit of exposure for each policyholder\n# in their portfolio:\ndf[\"PurePremium\"] = df[\"ClaimAmount\"] / df[\"Exposure\"]\n\n# This can be indirectly approximated by a 2-step modeling: the product of the\n# Frequency times the average claim amount per claim:\ndf[\"Frequency\"] = df[\"ClaimNb\"] / df[\"Exposure\"]\ndf[\"AvgClaimAmount\"] = df[\"ClaimAmount\"] / np.fmax(df[\"ClaimNb\"], 1)\n\nwith pd.option_context(\"display.max_columns\", 15):\n    print(df[df.ClaimAmount > 0].head())"
+        "from sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import (\n    FunctionTransformer,\n    KBinsDiscretizer,\n    OneHotEncoder,\n    StandardScaler,\n)\n\ndf = load_mtpl2()\n\n\n# Correct for unreasonable observations (that might be data error)\n# and a few exceptionally large claim amounts\ndf[\"ClaimNb\"] = df[\"ClaimNb\"].clip(upper=4)\ndf[\"Exposure\"] = df[\"Exposure\"].clip(upper=1)\ndf[\"ClaimAmount\"] = df[\"ClaimAmount\"].clip(upper=200000)\n# If the claim amount is 0, then we do not count it as a claim. The loss function\n# used by the severity model needs strictly positive claim amounts. This way\n# frequency and severity are more consistent with each other.\ndf.loc[(df[\"ClaimAmount\"] == 0) & (df[\"ClaimNb\"] >= 1), \"ClaimNb\"] = 0\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(func=np.log), StandardScaler()\n)\n\ncolumn_trans = ColumnTransformer(\n    [\n        (\n            \"binned_numeric\",\n            KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),\n            [\"VehAge\", \"DrivAge\"],\n        ),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n    ],\n    remainder=\"drop\",\n)\nX = column_trans.fit_transform(df)\n\n# Insurances companies are interested in modeling the Pure Premium, that is\n# the expected total claim amount per unit of exposure for each policyholder\n# in their portfolio:\ndf[\"PurePremium\"] = df[\"ClaimAmount\"] / df[\"Exposure\"]\n\n# This can be indirectly approximated by a 2-step modeling: the product of the\n# Frequency times the average claim amount per claim:\ndf[\"Frequency\"] = df[\"ClaimNb\"] / df[\"Exposure\"]\ndf[\"AvgClaimAmount\"] = df[\"ClaimAmount\"] / np.fmax(df[\"ClaimNb\"], 1)\n\nwith pd.option_context(\"display.max_columns\", 15):\n    print(df[df.ClaimAmount > 0].head())"
       ]
     },
     {
Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@`
`44`	`44`	`},`
`45`	`45`	`"outputs": [],`
`46`	`46`	`"source": [`
`47`		- "from sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import (\n FunctionTransformer,\n KBinsDiscretizer,\n OneHotEncoder,\n StandardScaler,\n)\n\ndf = load_mtpl2()\n\n# Note: filter out claims with zero amount, as the severity model\n# requires strictly positive target values.\ndf.loc[(df[\"ClaimAmount\"] == 0) & (df[\"ClaimNb\"] >= 1), \"ClaimNb\"] = 0\n\n# Correct for unreasonable observations (that might be data error)\n# and a few exceptionally large claim amounts\ndf[\"ClaimNb\"] = df[\"ClaimNb\"].clip(upper=4)\ndf[\"Exposure\"] = df[\"Exposure\"].clip(upper=1)\ndf[\"ClaimAmount\"] = df[\"ClaimAmount\"].clip(upper=200000)\n\nlog_scale_transformer = make_pipeline(\n FunctionTransformer(func=np.log), StandardScaler()\n)\n\ncolumn_trans = ColumnTransformer(\n [\n (\n \"binned_numeric\",\n KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),\n [\"VehAge\", \"DrivAge\"],\n ),\n (\n \"onehot_categorical\",\n OneHotEncoder(),\n [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n ),\n (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n ],\n remainder=\"drop\",\n)\nX = column_trans.fit_transform(df)\n\n# Insurances companies are interested in modeling the Pure Premium, that is\n# the expected total claim amount per unit of exposure for each policyholder\n# in their portfolio:\ndf[\"PurePremium\"] = df[\"ClaimAmount\"] / df[\"Exposure\"]\n\n# This can be indirectly approximated by a 2-step modeling: the product of the\n# Frequency times the average claim amount per claim:\ndf[\"Frequency\"] = df[\"ClaimNb\"] / df[\"Exposure\"]\ndf[\"AvgClaimAmount\"] = df[\"ClaimAmount\"] / np.fmax(df[\"ClaimNb\"], 1)\n\nwith pd.option_context(\"display.max_columns\", 15):\n print(df[df.ClaimAmount > 0].head())"
	`47`	+ "from sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import (\n FunctionTransformer,\n KBinsDiscretizer,\n OneHotEncoder,\n StandardScaler,\n)\n\ndf = load_mtpl2()\n\n\n# Correct for unreasonable observations (that might be data error)\n# and a few exceptionally large claim amounts\ndf[\"ClaimNb\"] = df[\"ClaimNb\"].clip(upper=4)\ndf[\"Exposure\"] = df[\"Exposure\"].clip(upper=1)\ndf[\"ClaimAmount\"] = df[\"ClaimAmount\"].clip(upper=200000)\n# If the claim amount is 0, then we do not count it as a claim. The loss function\n# used by the severity model needs strictly positive claim amounts. This way\n# frequency and severity are more consistent with each other.\ndf.loc[(df[\"ClaimAmount\"] == 0) & (df[\"ClaimNb\"] >= 1), \"ClaimNb\"] = 0\n\nlog_scale_transformer = make_pipeline(\n FunctionTransformer(func=np.log), StandardScaler()\n)\n\ncolumn_trans = ColumnTransformer(\n [\n (\n \"binned_numeric\",\n KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),\n [\"VehAge\", \"DrivAge\"],\n ),\n (\n \"onehot_categorical\",\n OneHotEncoder(),\n [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n ),\n (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n ],\n remainder=\"drop\",\n)\nX = column_trans.fit_transform(df)\n\n# Insurances companies are interested in modeling the Pure Premium, that is\n# the expected total claim amount per unit of exposure for each policyholder\n# in their portfolio:\ndf[\"PurePremium\"] = df[\"ClaimAmount\"] / df[\"Exposure\"]\n\n# This can be indirectly approximated by a 2-step modeling: the product of the\n# Frequency times the average claim amount per claim:\ndf[\"Frequency\"] = df[\"ClaimNb\"] / df[\"Exposure\"]\ndf[\"AvgClaimAmount\"] = df[\"ClaimAmount\"] / np.fmax(df[\"ClaimNb\"], 1)\n\nwith pd.option_context(\"display.max_columns\", 15):\n print(df[df.ClaimAmount > 0].head())"
`48`	`48`	`]`
`49`	`49`	`},`
`50`	`50`	`{`