
Commit f019c80

Pushing the docs to dev/ for branch: master, commit 0e332293511a5b952039ad12defe626dfb023b67
1 parent: e681c60

File tree

1,221 files changed (+3689 / -3665 lines)


dev/_downloads/86c888008757148890daaf43d664fa71/plot_tweedie_regression_insurance_claims.py

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ def load_mtpl2(n_samples=100000):
     df["ClaimAmount"].fillna(0, inplace=True)

     # unquote string fields
-    for column_name in df.columns[df.dtypes.values == np.object]:
+    for column_name in df.columns[df.dtypes.values == object]:
         df[column_name] = df[column_name].str.strip("'")
     return df.iloc[:n_samples]
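The change itself is a NumPy cleanup: np.object was a deprecated alias for the builtin object (it emits a DeprecationWarning from NumPy 1.20 and was removed in NumPy 1.24), and comparing column dtypes against object selects the same string columns. A minimal sketch of the updated selection, using a hypothetical two-column frame rather than the real freMTPL2 data:

import pandas as pd

# Hypothetical stand-in for the freMTPL2 frame: one quoted string column
# and one numeric column.
df = pd.DataFrame({"Region": ["'R11'", "'R24'"], "Exposure": [0.5, 0.8]})

# The builtin `object` compares equal to the dtype of string columns, so the
# loop strips the surrounding quotes only from those columns.
for column_name in df.columns[df.dtypes.values == object]:
    df[column_name] = df[column_name].str.strip("'")

# A pandas-native alternative doing the same selection:
# for column_name in df.select_dtypes(include="object").columns:
#     df[column_name] = df[column_name].str.strip("'")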

dev/_downloads/a97bf662e52d471b04e1ab480c0ad7f2/plot_tweedie_regression_insurance_claims.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
-
"print(__doc__)\n\n# Authors: Christian Lorentzen <[email protected]>\n# Roman Yurchak <[email protected]>\n# Olivier Grisel <[email protected]>\n# License: BSD 3 clause\nfrom functools import partial\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.linear_model import PoissonRegressor, GammaRegressor\nfrom sklearn.linear_model import TweedieRegressor\nfrom sklearn.metrics import mean_tweedie_deviance\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import FunctionTransformer, OneHotEncoder\nfrom sklearn.preprocessing import StandardScaler, KBinsDiscretizer\n\nfrom sklearn.metrics import mean_absolute_error, mean_squared_error, auc\n\n\ndef load_mtpl2(n_samples=100000):\n \"\"\"Fetch the French Motor Third-Party Liability Claims dataset.\n\n Parameters\n ----------\n n_samples: int, default=100000\n number of samples to select (for faster run time). Full dataset has\n 678013 samples.\n \"\"\"\n # freMTPL2freq dataset from https://www.openml.org/d/41214\n df_freq = fetch_openml(data_id=41214, as_frame=True)['data']\n df_freq['IDpol'] = df_freq['IDpol'].astype(int)\n df_freq.set_index('IDpol', inplace=True)\n\n # freMTPL2sev dataset from https://www.openml.org/d/41215\n df_sev = fetch_openml(data_id=41215, as_frame=True)['data']\n\n # sum ClaimAmount over identical IDs\n df_sev = df_sev.groupby('IDpol').sum()\n\n df = df_freq.join(df_sev, how=\"left\")\n df[\"ClaimAmount\"].fillna(0, inplace=True)\n\n # unquote string fields\n for column_name in df.columns[df.dtypes.values == np.object]:\n df[column_name] = df[column_name].str.strip(\"'\")\n return df.iloc[:n_samples]\n\n\ndef plot_obs_pred(df, feature, weight, observed, predicted, y_label=None,\n title=None, ax=None, fill_legend=False):\n \"\"\"Plot observed and predicted - aggregated per feature level.\n\n Parameters\n ----------\n df : DataFrame\n input data\n feature: str\n a column name of df for the feature to be plotted\n weight : str\n column name of df with the values of weights or exposure\n observed : str\n a column name of df with the observed target\n predicted : DataFrame\n a dataframe, with the same index as df, with the predicted target\n fill_legend : bool, default=False\n whether to show fill_between legend\n \"\"\"\n # aggregate observed and predicted variables by feature level\n df_ = df.loc[:, [feature, weight]].copy()\n df_[\"observed\"] = df[observed] * df[weight]\n df_[\"predicted\"] = predicted * df[weight]\n df_ = (\n df_.groupby([feature])[[weight, \"observed\", \"predicted\"]]\n .sum()\n .assign(observed=lambda x: x[\"observed\"] / x[weight])\n .assign(predicted=lambda x: x[\"predicted\"] / x[weight])\n )\n\n ax = df_.loc[:, [\"observed\", \"predicted\"]].plot(style=\".\", ax=ax)\n y_max = df_.loc[:, [\"observed\", \"predicted\"]].values.max() * 0.8\n p2 = ax.fill_between(\n df_.index,\n 0,\n y_max * df_[weight] / df_[weight].values.max(),\n color=\"g\",\n alpha=0.1,\n )\n if fill_legend:\n ax.legend([p2], [\"{} distribution\".format(feature)])\n ax.set(\n ylabel=y_label if y_label is not None else None,\n title=title if title is not None else \"Train: Observed vs Predicted\",\n )\n\n\ndef score_estimator(\n estimator, X_train, X_test, df_train, df_test, target, weights,\n tweedie_powers=None,\n):\n \"\"\"Evaluate an estimator on train and test sets with different metrics\"\"\"\n\n metrics = [\n (\"D\u00b2 
explained\", None), # Use default scorer if it exists\n (\"mean abs. error\", mean_absolute_error),\n (\"mean squared error\", mean_squared_error),\n ]\n if tweedie_powers:\n metrics += [(\n \"mean Tweedie dev p={:.4f}\".format(power),\n partial(mean_tweedie_deviance, power=power)\n ) for power in tweedie_powers]\n\n res = []\n for subset_label, X, df in [\n (\"train\", X_train, df_train),\n (\"test\", X_test, df_test),\n ]:\n y, _weights = df[target], df[weights]\n for score_label, metric in metrics:\n if isinstance(estimator, tuple) and len(estimator) == 2:\n # Score the model consisting of the product of frequency and\n # severity models.\n est_freq, est_sev = estimator\n y_pred = est_freq.predict(X) * est_sev.predict(X)\n else:\n y_pred = estimator.predict(X)\n\n if metric is None:\n if not hasattr(estimator, \"score\"):\n continue\n score = estimator.score(X, y, sample_weight=_weights)\n else:\n score = metric(y, y_pred, sample_weight=_weights)\n\n res.append(\n {\"subset\": subset_label, \"metric\": score_label, \"score\": score}\n )\n\n res = (\n pd.DataFrame(res)\n .set_index([\"metric\", \"subset\"])\n .score.unstack(-1)\n .round(4)\n .loc[:, ['train', 'test']]\n )\n return res"
+
"print(__doc__)\n\n# Authors: Christian Lorentzen <[email protected]>\n# Roman Yurchak <[email protected]>\n# Olivier Grisel <[email protected]>\n# License: BSD 3 clause\nfrom functools import partial\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.linear_model import PoissonRegressor, GammaRegressor\nfrom sklearn.linear_model import TweedieRegressor\nfrom sklearn.metrics import mean_tweedie_deviance\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import FunctionTransformer, OneHotEncoder\nfrom sklearn.preprocessing import StandardScaler, KBinsDiscretizer\n\nfrom sklearn.metrics import mean_absolute_error, mean_squared_error, auc\n\n\ndef load_mtpl2(n_samples=100000):\n \"\"\"Fetch the French Motor Third-Party Liability Claims dataset.\n\n Parameters\n ----------\n n_samples: int, default=100000\n number of samples to select (for faster run time). Full dataset has\n 678013 samples.\n \"\"\"\n # freMTPL2freq dataset from https://www.openml.org/d/41214\n df_freq = fetch_openml(data_id=41214, as_frame=True)['data']\n df_freq['IDpol'] = df_freq['IDpol'].astype(int)\n df_freq.set_index('IDpol', inplace=True)\n\n # freMTPL2sev dataset from https://www.openml.org/d/41215\n df_sev = fetch_openml(data_id=41215, as_frame=True)['data']\n\n # sum ClaimAmount over identical IDs\n df_sev = df_sev.groupby('IDpol').sum()\n\n df = df_freq.join(df_sev, how=\"left\")\n df[\"ClaimAmount\"].fillna(0, inplace=True)\n\n # unquote string fields\n for column_name in df.columns[df.dtypes.values == object]:\n df[column_name] = df[column_name].str.strip(\"'\")\n return df.iloc[:n_samples]\n\n\ndef plot_obs_pred(df, feature, weight, observed, predicted, y_label=None,\n title=None, ax=None, fill_legend=False):\n \"\"\"Plot observed and predicted - aggregated per feature level.\n\n Parameters\n ----------\n df : DataFrame\n input data\n feature: str\n a column name of df for the feature to be plotted\n weight : str\n column name of df with the values of weights or exposure\n observed : str\n a column name of df with the observed target\n predicted : DataFrame\n a dataframe, with the same index as df, with the predicted target\n fill_legend : bool, default=False\n whether to show fill_between legend\n \"\"\"\n # aggregate observed and predicted variables by feature level\n df_ = df.loc[:, [feature, weight]].copy()\n df_[\"observed\"] = df[observed] * df[weight]\n df_[\"predicted\"] = predicted * df[weight]\n df_ = (\n df_.groupby([feature])[[weight, \"observed\", \"predicted\"]]\n .sum()\n .assign(observed=lambda x: x[\"observed\"] / x[weight])\n .assign(predicted=lambda x: x[\"predicted\"] / x[weight])\n )\n\n ax = df_.loc[:, [\"observed\", \"predicted\"]].plot(style=\".\", ax=ax)\n y_max = df_.loc[:, [\"observed\", \"predicted\"]].values.max() * 0.8\n p2 = ax.fill_between(\n df_.index,\n 0,\n y_max * df_[weight] / df_[weight].values.max(),\n color=\"g\",\n alpha=0.1,\n )\n if fill_legend:\n ax.legend([p2], [\"{} distribution\".format(feature)])\n ax.set(\n ylabel=y_label if y_label is not None else None,\n title=title if title is not None else \"Train: Observed vs Predicted\",\n )\n\n\ndef score_estimator(\n estimator, X_train, X_test, df_train, df_test, target, weights,\n tweedie_powers=None,\n):\n \"\"\"Evaluate an estimator on train and test sets with different metrics\"\"\"\n\n metrics = [\n (\"D\u00b2 
explained\", None), # Use default scorer if it exists\n (\"mean abs. error\", mean_absolute_error),\n (\"mean squared error\", mean_squared_error),\n ]\n if tweedie_powers:\n metrics += [(\n \"mean Tweedie dev p={:.4f}\".format(power),\n partial(mean_tweedie_deviance, power=power)\n ) for power in tweedie_powers]\n\n res = []\n for subset_label, X, df in [\n (\"train\", X_train, df_train),\n (\"test\", X_test, df_test),\n ]:\n y, _weights = df[target], df[weights]\n for score_label, metric in metrics:\n if isinstance(estimator, tuple) and len(estimator) == 2:\n # Score the model consisting of the product of frequency and\n # severity models.\n est_freq, est_sev = estimator\n y_pred = est_freq.predict(X) * est_sev.predict(X)\n else:\n y_pred = estimator.predict(X)\n\n if metric is None:\n if not hasattr(estimator, \"score\"):\n continue\n score = estimator.score(X, y, sample_weight=_weights)\n else:\n score = metric(y, y_pred, sample_weight=_weights)\n\n res.append(\n {\"subset\": subset_label, \"metric\": score_label, \"score\": score}\n )\n\n res = (\n pd.DataFrame(res)\n .set_index([\"metric\", \"subset\"])\n .score.unstack(-1)\n .round(4)\n .loc[:, ['train', 'test']]\n )\n return res"
]
},
{

dev/_downloads/scikit-learn-docs.pdf

-3.77 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes
