Commit 2971d75

Pushing the docs to dev/ for branch: master, commit b8d1226d69bc4930d73c83394d2fee22dfe7ce2c
1 parent d40fc04 commit 2971d75

File tree

1,044 files changed (+5,568 / -3,848 lines)

Binary file changed (6.41 KB): not shown.
Binary file changed (5.42 KB): not shown.
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Imputing missing values with variants of IterativeImputer\n\n\nThe :class:`sklearn.impute.IterativeImputer` class is very flexible - it can be\nused with a variety of estimators to do round-robin regression, treating every\nvariable as an output in turn.\n\nIn this example we compare some estimators for the purpose of missing feature\nimputation with :class:`sklearn.impute.IterativeImputer`::\n\n    :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression\n    :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression\n    :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R\n    :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN\n    imputation approaches\n\nOf particular interest is the ability of\n:class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a\npopular imputation package for R. In this example, we have chosen to use\n:class:`sklearn.ensemble.ExtraTreesRegressor` instead of\n:class:`sklearn.ensemble.RandomForestRegressor` (as in missForest) due to its\nincreased speed.\n\nNote that :class:`sklearn.neighbors.KNeighborsRegressor` is different from KNN\nimputation, which learns from samples with missing values by using a distance\nmetric that accounts for missing values, rather than imputing them.\n\nThe goal is to compare different estimators to see which one is best for the\n:class:`sklearn.impute.IterativeImputer`, using a\n:class:`sklearn.linear_model.BayesianRidge` estimator as the downstream\nregressor, on the California housing dataset with a single value randomly\nremoved from each row.\n\nFor this particular pattern of missing values we see that\n:class:`sklearn.ensemble.ExtraTreesRegressor` and\n:class:`sklearn.linear_model.BayesianRidge` give the best results.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\nfrom sklearn.datasets import fetch_california_housing\n# IterativeImputer is experimental and must be enabled explicitly\nfrom sklearn.experimental import enable_iterative_imputer  # noqa\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.impute import IterativeImputer\nfrom sklearn.linear_model import BayesianRidge\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.ensemble import ExtraTreesRegressor\nfrom sklearn.neighbors import KNeighborsRegressor\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.model_selection import cross_val_score\n\nN_SPLITS = 5\n\nrng = np.random.RandomState(0)\n\nX_full, y_full = fetch_california_housing(return_X_y=True)\nn_samples, n_features = X_full.shape\n\n# Estimate the score on the entire dataset, with no missing values\nbr_estimator = BayesianRidge()\nscore_full_data = pd.DataFrame(\n    cross_val_score(\n        br_estimator, X_full, y_full, scoring='neg_mean_squared_error',\n        cv=N_SPLITS\n    ),\n    columns=['Full Data']\n)\n\n# Add a single missing value to each row\nX_missing = X_full.copy()\ny_missing = y_full\nmissing_samples = np.arange(n_samples)\nmissing_features = rng.choice(n_features, n_samples, replace=True)\nX_missing[missing_samples, missing_features] = np.nan\n\n# Estimate the score after imputation (mean and median strategies)\nscore_simple_imputer = pd.DataFrame()\nfor strategy in ('mean', 'median'):\n    estimator = make_pipeline(\n        SimpleImputer(missing_values=np.nan, strategy=strategy),\n        br_estimator\n    )\n    score_simple_imputer[strategy] = cross_val_score(\n        estimator, X_missing, y_missing, scoring='neg_mean_squared_error',\n        cv=N_SPLITS\n    )\n\n# Estimate the score after iterative imputation of the missing values\n# with different estimators\nestimators = [\n    BayesianRidge(),\n    DecisionTreeRegressor(max_features='sqrt', random_state=0),\n    ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0),\n    KNeighborsRegressor(n_neighbors=15)\n]\nscore_iterative_imputer = pd.DataFrame()\nfor impute_estimator in estimators:\n    estimator = make_pipeline(\n        IterativeImputer(random_state=0, estimator=impute_estimator),\n        br_estimator\n    )\n    score_iterative_imputer[impute_estimator.__class__.__name__] = \\\n        cross_val_score(\n            estimator, X_missing, y_missing, scoring='neg_mean_squared_error',\n            cv=N_SPLITS\n        )\n\nscores = pd.concat(\n    [score_full_data, score_simple_imputer, score_iterative_imputer],\n    keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1\n)\n\n# Plot the California housing results\nfig, ax = plt.subplots(figsize=(13, 6))\nmeans = -scores.mean()\nerrors = scores.std()\nmeans.plot.barh(xerr=errors, ax=ax)\nax.set_title('California Housing Regression with Different Imputation Methods')\nax.set_xlabel('MSE (smaller is better)')\nax.set_yticks(np.arange(means.shape[0]))\nax.set_yticklabels([\" w/ \".join(label) for label in means.index.tolist()])\nplt.tight_layout(pad=1)\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.8"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
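A minimal, self-contained sketch (not part of this commit) of the round-robin behavior the example relies on: IterativeImputer regresses each feature with missing values on the remaining features in turn and fills the NaNs with the predictions. The ExtraTreesRegressor settings mirror the missForest-like configuration discussed above; the toy matrix X is invented for illustration.

import numpy as np

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Invented toy data: one missing entry in three of the four rows
X = np.array([[1.0, 2.0, np.nan],
              [3.0, np.nan, 6.0],
              [np.nan, 8.0, 9.0],
              [10.0, 11.0, 12.0]])

# Round-robin regression: each feature with NaNs is treated as the output
# in turn and predicted from the other features
imputer = IterativeImputer(
    estimator=ExtraTreesRegressor(n_estimators=10, random_state=0),
    max_iter=10,
    random_state=0,
)
print(imputer.fit_transform(X))  # NaNs replaced by regression predictions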
Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
"""
=========================================================
Imputing missing values with variants of IterativeImputer
=========================================================

The :class:`sklearn.impute.IterativeImputer` class is very flexible - it can be
used with a variety of estimators to do round-robin regression, treating every
variable as an output in turn.

In this example we compare some estimators for the purpose of missing feature
imputation with :class:`sklearn.impute.IterativeImputer`::

    :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression
    :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression
    :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R
    :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN
    imputation approaches

Of particular interest is the ability of
:class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a
popular imputation package for R. In this example, we have chosen to use
:class:`sklearn.ensemble.ExtraTreesRegressor` instead of
:class:`sklearn.ensemble.RandomForestRegressor` (as in missForest) due to its
increased speed.

Note that :class:`sklearn.neighbors.KNeighborsRegressor` is different from KNN
imputation, which learns from samples with missing values by using a distance
metric that accounts for missing values, rather than imputing them.

The goal is to compare different estimators to see which one is best for the
:class:`sklearn.impute.IterativeImputer`, using a
:class:`sklearn.linear_model.BayesianRidge` estimator as the downstream
regressor, on the California housing dataset with a single value randomly
removed from each row.

For this particular pattern of missing values we see that
:class:`sklearn.ensemble.ExtraTreesRegressor` and
:class:`sklearn.linear_model.BayesianRidge` give the best results.
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.datasets import fetch_california_housing
# IterativeImputer is experimental and must be enabled explicitly
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

N_SPLITS = 5

rng = np.random.RandomState(0)

X_full, y_full = fetch_california_housing(return_X_y=True)
n_samples, n_features = X_full.shape

# Estimate the score on the entire dataset, with no missing values
br_estimator = BayesianRidge()
score_full_data = pd.DataFrame(
    cross_val_score(
        br_estimator, X_full, y_full, scoring='neg_mean_squared_error',
        cv=N_SPLITS
    ),
    columns=['Full Data']
)

# Add a single missing value to each row
X_missing = X_full.copy()
y_missing = y_full
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, n_samples, replace=True)
X_missing[missing_samples, missing_features] = np.nan

# Estimate the score after imputation (mean and median strategies)
score_simple_imputer = pd.DataFrame()
for strategy in ('mean', 'median'):
    estimator = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy=strategy),
        br_estimator
    )
    score_simple_imputer[strategy] = cross_val_score(
        estimator, X_missing, y_missing, scoring='neg_mean_squared_error',
        cv=N_SPLITS
    )

# Estimate the score after iterative imputation of the missing values
# with different estimators
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]
score_iterative_imputer = pd.DataFrame()
for impute_estimator in estimators:
    estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=impute_estimator),
        br_estimator
    )
    score_iterative_imputer[impute_estimator.__class__.__name__] = \
        cross_val_score(
            estimator, X_missing, y_missing, scoring='neg_mean_squared_error',
            cv=N_SPLITS
        )

scores = pd.concat(
    [score_full_data, score_simple_imputer, score_iterative_imputer],
    keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1
)

# Plot the California housing results
fig, ax = plt.subplots(figsize=(13, 6))
means = -scores.mean()
errors = scores.std()
means.plot.barh(xerr=errors, ax=ax)
ax.set_title('California Housing Regression with Different Imputation Methods')
ax.set_xlabel('MSE (smaller is better)')
ax.set_yticks(np.arange(means.shape[0]))
ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()])
plt.tight_layout(pad=1)
plt.show()
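For contrast with the docstring's note on KNN imputation: a true KNN imputer fills each missing entry from nearest neighbors found with a distance metric that tolerates missing values, rather than fitting a regressor on imputed data. A sketch using sklearn.impute.KNNImputer, which was added to scikit-learn in 0.22 (after this commit) and uses the nan_euclidean metric; the toy matrix is invented for illustration.

import numpy as np

from sklearn.impute import KNNImputer

# Invented toy data
X = np.array([[1.0, 2.0, np.nan],
              [3.0, 4.0, 3.0],
              [np.nan, 6.0, 5.0],
              [8.0, 8.0, 7.0]])

# Row distances use nan_euclidean, which skips missing coordinates; each
# NaN is filled with the mean of that feature over the 2 nearest neighbors
imputer = KNNImputer(n_neighbors=2)
print(imputer.fit_transform(X))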

0 commit comments