Commit e191e4c

Pushing the docs for revision for branch: master, commit d8379986cd594773a94a2ea9e2d4a5fa77d4843f
1 parent 9250145 commit e191e4c

901 files changed: +3086 / -2991 lines


dev/_downloads/missing_values.py

Lines changed: 18 additions & 3 deletions
@@ -8,16 +8,22 @@
 Imputing does not always improve the predictions, so please check via cross-validation.
 Sometimes dropping rows or using marker values is more effective.
 
+In this example, we artificially mark some elements of the complete dataset
+as missing. We then estimate performance on the complete dataset, on the
+dataset without the missing samples, after imputation without the indicator
+matrix, and after imputation with the indicator matrix for the missing values.
+
 Missing values can be replaced by the mean, the median or the most frequent
 value using the ``strategy`` hyper-parameter.
 The median is a more robust estimator for data with high magnitude variables
 which could dominate results (otherwise known as a 'long tail').
 
 Script output::
 
-  Score with the entire dataset = 0.56
+  Score with the complete dataset = 0.56
   Score without the samples containing missing values = 0.48
   Score after imputation of the missing values = 0.55
+  Score after imputation with indicator features = 0.57
 
 In this case, imputing helps the classifier get close to the original score.
@@ -40,11 +46,11 @@
 # Estimate the score on the entire dataset, with no missing values
 estimator = RandomForestRegressor(random_state=0, n_estimators=100)
 score = cross_val_score(estimator, X_full, y_full).mean()
-print("Score with the entire dataset = %.2f" % score)
+print("Score with the complete dataset = %.2f" % score)
 
 # Add missing values in 75% of the lines
 missing_rate = 0.75
-n_missing_samples = np.floor(n_samples * missing_rate)
+n_missing_samples = int(n_samples * missing_rate)
 missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                       dtype=np.bool),
                              np.ones(n_missing_samples,
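The `np.floor` to `int` change in the hunk above matters because `np.floor` returns a NumPy float, not an integer, and a float cannot be used as an array length. A minimal NumPy-only illustration (note that `np.bool` from the diff is deprecated in modern NumPy, so plain `bool` is used here):

```python
import numpy as np

n_samples = 10
missing_rate = 0.75

# np.floor returns a float (numpy.float64), not an integer
n_float = np.floor(n_samples * missing_rate)
assert isinstance(n_float, np.floating)

# Modern NumPy rejects float sizes, so the explicit int cast is required
n_missing_samples = int(n_samples * missing_rate)
mask = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=bool),
                  np.ones(n_missing_samples, dtype=bool)))
print(mask.sum())  # number of rows marked as missing
```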
@@ -70,3 +76,12 @@
                                n_estimators=100))])
 score = cross_val_score(estimator, X_missing, y_missing).mean()
 print("Score after imputation of the missing values = %.2f" % score)
+
+# Estimate score after imputation of the missing values with indicator matrix
+estimator = Pipeline([("imputer", Imputer(missing_values=0,
+                                          strategy="mean",
+                                          axis=0, add_indicator_features=True)),
+                      ("forest", RandomForestRegressor(random_state=0,
+                                                       n_estimators=100))])
+score = cross_val_score(estimator, X_missing, y_missing).mean()
+print("Score after imputation with indicator features = %.2f" % score)
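The `add_indicator_features` option in the hunk above appends a binary mask to the imputed data so the downstream model can tell which values were filled in (in modern scikit-learn this corresponds to `SimpleImputer(add_indicator=True)`). A NumPy-only sketch of the idea, assuming, as in the diff, that 0 encodes a missing value; the toy matrix `X` is illustrative and not from the original example:

```python
import numpy as np

# Toy matrix where 0 encodes "missing", as in the example above
X = np.array([[1.0, 0.0],
              [3.0, 4.0],
              [0.0, 6.0]])

missing = (X == 0)  # indicator matrix of missing entries

# Mean-impute each column from its observed (non-missing) values
X_imputed = X.copy()
for j in range(X.shape[1]):
    col_mean = X[~missing[:, j], j].mean()
    X_imputed[missing[:, j], j] = col_mean

# Append the indicator columns so the model can see which values were imputed
X_with_indicator = np.hstack((X_imputed, missing.astype(float)))
print(X_with_indicator.shape)
```

With the indicator columns attached, the regressor can learn a different response for imputed entries, which is why the indicator variant scores slightly higher (0.57 vs 0.55) in the script output above.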