Commit 8042ad0

Pushing the docs for revision for branch: master, commit 6a2b4f7e7b46785ef9b18dcc9410a338ae916b47
1 parent c7cb566 commit 8042ad0

File tree

880 files changed: +2584 -2677 lines changed


dev/_downloads/missing_values.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
  },
  {
   "source": [
-   "\n# Imputing missing values before building an estimator\n\n\nThis example shows that imputing the missing values can give better results\nthan discarding the samples containing any missing value.\nImputing does not always improve the predictions, so please check via cross-validation.\nSometimes dropping rows or using marker values is more effective.\n\nIn this example, we artificially mark some of the elements in complete\ndataset as missing. Then we estimate performance using the complete dataset,\ndataset without the missing samples, after imputation without the indicator\nmatrix and imputation with the indicator matrix for the missing values.\n\nMissing values can be replaced by the mean, the median or the most frequent\nvalue using the ``strategy`` hyper-parameter.\nThe median is a more robust estimator for data with high magnitude variables\nwhich could dominate results (otherwise known as a 'long tail').\n\nScript output::\n\n Score with the complete dataset = 0.56\n Score without the samples containing missing values = 0.48\n Score after imputation of the missing values = 0.55\n Score after imputation with indicator features = 0.57\n\nIn this case, imputing helps the classifier get close to the original score.\n \n"
+   "\n# Imputing missing values before building an estimator\n\n\nThis example shows that imputing the missing values can give better results\nthan discarding the samples containing any missing value.\nImputing does not always improve the predictions, so please check via cross-validation.\nSometimes dropping rows or using marker values is more effective.\n\nMissing values can be replaced by the mean, the median or the most frequent\nvalue using the ``strategy`` hyper-parameter.\nThe median is a more robust estimator for data with high magnitude variables\nwhich could dominate results (otherwise known as a 'long tail').\n\nScript output::\n\n Score with the entire dataset = 0.56\n Score without the samples containing missing values = 0.48\n Score after imputation of the missing values = 0.55\n\nIn this case, imputing helps the classifier get close to the original score.\n \n"
  ],
  "cell_type": "markdown",
  "metadata": {}
@@ -24,7 +24,7 @@
  "execution_count": null,
  "cell_type": "code",
  "source": [
-  "import numpy as np\n\nfrom sklearn.datasets import load_boston\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import Imputer\nfrom sklearn.model_selection import cross_val_score\n\nrng = np.random.RandomState(0)\n\ndataset = load_boston()\nX_full, y_full = dataset.data, dataset.target\nn_samples = X_full.shape[0]\nn_features = X_full.shape[1]\n\n# Estimate the score on the entire dataset, with no missing values\nestimator = RandomForestRegressor(random_state=0, n_estimators=100)\nscore = cross_val_score(estimator, X_full, y_full).mean()\nprint(\"Score with the complete dataset = %.2f\" % score)\n\n# Add missing values in 75% of the lines\nmissing_rate = 0.75\nn_missing_samples = int(n_samples * missing_rate)\nmissing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,\n dtype=np.bool),\n np.ones(n_missing_samples,\n dtype=np.bool)))\nrng.shuffle(missing_samples)\nmissing_features = rng.randint(0, n_features, n_missing_samples)\n\n# Estimate the score without the lines containing missing values\nX_filtered = X_full[~missing_samples, :]\ny_filtered = y_full[~missing_samples]\nestimator = RandomForestRegressor(random_state=0, n_estimators=100)\nscore = cross_val_score(estimator, X_filtered, y_filtered).mean()\nprint(\"Score without the samples containing missing values = %.2f\" % score)\n\n# Estimate the score after imputation of the missing values\nX_missing = X_full.copy()\nX_missing[np.where(missing_samples)[0], missing_features] = 0\ny_missing = y_full.copy()\nestimator = Pipeline([(\"imputer\", Imputer(missing_values=0,\n strategy=\"mean\",\n axis=0)),\n (\"forest\", RandomForestRegressor(random_state=0,\n n_estimators=100))])\nscore = cross_val_score(estimator, X_missing, y_missing).mean()\nprint(\"Score after imputation of the missing values = %.2f\" % score)\n\n# Estimate score after imputation of the missing values with indicator matrix\nestimator = Pipeline([(\"imputer\", Imputer(missing_values=0,\n strategy=\"mean\",\n axis=0, add_indicator_features=True)),\n (\"forest\", RandomForestRegressor(random_state=0,\n n_estimators=100))])\nscore = cross_val_score(estimator, X_missing, y_missing).mean()\nprint(\"Score after imputation with indicator features = %.2f\" % score)"
+  "import numpy as np\n\nfrom sklearn.datasets import load_boston\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import Imputer\nfrom sklearn.model_selection import cross_val_score\n\nrng = np.random.RandomState(0)\n\ndataset = load_boston()\nX_full, y_full = dataset.data, dataset.target\nn_samples = X_full.shape[0]\nn_features = X_full.shape[1]\n\n# Estimate the score on the entire dataset, with no missing values\nestimator = RandomForestRegressor(random_state=0, n_estimators=100)\nscore = cross_val_score(estimator, X_full, y_full).mean()\nprint(\"Score with the entire dataset = %.2f\" % score)\n\n# Add missing values in 75% of the lines\nmissing_rate = 0.75\nn_missing_samples = np.floor(n_samples * missing_rate)\nmissing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,\n dtype=np.bool),\n np.ones(n_missing_samples,\n dtype=np.bool)))\nrng.shuffle(missing_samples)\nmissing_features = rng.randint(0, n_features, n_missing_samples)\n\n# Estimate the score without the lines containing missing values\nX_filtered = X_full[~missing_samples, :]\ny_filtered = y_full[~missing_samples]\nestimator = RandomForestRegressor(random_state=0, n_estimators=100)\nscore = cross_val_score(estimator, X_filtered, y_filtered).mean()\nprint(\"Score without the samples containing missing values = %.2f\" % score)\n\n# Estimate the score after imputation of the missing values\nX_missing = X_full.copy()\nX_missing[np.where(missing_samples)[0], missing_features] = 0\ny_missing = y_full.copy()\nestimator = Pipeline([(\"imputer\", Imputer(missing_values=0,\n strategy=\"mean\",\n axis=0)),\n (\"forest\", RandomForestRegressor(random_state=0,\n n_estimators=100))])\nscore = cross_val_score(estimator, X_missing, y_missing).mean()\nprint(\"Score after imputation of the missing values = %.2f\" % score)"
  ],
  "outputs": [],
  "metadata": {

dev/_downloads/missing_values.py

Lines changed: 3 additions & 18 deletions
@@ -8,22 +8,16 @@
 Imputing does not always improve the predictions, so please check via cross-validation.
 Sometimes dropping rows or using marker values is more effective.
 
-In this example, we artificially mark some of the elements in complete
-dataset as missing. Then we estimate performance using the complete dataset,
-dataset without the missing samples, after imputation without the indicator
-matrix and imputation with the indicator matrix for the missing values.
-
 Missing values can be replaced by the mean, the median or the most frequent
 value using the ``strategy`` hyper-parameter.
 The median is a more robust estimator for data with high magnitude variables
 which could dominate results (otherwise known as a 'long tail').
 
 Script output::
 
-    Score with the complete dataset = 0.56
+    Score with the entire dataset = 0.56
     Score without the samples containing missing values = 0.48
     Score after imputation of the missing values = 0.55
-    Score after imputation with indicator features = 0.57
 
 In this case, imputing helps the classifier get close to the original score.
 
@@ -46,11 +40,11 @@
 # Estimate the score on the entire dataset, with no missing values
 estimator = RandomForestRegressor(random_state=0, n_estimators=100)
 score = cross_val_score(estimator, X_full, y_full).mean()
-print("Score with the complete dataset = %.2f" % score)
+print("Score with the entire dataset = %.2f" % score)
 
 # Add missing values in 75% of the lines
 missing_rate = 0.75
-n_missing_samples = int(n_samples * missing_rate)
+n_missing_samples = np.floor(n_samples * missing_rate)
 missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                       dtype=np.bool),
                              np.ones(n_missing_samples,
@@ -76,12 +70,3 @@
                                                        n_estimators=100))])
 score = cross_val_score(estimator, X_missing, y_missing).mean()
 print("Score after imputation of the missing values = %.2f" % score)
-
-# Estimate score after imputation of the missing values with indicator matrix
-estimator = Pipeline([("imputer", Imputer(missing_values=0,
-                                          strategy="mean",
-                                          axis=0, add_indicator_features=True)),
-                      ("forest", RandomForestRegressor(random_state=0,
-                                                       n_estimators=100))])
-score = cross_val_score(estimator, X_missing, y_missing).mean()
-print("Score after imputation with indicator features = %.2f" % score)
