
Commit 6b92b29

Pushing the docs to dev/ for branch: master, commit 54c3a1fbe7ef0f6814ae6406fbc0d52804303370
1 parent 92a6b80 commit 6b92b29

1,213 files changed: +3,992 / -3,915 lines


dev/_downloads/3bd5b40c045ee4efd6b519876b8553cb/plot_forest_importances.py

Lines changed: 6 additions & 1 deletion
@@ -10,6 +10,11 @@

 As expected, the plot suggests that 3 features are informative, while the
 remaining are not.
+
+Warning: impurity-based feature importances can be misleading for high
+cardinality features (many unique values). See
+:func:`sklearn.inspection.permutation_importance` as an alternative.
+
 """
 print(__doc__)

@@ -49,7 +54,7 @@
 plt.figure()
 plt.title("Feature importances")
 plt.bar(range(X.shape[1]), importances[indices],
-       color="r", yerr=std[indices], align="center")
+        color="r", yerr=std[indices], align="center")
 plt.xticks(range(X.shape[1]), indices)
 plt.xlim([-1, X.shape[1]])
 plt.show()
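The note added above points readers to permutation importance instead. As a rough, hypothetical sketch (not part of this commit), computing it for this example could look like the following, assuming the fitted forest and the X, y arrays from plot_forest_importances.py are in scope:

    from sklearn.inspection import permutation_importance

    # Permutation importance shuffles one feature at a time and measures the
    # drop in score, so it is not biased toward high-cardinality features the
    # way impurity-based importances can be.
    result = permutation_importance(forest, X, y, n_repeats=10, random_state=0)

    for idx in result.importances_mean.argsort()[::-1]:
        print("feature %d: %.3f +/- %.3f"
              % (idx, result.importances_mean[idx], result.importances_std[idx]))

Running it on a held-out split rather than the training data is generally preferable, since training-set scores can still reward features the model has overfit.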

dev/_downloads/679566501b743cb339497968edb9d62f/plot_gradient_boosting_regression.py

Lines changed: 5 additions & 0 deletions
@@ -63,6 +63,11 @@

 # #############################################################################
 # Plot impurity-based feature importance
+#
+# Warning: impurity-based feature importances can be misleading for
+# high cardinality features (many unique values). See
+# :func:`sklearn.inspection.permutation_importance` as an alternative.
+
 feature_importance = clf.feature_importances_
 # make importances relative to max importance
 feature_importance = 100.0 * (feature_importance / feature_importance.max())
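Here as well, a hedged sketch of the suggested alternative (not part of this commit): permutation importance evaluated on the test split, assuming the fitted clf, X_test, y_test and boston.feature_names from plot_gradient_boosting_regression.py are available:

    import matplotlib.pyplot as plt
    from sklearn.inspection import permutation_importance

    # Evaluate on the held-out split so the ranking reflects generalization
    # rather than training-set impurity decreases.
    result = permutation_importance(clf, X_test, y_test, n_repeats=10,
                                    random_state=42)
    sorted_idx = result.importances_mean.argsort()

    # One box per feature, built from the n_repeats shuffles.
    plt.boxplot(result.importances[sorted_idx].T, vert=False,
                labels=boston.feature_names[sorted_idx])
    plt.title("Permutation Importance (test set)")
    plt.show()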

dev/_downloads/9fbbe00328ea0a237498701b1e8827fa/plot_forest_importances.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- old:
"\n# Feature importances with forests of trees\n\n\nThis examples shows the use of forests of trees to evaluate the importance of\nfeatures on an artificial classification task. The red bars are\nthe impurity-based feature importances of the forest,\nalong with their inter-trees variability.\n\nAs expected, the plot suggests that 3 features are informative, while the\nremaining are not.\n"
+ new:
"\n# Feature importances with forests of trees\n\n\nThis examples shows the use of forests of trees to evaluate the importance of\nfeatures on an artificial classification task. The red bars are\nthe impurity-based feature importances of the forest,\nalong with their inter-trees variability.\n\nAs expected, the plot suggests that 3 features are informative, while the\nremaining are not.\n\nWarning: impurity-based feature importances can be misleading for high\ncardinality features (many unique values). See\n:func:`sklearn.inspection.permutation_importance` as an alternative.\n"
 ]
 },
 {
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
- old:
"print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import ExtraTreesClassifier\n\n# Build a classification task using 3 informative features\nX, y = make_classification(n_samples=1000,\n n_features=10,\n n_informative=3,\n n_redundant=0,\n n_repeated=0,\n n_classes=2,\n random_state=0,\n shuffle=False)\n\n# Build a forest and compute the impurity-based feature importances\nforest = ExtraTreesClassifier(n_estimators=250,\n random_state=0)\n\nforest.fit(X, y)\nimportances = forest.feature_importances_\nstd = np.std([tree.feature_importances_ for tree in forest.estimators_],\n axis=0)\nindices = np.argsort(importances)[::-1]\n\n# Print the feature ranking\nprint(\"Feature ranking:\")\n\nfor f in range(X.shape[1]):\n print(\"%d. feature %d (%f)\" % (f + 1, indices[f], importances[indices[f]]))\n\n# Plot the impurity-based feature importances of the forest\nplt.figure()\nplt.title(\"Feature importances\")\nplt.bar(range(X.shape[1]), importances[indices],\n color=\"r\", yerr=std[indices], align=\"center\")\nplt.xticks(range(X.shape[1]), indices)\nplt.xlim([-1, X.shape[1]])\nplt.show()"
+ new:
"print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import ExtraTreesClassifier\n\n# Build a classification task using 3 informative features\nX, y = make_classification(n_samples=1000,\n n_features=10,\n n_informative=3,\n n_redundant=0,\n n_repeated=0,\n n_classes=2,\n random_state=0,\n shuffle=False)\n\n# Build a forest and compute the impurity-based feature importances\nforest = ExtraTreesClassifier(n_estimators=250,\n random_state=0)\n\nforest.fit(X, y)\nimportances = forest.feature_importances_\nstd = np.std([tree.feature_importances_ for tree in forest.estimators_],\n axis=0)\nindices = np.argsort(importances)[::-1]\n\n# Print the feature ranking\nprint(\"Feature ranking:\")\n\nfor f in range(X.shape[1]):\n print(\"%d. feature %d (%f)\" % (f + 1, indices[f], importances[indices[f]]))\n\n# Plot the impurity-based feature importances of the forest\nplt.figure()\nplt.title(\"Feature importances\")\nplt.bar(range(X.shape[1]), importances[indices],\n color=\"r\", yerr=std[indices], align=\"center\")\nplt.xticks(range(X.shape[1]), indices)\nplt.xlim([-1, X.shape[1]])\nplt.show()"
 ]
 }
 ],

dev/_downloads/cdc6134a701824f26cc08df7bd1e479a/plot_gradient_boosting_regression.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
- old:
"print(__doc__)\n\n# Author: Peter Prettenhofer <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import ensemble\nfrom sklearn import datasets\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import mean_squared_error\n\n# #############################################################################\n# Load data\nboston = datasets.load_boston()\nX, y = shuffle(boston.data, boston.target, random_state=13)\nX = X.astype(np.float32)\noffset = int(X.shape[0] * 0.9)\nX_train, y_train = X[:offset], y[:offset]\nX_test, y_test = X[offset:], y[offset:]\n\n# #############################################################################\n# Fit regression model\nparams = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,\n 'learning_rate': 0.01, 'loss': 'ls'}\nclf = ensemble.GradientBoostingRegressor(**params)\n\nclf.fit(X_train, y_train)\nmse = mean_squared_error(y_test, clf.predict(X_test))\nprint(\"MSE: %.4f\" % mse)\n\n# #############################################################################\n# Plot training deviance\n\n# compute test set deviance\ntest_score = np.zeros((params['n_estimators'],), dtype=np.float64)\n\nfor i, y_pred in enumerate(clf.staged_predict(X_test)):\n test_score[i] = clf.loss_(y_test, y_pred)\n\nplt.figure(figsize=(12, 6))\nplt.subplot(1, 2, 1)\nplt.title('Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',\n label='Training Set Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',\n label='Test Set Deviance')\nplt.legend(loc='upper right')\nplt.xlabel('Boosting Iterations')\nplt.ylabel('Deviance')\n\n# #############################################################################\n# Plot impurity-based feature importance\nfeature_importance = clf.feature_importances_\n# make importances relative to max importance\nfeature_importance = 100.0 * (feature_importance / feature_importance.max())\nsorted_idx = np.argsort(feature_importance)\npos = np.arange(sorted_idx.shape[0]) + .5\nplt.subplot(1, 2, 2)\nplt.barh(pos, feature_importance[sorted_idx], align='center')\nplt.yticks(pos, boston.feature_names[sorted_idx])\nplt.xlabel('Relative Importance')\nplt.title('Variable Importance')\nplt.show()"
+ new:
"print(__doc__)\n\n# Author: Peter Prettenhofer <[email protected]>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import ensemble\nfrom sklearn import datasets\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import mean_squared_error\n\n# #############################################################################\n# Load data\nboston = datasets.load_boston()\nX, y = shuffle(boston.data, boston.target, random_state=13)\nX = X.astype(np.float32)\noffset = int(X.shape[0] * 0.9)\nX_train, y_train = X[:offset], y[:offset]\nX_test, y_test = X[offset:], y[offset:]\n\n# #############################################################################\n# Fit regression model\nparams = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,\n 'learning_rate': 0.01, 'loss': 'ls'}\nclf = ensemble.GradientBoostingRegressor(**params)\n\nclf.fit(X_train, y_train)\nmse = mean_squared_error(y_test, clf.predict(X_test))\nprint(\"MSE: %.4f\" % mse)\n\n# #############################################################################\n# Plot training deviance\n\n# compute test set deviance\ntest_score = np.zeros((params['n_estimators'],), dtype=np.float64)\n\nfor i, y_pred in enumerate(clf.staged_predict(X_test)):\n test_score[i] = clf.loss_(y_test, y_pred)\n\nplt.figure(figsize=(12, 6))\nplt.subplot(1, 2, 1)\nplt.title('Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',\n label='Training Set Deviance')\nplt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',\n label='Test Set Deviance')\nplt.legend(loc='upper right')\nplt.xlabel('Boosting Iterations')\nplt.ylabel('Deviance')\n\n# #############################################################################\n# Plot impurity-based feature importance\n#\n# Warning: impurity-based feature importances can be misleading for\n# high cardinality features (many unique values). See\n# :func:`sklearn.inspection.permutation_importance` as an alternative.\n\nfeature_importance = clf.feature_importances_\n# make importances relative to max importance\nfeature_importance = 100.0 * (feature_importance / feature_importance.max())\nsorted_idx = np.argsort(feature_importance)\npos = np.arange(sorted_idx.shape[0]) + .5\nplt.subplot(1, 2, 2)\nplt.barh(pos, feature_importance[sorted_idx], align='center')\nplt.yticks(pos, boston.feature_names[sorted_idx])\nplt.xlabel('Relative Importance')\nplt.title('Variable Importance')\nplt.show()"
 ]
 }
 ],
dev/_downloads/scikit-learn-docs.pdf

Binary file not shown.

dev/_images/iris.png

Binary file not shown.
