
Commit 7e03149

Pushing the docs to dev/ for branch: master, commit 952ef6637a13ba01055bf0ed9d29d525f0fee5bc
1 parent f767fe0 commit 7e03149

File tree

1,076 files changed: +3,334 / −3,681 lines

2 binary files changed (−289 bytes, −284 bytes); binary contents not shown.

dev/_downloads/plot_adaboost_multiclass.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
-
"print(__doc__)\n\n# Author: Noel Dawe <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom sklearn.externals.six.moves import zip\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_gaussian_quantiles\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.tree import DecisionTreeClassifier\n\n\nX, y = make_gaussian_quantiles(n_samples=13000, n_features=10,\n n_classes=3, random_state=1)\n\nn_split = 3000\n\nX_train, X_test = X[:n_split], X[n_split:]\ny_train, y_test = y[:n_split], y[n_split:]\n\nbdt_real = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2),\n n_estimators=600,\n learning_rate=1)\n\nbdt_discrete = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2),\n n_estimators=600,\n learning_rate=1.5,\n algorithm=\"SAMME\")\n\nbdt_real.fit(X_train, y_train)\nbdt_discrete.fit(X_train, y_train)\n\nreal_test_errors = []\ndiscrete_test_errors = []\n\nfor real_test_predict, discrete_train_predict in zip(\n bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):\n real_test_errors.append(\n 1. - accuracy_score(real_test_predict, y_test))\n discrete_test_errors.append(\n 1. - accuracy_score(discrete_train_predict, y_test))\n\nn_trees_discrete = len(bdt_discrete)\nn_trees_real = len(bdt_real)\n\n# Boosting might terminate early, but the following arrays are always\n# n_estimators long. We crop them to the actual number of trees here:\ndiscrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]\nreal_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]\ndiscrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]\n\nplt.figure(figsize=(15, 5))\n\nplt.subplot(131)\nplt.plot(range(1, n_trees_discrete + 1),\n discrete_test_errors, c='black', label='SAMME')\nplt.plot(range(1, n_trees_real + 1),\n real_test_errors, c='black',\n linestyle='dashed', label='SAMME.R')\nplt.legend()\nplt.ylim(0.18, 0.62)\nplt.ylabel('Test Error')\nplt.xlabel('Number of Trees')\n\nplt.subplot(132)\nplt.plot(range(1, n_trees_discrete + 1), discrete_estimator_errors,\n \"b\", label='SAMME', alpha=.5)\nplt.plot(range(1, n_trees_real + 1), real_estimator_errors,\n \"r\", label='SAMME.R', alpha=.5)\nplt.legend()\nplt.ylabel('Error')\nplt.xlabel('Number of Trees')\nplt.ylim((.2,\n max(real_estimator_errors.max(),\n discrete_estimator_errors.max()) * 1.2))\nplt.xlim((-20, len(bdt_discrete) + 20))\n\nplt.subplot(133)\nplt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights,\n \"b\", label='SAMME')\nplt.legend()\nplt.ylabel('Weight')\nplt.xlabel('Number of Trees')\nplt.ylim((0, discrete_estimator_weights.max() * 1.2))\nplt.xlim((-20, n_trees_discrete + 20))\n\n# prevent overlapping y-axis labels\nplt.subplots_adjust(wspace=0.25)\nplt.show()"
+
"print(__doc__)\n\n# Author: Noel Dawe <[email protected]>\n#\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_gaussian_quantiles\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.tree import DecisionTreeClassifier\n\n\nX, y = make_gaussian_quantiles(n_samples=13000, n_features=10,\n n_classes=3, random_state=1)\n\nn_split = 3000\n\nX_train, X_test = X[:n_split], X[n_split:]\ny_train, y_test = y[:n_split], y[n_split:]\n\nbdt_real = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2),\n n_estimators=600,\n learning_rate=1)\n\nbdt_discrete = AdaBoostClassifier(\n DecisionTreeClassifier(max_depth=2),\n n_estimators=600,\n learning_rate=1.5,\n algorithm=\"SAMME\")\n\nbdt_real.fit(X_train, y_train)\nbdt_discrete.fit(X_train, y_train)\n\nreal_test_errors = []\ndiscrete_test_errors = []\n\nfor real_test_predict, discrete_train_predict in zip(\n bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):\n real_test_errors.append(\n 1. - accuracy_score(real_test_predict, y_test))\n discrete_test_errors.append(\n 1. - accuracy_score(discrete_train_predict, y_test))\n\nn_trees_discrete = len(bdt_discrete)\nn_trees_real = len(bdt_real)\n\n# Boosting might terminate early, but the following arrays are always\n# n_estimators long. We crop them to the actual number of trees here:\ndiscrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]\nreal_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]\ndiscrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]\n\nplt.figure(figsize=(15, 5))\n\nplt.subplot(131)\nplt.plot(range(1, n_trees_discrete + 1),\n discrete_test_errors, c='black', label='SAMME')\nplt.plot(range(1, n_trees_real + 1),\n real_test_errors, c='black',\n linestyle='dashed', label='SAMME.R')\nplt.legend()\nplt.ylim(0.18, 0.62)\nplt.ylabel('Test Error')\nplt.xlabel('Number of Trees')\n\nplt.subplot(132)\nplt.plot(range(1, n_trees_discrete + 1), discrete_estimator_errors,\n \"b\", label='SAMME', alpha=.5)\nplt.plot(range(1, n_trees_real + 1), real_estimator_errors,\n \"r\", label='SAMME.R', alpha=.5)\nplt.legend()\nplt.ylabel('Error')\nplt.xlabel('Number of Trees')\nplt.ylim((.2,\n max(real_estimator_errors.max(),\n discrete_estimator_errors.max()) * 1.2))\nplt.xlim((-20, len(bdt_discrete) + 20))\n\nplt.subplot(133)\nplt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights,\n \"b\", label='SAMME')\nplt.legend()\nplt.ylabel('Weight')\nplt.xlabel('Number of Trees')\nplt.ylim((0, discrete_estimator_weights.max() * 1.2))\nplt.xlim((-20, n_trees_discrete + 20))\n\n# prevent overlapping y-axis labels\nplt.subplots_adjust(wspace=0.25)\nplt.show()"
]
}
],

dev/_downloads/plot_adaboost_multiclass.py

Lines changed: 0 additions & 2 deletions
@@ -29,8 +29,6 @@
 #
 # License: BSD 3 clause
 
-from sklearn.externals.six.moves import zip
-
 import matplotlib.pyplot as plt
 
 from sklearn.datasets import make_gaussian_quantiles
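The only change in this file is dropping the `six.moves.zip` shim: on Python 2 it aliased `itertools.izip` to get a lazy iterator, while on Python 3 the built-in `zip` is already lazy, so the example runs unchanged without it. A minimal sketch of the equivalence follows; the toy error values are illustrative and not taken from the example.

# Python 3's built-in zip returns a lazy iterator,
# which is what six.moves.zip provided on Python 2.
staged_real = iter([0.42, 0.35, 0.30])
staged_discrete = iter([0.45, 0.38, 0.33])
for err_real, err_discrete in zip(staged_real, staged_discrete):
    print(err_real, err_discrete)  # pairs are yielded one at a time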

dev/_downloads/plot_bicluster_newsgroups.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
-
"from __future__ import print_function\n\nfrom collections import defaultdict\nimport operator\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.externals.six import iteritems\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\nprint(__doc__)\n\n\ndef number_normalizer(tokens):\n \"\"\" Map all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n return (\"#NUMBER\" if token[0].isdigit() else token for token in tokens)\n\n\nclass NumberNormalizingVectorizer(TfidfVectorizer):\n def build_tokenizer(self):\n tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()\n return lambda doc: list(number_normalizer(tokenize(doc)))\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
+
"from __future__ import print_function\n\nfrom collections import defaultdict\nimport operator\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\nprint(__doc__)\n\n\ndef number_normalizer(tokens):\n \"\"\" Map all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n return (\"#NUMBER\" if token[0].isdigit() else token for token in tokens)\n\n\nclass NumberNormalizingVectorizer(TfidfVectorizer):\n def build_tokenizer(self):\n tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()\n return lambda doc: list(number_normalizer(tokenize(doc)))\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(d.items(), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
]
}
],

dev/_downloads/plot_bicluster_newsgroups.py

Lines changed: 1 addition & 2 deletions
@@ -32,7 +32,6 @@
 
 from sklearn.cluster.bicluster import SpectralCoclustering
 from sklearn.cluster import MiniBatchKMeans
-from sklearn.externals.six import iteritems
 from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.cluster import v_measure_score
@@ -116,7 +115,7 @@ def most_common(d):
 
     Like Counter.most_common in Python >=2.7.
     """
-    return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)
+    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
 
 
 bicluster_ncuts = list(bicluster_ncut(i)
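The substantive change here swaps `six.iteritems(d)` for `d.items()`: on Python 3, `dict.items()` already returns a lightweight view, so no compatibility helper is needed when ranking category counts. A sketch of the same pattern on a toy counter; the category names below are made up for illustration.

from collections import defaultdict
import operator

# Count a few illustrative newsgroup labels.
counts = defaultdict(int)
for name in ["rec.autos", "sci.med", "rec.autos"]:
    counts[name] += 1

# Python 3 equivalent of sorted(iteritems(counts), ...) from the old code.
ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
print(ranked)  # [('rec.autos', 2), ('sci.med', 1)]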

0 commit comments
