scikit-learn
diff --git a/‎dev/_downloads/auto_examples_jupyter.zip
75 Bytes b/‎dev/_downloads/auto_examples_jupyter.zip
75 Bytes
diff --git a/‎dev/_downloads/auto_examples_python.zip
73 Bytes b/‎dev/_downloads/auto_examples_python.zip
73 Bytes
diff --git a/‎dev/_downloads/plot_classifier_chain_yeast.ipynb
Lines changed: 2 additions & 2 deletions b/‎dev/_downloads/plot_classifier_chain_yeast.ipynb
Lines changed: 2 additions & 2 deletions
diff --git a/‎dev/_downloads/plot_classifier_chain_yeast.py
Lines changed: 8 additions & 6 deletions b/‎dev/_downloads/plot_classifier_chain_yeast.py
Lines changed: 8 additions & 6 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.pdf
35.1 KB b/‎dev/_downloads/scikit-learn-docs.pdf
35.1 KB
diff --git a/‎dev/_images/iris.png
0 Bytes b/‎dev/_images/iris.png
0 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
632 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
632 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
632 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0011.png
632 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
577 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
577 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0021.png
577 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_0021.png
577 Bytes
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# Classifier Chain\n\nExample of using classifier chain on a multilabel dataset.\n\nFor this example we will use the `yeast\n<http://mldata.org/repository/data/viewslug/yeast>`_ dataset which contains\n2417 datapoints each with 103 features and 14 possible labels. Each\ndata point has at least one label. As a baseline we first train a logistic\nregression classifier for each of the 14 labels. To evaluate the performance of\nthese classifiers we predict on a held-out test set and calculate the\n`jaccard similarity score <jaccard_similarity_score>`.\n\nNext we create 10 classifier chains. Each classifier chain contains a\nlogistic regression model for each of the 14 labels. The models in each\nchain are ordered randomly. In addition to the 103 features in the dataset,\neach model gets the predictions of the preceding models in the chain as\nfeatures (note that by default at training time each model gets the true\nlabels as features). These additional features allow each chain to exploit\ncorrelations among the classes. The Jaccard similarity score for each chain\ntends to be greater than that of the set independent logistic models.\n\nBecause the models in each chain are arranged randomly there is significant\nvariation in performance among the chains. Presumably there is an optimal\nordering of the classes in a chain that will yield the best performance.\nHowever we do not know that ordering a priori. Instead we can construct an\nvoting ensemble of classifier chains by averaging the binary predictions of\nthe chains and apply a threshold of 0.5. The Jaccard similarity score of the\nensemble is greater than that of the independent models and tends to exceed\nthe score of each chain in the ensemble (although this is not guaranteed\nwith randomly ordered chains).\n\n"
+        "\n# Classifier Chain\n\nExample of using classifier chain on a multilabel dataset.\n\nFor this example we will use the `yeast\n<http://mldata.org/repository/data/viewslug/yeast>`_ dataset which contains\n2417 datapoints each with 103 features and 14 possible labels. Each\ndata point has at least one label. As a baseline we first train a logistic\nregression classifier for each of the 14 labels. To evaluate the performance of\nthese classifiers we predict on a held-out test set and calculate the\n`jaccard score <jaccard_score>` for each sample.\n\nNext we create 10 classifier chains. Each classifier chain contains a\nlogistic regression model for each of the 14 labels. The models in each\nchain are ordered randomly. In addition to the 103 features in the dataset,\neach model gets the predictions of the preceding models in the chain as\nfeatures (note that by default at training time each model gets the true\nlabels as features). These additional features allow each chain to exploit\ncorrelations among the classes. The Jaccard similarity score for each chain\ntends to be greater than that of the set independent logistic models.\n\nBecause the models in each chain are arranged randomly there is significant\nvariation in performance among the chains. Presumably there is an optimal\nordering of the classes in a chain that will yield the best performance.\nHowever we do not know that ordering a priori. Instead we can construct an\nvoting ensemble of classifier chains by averaging the binary predictions of\nthe chains and apply a threshold of 0.5. The Jaccard similarity score of the\nensemble is greater than that of the independent models and tends to exceed\nthe score of each chain in the ensemble (although this is not guaranteed\nwith randomly ordered chains).\n\n"
       ]
     },
     {
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "# Author: Adam Kleczewski\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.multioutput import ClassifierChain\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.metrics import jaccard_similarity_score\nfrom sklearn.linear_model import LogisticRegression\n\nprint(__doc__)\n\n# Load a multi-label dataset from https://www.openml.org/d/40597\nX, Y = fetch_openml('yeast', version=4, return_X_y=True)\nY = Y == 'TRUE'\nX_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,\n                                                    random_state=0)\n\n# Fit an independent logistic regression model for each class using the\n# OneVsRestClassifier wrapper.\nbase_lr = LogisticRegression(solver='lbfgs')\novr = OneVsRestClassifier(base_lr)\novr.fit(X_train, Y_train)\nY_pred_ovr = ovr.predict(X_test)\novr_jaccard_score = jaccard_similarity_score(Y_test, Y_pred_ovr)\n\n# Fit an ensemble of logistic regression classifier chains and take the\n# take the average prediction of all the chains.\nchains = [ClassifierChain(base_lr, order='random', random_state=i)\n          for i in range(10)]\nfor chain in chains:\n    chain.fit(X_train, Y_train)\n\nY_pred_chains = np.array([chain.predict(X_test) for chain in\n                          chains])\nchain_jaccard_scores = [jaccard_similarity_score(Y_test, Y_pred_chain >= .5)\n                        for Y_pred_chain in Y_pred_chains]\n\nY_pred_ensemble = Y_pred_chains.mean(axis=0)\nensemble_jaccard_score = jaccard_similarity_score(Y_test,\n                                                  Y_pred_ensemble >= .5)\n\nmodel_scores = [ovr_jaccard_score] + chain_jaccard_scores\nmodel_scores.append(ensemble_jaccard_score)\n\nmodel_names = ('Independent',\n               'Chain 1',\n               'Chain 2',\n               'Chain 3',\n               'Chain 4',\n              
 'Chain 5',\n               'Chain 6',\n               'Chain 7',\n               'Chain 8',\n               'Chain 9',\n               'Chain 10',\n               'Ensemble')\n\nx_pos = np.arange(len(model_names))\n\n# Plot the Jaccard similarity scores for the independent model, each of the\n# chains, and the ensemble (note that the vertical axis on this plot does\n# not begin at 0).\n\nfig, ax = plt.subplots(figsize=(7, 4))\nax.grid(True)\nax.set_title('Classifier Chain Ensemble Performance Comparison')\nax.set_xticks(x_pos)\nax.set_xticklabels(model_names, rotation='vertical')\nax.set_ylabel('Jaccard Similarity Score')\nax.set_ylim([min(model_scores) * .9, max(model_scores) * 1.1])\ncolors = ['r'] + ['b'] * len(chain_jaccard_scores) + ['g']\nax.bar(x_pos, model_scores, alpha=0.5, color=colors)\nplt.tight_layout()\nplt.show()"
+        "# Author: Adam Kleczewski\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.multioutput import ClassifierChain\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.metrics import jaccard_score\nfrom sklearn.linear_model import LogisticRegression\n\nprint(__doc__)\n\n# Load a multi-label dataset from https://www.openml.org/d/40597\nX, Y = fetch_openml('yeast', version=4, return_X_y=True)\nY = Y == 'TRUE'\nX_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,\n                                                    random_state=0)\n\n# Fit an independent logistic regression model for each class using the\n# OneVsRestClassifier wrapper.\nbase_lr = LogisticRegression(solver='lbfgs')\novr = OneVsRestClassifier(base_lr)\novr.fit(X_train, Y_train)\nY_pred_ovr = ovr.predict(X_test)\novr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')\n\n# Fit an ensemble of logistic regression classifier chains and take the\n# take the average prediction of all the chains.\nchains = [ClassifierChain(base_lr, order='random', random_state=i)\n          for i in range(10)]\nfor chain in chains:\n    chain.fit(X_train, Y_train)\n\nY_pred_chains = np.array([chain.predict(X_test) for chain in\n                          chains])\nchain_jaccard_scores = [jaccard_score(Y_test, Y_pred_chain >= .5,\n                                      average='samples')\n                        for Y_pred_chain in Y_pred_chains]\n\nY_pred_ensemble = Y_pred_chains.mean(axis=0)\nensemble_jaccard_score = jaccard_score(Y_test,\n                                       Y_pred_ensemble >= .5,\n                                       average='samples')\n\nmodel_scores = [ovr_jaccard_score] + chain_jaccard_scores\nmodel_scores.append(ensemble_jaccard_score)\n\nmodel_names = ('Independent',\n               'Chain 1',\n              
 'Chain 2',\n               'Chain 3',\n               'Chain 4',\n               'Chain 5',\n               'Chain 6',\n               'Chain 7',\n               'Chain 8',\n               'Chain 9',\n               'Chain 10',\n               'Ensemble')\n\nx_pos = np.arange(len(model_names))\n\n# Plot the Jaccard similarity scores for the independent model, each of the\n# chains, and the ensemble (note that the vertical axis on this plot does\n# not begin at 0).\n\nfig, ax = plt.subplots(figsize=(7, 4))\nax.grid(True)\nax.set_title('Classifier Chain Ensemble Performance Comparison')\nax.set_xticks(x_pos)\nax.set_xticklabels(model_names, rotation='vertical')\nax.set_ylabel('Jaccard Similarity Score')\nax.set_ylim([min(model_scores) * .9, max(model_scores) * 1.1])\ncolors = ['r'] + ['b'] * len(chain_jaccard_scores) + ['g']\nax.bar(x_pos, model_scores, alpha=0.5, color=colors)\nplt.tight_layout()\nplt.show()"
       ]
     }
   ],
 
@@ -10,7 +10,7 @@
 data point has at least one label. As a baseline we first train a logistic
 regression classifier for each of the 14 labels. To evaluate the performance of
 these classifiers we predict on a held-out test set and calculate the
-:ref:`jaccard similarity score <jaccard_similarity_score>`.
+:ref:`jaccard score <jaccard_score>` for each sample.
 
 Next we create 10 classifier chains. Each classifier chain contains a
 logistic regression model for each of the 14 labels. The models in each
@@ -41,7 +41,7 @@
 from sklearn.multioutput import ClassifierChain
 from sklearn.model_selection import train_test_split
 from sklearn.multiclass import OneVsRestClassifier
-from sklearn.metrics import jaccard_similarity_score
+from sklearn.metrics import jaccard_score
 from sklearn.linear_model import LogisticRegression
 
 print(__doc__)
@@ -58,7 +58,7 @@
 ovr = OneVsRestClassifier(base_lr)
 ovr.fit(X_train, Y_train)
 Y_pred_ovr = ovr.predict(X_test)
-ovr_jaccard_score = jaccard_similarity_score(Y_test, Y_pred_ovr)
+ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')
 
 # Fit an ensemble of logistic regression classifier chains and take the
 # average prediction of all the chains.
@@ -69,12 +69,14 @@
 
 Y_pred_chains = np.array([chain.predict(X_test) for chain in
                           chains])
-chain_jaccard_scores = [jaccard_similarity_score(Y_test, Y_pred_chain >= .5)
+chain_jaccard_scores = [jaccard_score(Y_test, Y_pred_chain >= .5,
+                                      average='samples')
                         for Y_pred_chain in Y_pred_chains]
 
 Y_pred_ensemble = Y_pred_chains.mean(axis=0)
-ensemble_jaccard_score = jaccard_similarity_score(Y_test,
-                                                  Y_pred_ensemble >= .5)
+ensemble_jaccard_score = jaccard_score(Y_test,
+                                       Y_pred_ensemble >= .5,
+                                       average='samples')
 
 model_scores = [ovr_jaccard_score] + chain_jaccard_scores
 model_scores.append(ensemble_jaccard_score)
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`"cell_type": "markdown",`
`16`	`16`	`"metadata": {},`
`17`	`17`	`"source": [`
`18`		- "\n# Classifier Chain\n\nExample of using classifier chain on a multilabel dataset.\n\nFor this example we will use the `yeast\n<http://mldata.org/repository/data/viewslug/yeast>`_ dataset which contains\n2417 datapoints each with 103 features and 14 possible labels. Each\ndata point has at least one label. As a baseline we first train a logistic\nregression classifier for each of the 14 labels. To evaluate the performance of\nthese classifiers we predict on a held-out test set and calculate the\n`jaccard similarity score <jaccard_similarity_score>`.\n\nNext we create 10 classifier chains. Each classifier chain contains a\nlogistic regression model for each of the 14 labels. The models in each\nchain are ordered randomly. In addition to the 103 features in the dataset,\neach model gets the predictions of the preceding models in the chain as\nfeatures (note that by default at training time each model gets the true\nlabels as features). These additional features allow each chain to exploit\ncorrelations among the classes. The Jaccard similarity score for each chain\ntends to be greater than that of the set independent logistic models.\n\nBecause the models in each chain are arranged randomly there is significant\nvariation in performance among the chains. Presumably there is an optimal\nordering of the classes in a chain that will yield the best performance.\nHowever we do not know that ordering a priori. Instead we can construct an\nvoting ensemble of classifier chains by averaging the binary predictions of\nthe chains and apply a threshold of 0.5. The Jaccard similarity score of the\nensemble is greater than that of the independent models and tends to exceed\nthe score of each chain in the ensemble (although this is not guaranteed\nwith randomly ordered chains).\n\n"
	`18`	+ "\n# Classifier Chain\n\nExample of using classifier chain on a multilabel dataset.\n\nFor this example we will use the `yeast\n<http://mldata.org/repository/data/viewslug/yeast>`_ dataset which contains\n2417 datapoints each with 103 features and 14 possible labels. Each\ndata point has at least one label. As a baseline we first train a logistic\nregression classifier for each of the 14 labels. To evaluate the performance of\nthese classifiers we predict on a held-out test set and calculate the\n`jaccard score <jaccard_score>` for each sample.\n\nNext we create 10 classifier chains. Each classifier chain contains a\nlogistic regression model for each of the 14 labels. The models in each\nchain are ordered randomly. In addition to the 103 features in the dataset,\neach model gets the predictions of the preceding models in the chain as\nfeatures (note that by default at training time each model gets the true\nlabels as features). These additional features allow each chain to exploit\ncorrelations among the classes. The Jaccard similarity score for each chain\ntends to be greater than that of the set independent logistic models.\n\nBecause the models in each chain are arranged randomly there is significant\nvariation in performance among the chains. Presumably there is an optimal\nordering of the classes in a chain that will yield the best performance.\nHowever we do not know that ordering a priori. Instead we can construct an\nvoting ensemble of classifier chains by averaging the binary predictions of\nthe chains and apply a threshold of 0.5. The Jaccard similarity score of the\nensemble is greater than that of the independent models and tends to exceed\nthe score of each chain in the ensemble (although this is not guaranteed\nwith randomly ordered chains).\n\n"
`19`	`19`	`]`
`20`	`20`	`},`
`21`	`21`	`{`
`@@ -26,7 +26,7 @@`
`26`	`26`	`},`
`27`	`27`	`"outputs": [],`
`28`	`28`	`"source": [`
`29`		- "# Author: Adam Kleczewski\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.multioutput import ClassifierChain\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.metrics import jaccard_similarity_score\nfrom sklearn.linear_model import LogisticRegression\n\nprint(__doc__)\n\n# Load a multi-label dataset from https://www.openml.org/d/40597\nX, Y = fetch_openml('yeast', version=4, return_X_y=True)\nY = Y == 'TRUE'\nX_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,\n random_state=0)\n\n# Fit an independent logistic regression model for each class using the\n# OneVsRestClassifier wrapper.\nbase_lr = LogisticRegression(solver='lbfgs')\novr = OneVsRestClassifier(base_lr)\novr.fit(X_train, Y_train)\nY_pred_ovr = ovr.predict(X_test)\novr_jaccard_score = jaccard_similarity_score(Y_test, Y_pred_ovr)\n\n# Fit an ensemble of logistic regression classifier chains and take the\n# take the average prediction of all the chains.\nchains = [ClassifierChain(base_lr, order='random', random_state=i)\n for i in range(10)]\nfor chain in chains:\n chain.fit(X_train, Y_train)\n\nY_pred_chains = np.array([chain.predict(X_test) for chain in\n chains])\nchain_jaccard_scores = [jaccard_similarity_score(Y_test, Y_pred_chain >= .5)\n for Y_pred_chain in Y_pred_chains]\n\nY_pred_ensemble = Y_pred_chains.mean(axis=0)\nensemble_jaccard_score = jaccard_similarity_score(Y_test,\n Y_pred_ensemble >= .5)\n\nmodel_scores = [ovr_jaccard_score] + chain_jaccard_scores\nmodel_scores.append(ensemble_jaccard_score)\n\nmodel_names = ('Independent',\n 'Chain 1',\n 'Chain 2',\n 'Chain 3',\n 'Chain 4',\n 'Chain 5',\n 'Chain 6',\n 'Chain 7',\n 'Chain 8',\n 'Chain 9',\n 'Chain 10',\n 'Ensemble')\n\nx_pos = np.arange(len(model_names))\n\n# Plot the Jaccard similarity scores for the independent model, each of the\n# chains, and the 
ensemble (note that the vertical axis on this plot does\n# not begin at 0).\n\nfig, ax = plt.subplots(figsize=(7, 4))\nax.grid(True)\nax.set_title('Classifier Chain Ensemble Performance Comparison')\nax.set_xticks(x_pos)\nax.set_xticklabels(model_names, rotation='vertical')\nax.set_ylabel('Jaccard Similarity Score')\nax.set_ylim([min(model_scores) * .9, max(model_scores) * 1.1])\ncolors = ['r'] + ['b'] * len(chain_jaccard_scores) + ['g']\nax.bar(x_pos, model_scores, alpha=0.5, color=colors)\nplt.tight_layout()\nplt.show()"
	`29`	+ "# Author: Adam Kleczewski\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.multioutput import ClassifierChain\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.metrics import jaccard_score\nfrom sklearn.linear_model import LogisticRegression\n\nprint(__doc__)\n\n# Load a multi-label dataset from https://www.openml.org/d/40597\nX, Y = fetch_openml('yeast', version=4, return_X_y=True)\nY = Y == 'TRUE'\nX_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,\n random_state=0)\n\n# Fit an independent logistic regression model for each class using the\n# OneVsRestClassifier wrapper.\nbase_lr = LogisticRegression(solver='lbfgs')\novr = OneVsRestClassifier(base_lr)\novr.fit(X_train, Y_train)\nY_pred_ovr = ovr.predict(X_test)\novr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')\n\n# Fit an ensemble of logistic regression classifier chains and take the\n# take the average prediction of all the chains.\nchains = [ClassifierChain(base_lr, order='random', random_state=i)\n for i in range(10)]\nfor chain in chains:\n chain.fit(X_train, Y_train)\n\nY_pred_chains = np.array([chain.predict(X_test) for chain in\n chains])\nchain_jaccard_scores = [jaccard_score(Y_test, Y_pred_chain >= .5,\n average='samples')\n for Y_pred_chain in Y_pred_chains]\n\nY_pred_ensemble = Y_pred_chains.mean(axis=0)\nensemble_jaccard_score = jaccard_score(Y_test,\n Y_pred_ensemble >= .5,\n average='samples')\n\nmodel_scores = [ovr_jaccard_score] + chain_jaccard_scores\nmodel_scores.append(ensemble_jaccard_score)\n\nmodel_names = ('Independent',\n 'Chain 1',\n 'Chain 2',\n 'Chain 3',\n 'Chain 4',\n 'Chain 5',\n 'Chain 6',\n 'Chain 7',\n 'Chain 8',\n 'Chain 9',\n 'Chain 10',\n 'Ensemble')\n\nx_pos = np.arange(len(model_names))\n\n# Plot the Jaccard similarity scores for the independent model, each of the\n# 
chains, and the ensemble (note that the vertical axis on this plot does\n# not begin at 0).\n\nfig, ax = plt.subplots(figsize=(7, 4))\nax.grid(True)\nax.set_title('Classifier Chain Ensemble Performance Comparison')\nax.set_xticks(x_pos)\nax.set_xticklabels(model_names, rotation='vertical')\nax.set_ylabel('Jaccard Similarity Score')\nax.set_ylim([min(model_scores) * .9, max(model_scores) * 1.1])\ncolors = ['r'] + ['b'] * len(chain_jaccard_scores) + ['g']\nax.bar(x_pos, model_scores, alpha=0.5, color=colors)\nplt.tight_layout()\nplt.show()"
`30`	`30`	`]`
`31`	`31`	`}`
`32`	`32`	`],`