
Commit 126ea70

Pushing the docs to dev/ for branch: master, commit a4db10152b4fc1b976a54eca83965b6aa0121597
1 parent: 63a020c

File tree

1,195 files changed (+4010, -4035 lines)


dev/_downloads/28477181ee2a477248e703cf646f97f1/plot_sparse_logistic_regression_20newsgroups.ipynb

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Multiclass sparse logisitic regression on newgroups20\n\n\nComparison of multinomial logistic L1 vs one-versus-rest L1 logistic regression\nto classify documents from the newgroups20 dataset. Multinomial logistic\nregression yields more accurate results and is faster to train on the larger\nscale dataset.\n\nHere we use the l1 sparsity that trims the weights of not informative\nfeatures to zero. This is good if the goal is to extract the strongly\ndiscriminative vocabulary of each class. If the goal is to get the best\npredictive accuracy, it is better to use the non sparsity-inducing l2 penalty\ninstead.\n\nA more traditional (and possibly better) way to predict on a sparse subset of\ninput features would be to use univariate feature selection followed by a\ntraditional (l2-penalised) logistic regression model.\n"
+"\n# Multiclass sparse logistic regression on 20newgroups\n\n\nComparison of multinomial logistic L1 vs one-versus-rest L1 logistic regression\nto classify documents from the newgroups20 dataset. Multinomial logistic\nregression yields more accurate results and is faster to train on the larger\nscale dataset.\n\nHere we use the l1 sparsity that trims the weights of not informative\nfeatures to zero. This is good if the goal is to extract the strongly\ndiscriminative vocabulary of each class. If the goal is to get the best\npredictive accuracy, it is better to use the non sparsity-inducing l2 penalty\ninstead.\n\nA more traditional (and possibly better) way to predict on a sparse subset of\ninput features would be to use univariate feature selection followed by a\ntraditional (l2-penalised) logistic regression model.\n"
 ]
 },
 {
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"import timeit\nimport warnings\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.datasets import fetch_20newsgroups_vectorized\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.exceptions import ConvergenceWarning\n\nprint(__doc__)\n# Author: Arthur Mensch\n\nwarnings.filterwarnings(\"ignore\", category=ConvergenceWarning,\n module=\"sklearn\")\nt0 = timeit.default_timer()\n\n# We use SAGA solver\nsolver = 'saga'\n\n# Turn down for faster run time\nn_samples = 10000\n\n# Memorized fetch_rcv1 for faster access\nX, y = fetch_20newsgroups_vectorized('all', return_X_y=True)\nX = X[:n_samples]\ny = y[:n_samples]\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n random_state=42,\n stratify=y,\n test_size=0.1)\ntrain_samples, n_features = X_train.shape\nn_classes = np.unique(y).shape[0]\n\nprint('Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i'\n % (train_samples, n_features, n_classes))\n\nmodels = {'ovr': {'name': 'One versus Rest', 'iters': [1, 2, 4]},\n 'multinomial': {'name': 'Multinomial', 'iters': [1, 3, 7]}}\n\nfor model in models:\n # Add initial chance-level values for plotting purpose\n accuracies = [1 / n_classes]\n times = [0]\n densities = [1]\n\n model_params = models[model]\n\n # Small number of epochs for fast runtime\n for this_max_iter in model_params['iters']:\n print('[model=%s, solver=%s] Number of epochs: %s' %\n (model_params['name'], solver, this_max_iter))\n lr = LogisticRegression(solver=solver,\n multi_class=model,\n penalty='l1',\n max_iter=this_max_iter,\n random_state=42,\n )\n t1 = timeit.default_timer()\n lr.fit(X_train, y_train)\n train_time = timeit.default_timer() - t1\n\n y_pred = lr.predict(X_test)\n accuracy = np.sum(y_pred == y_test) / y_test.shape[0]\n density = np.mean(lr.coef_ != 0, axis=1) * 100\n accuracies.append(accuracy)\n densities.append(density)\n times.append(train_time)\n models[model]['times'] = times\n models[model]['densities'] = densities\n models[model]['accuracies'] = accuracies\n print('Test accuracy for model %s: %.4f' % (model, accuracies[-1]))\n print('%% non-zero coefficients for model %s, '\n 'per class:\\n %s' % (model, densities[-1]))\n print('Run time (%i epochs) for model %s:'\n '%.2f' % (model_params['iters'][-1], model, times[-1]))\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nfor model in models:\n name = models[model]['name']\n times = models[model]['times']\n accuracies = models[model]['accuracies']\n ax.plot(times, accuracies, marker='o',\n label='Model: %s' % name)\n ax.set_xlabel('Train time (s)')\n ax.set_ylabel('Test accuracy')\nax.legend()\nfig.suptitle('Multinomial vs One-vs-Rest Logistic L1\\n'\n 'Dataset %s' % '20newsgroups')\nfig.tight_layout()\nfig.subplots_adjust(top=0.85)\nrun_time = timeit.default_timer() - t0\nprint('Example run in %.3f s' % run_time)\nplt.show()"
+"import timeit\nimport warnings\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.datasets import fetch_20newsgroups_vectorized\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.exceptions import ConvergenceWarning\n\nprint(__doc__)\n# Author: Arthur Mensch\n\nwarnings.filterwarnings(\"ignore\", category=ConvergenceWarning,\n module=\"sklearn\")\nt0 = timeit.default_timer()\n\n# We use SAGA solver\nsolver = 'saga'\n\n# Turn down for faster run time\nn_samples = 10000\n\nX, y = fetch_20newsgroups_vectorized('all', return_X_y=True)\nX = X[:n_samples]\ny = y[:n_samples]\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n random_state=42,\n stratify=y,\n test_size=0.1)\ntrain_samples, n_features = X_train.shape\nn_classes = np.unique(y).shape[0]\n\nprint('Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i'\n % (train_samples, n_features, n_classes))\n\nmodels = {'ovr': {'name': 'One versus Rest', 'iters': [1, 2, 4]},\n 'multinomial': {'name': 'Multinomial', 'iters': [1, 3, 7]}}\n\nfor model in models:\n # Add initial chance-level values for plotting purpose\n accuracies = [1 / n_classes]\n times = [0]\n densities = [1]\n\n model_params = models[model]\n\n # Small number of epochs for fast runtime\n for this_max_iter in model_params['iters']:\n print('[model=%s, solver=%s] Number of epochs: %s' %\n (model_params['name'], solver, this_max_iter))\n lr = LogisticRegression(solver=solver,\n multi_class=model,\n penalty='l1',\n max_iter=this_max_iter,\n random_state=42,\n )\n t1 = timeit.default_timer()\n lr.fit(X_train, y_train)\n train_time = timeit.default_timer() - t1\n\n y_pred = lr.predict(X_test)\n accuracy = np.sum(y_pred == y_test) / y_test.shape[0]\n density = np.mean(lr.coef_ != 0, axis=1) * 100\n accuracies.append(accuracy)\n densities.append(density)\n times.append(train_time)\n models[model]['times'] = times\n models[model]['densities'] = densities\n models[model]['accuracies'] = accuracies\n print('Test accuracy for model %s: %.4f' % (model, accuracies[-1]))\n print('%% non-zero coefficients for model %s, '\n 'per class:\\n %s' % (model, densities[-1]))\n print('Run time (%i epochs) for model %s:'\n '%.2f' % (model_params['iters'][-1], model, times[-1]))\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nfor model in models:\n name = models[model]['name']\n times = models[model]['times']\n accuracies = models[model]['accuracies']\n ax.plot(times, accuracies, marker='o',\n label='Model: %s' % name)\n ax.set_xlabel('Train time (s)')\n ax.set_ylabel('Test accuracy')\nax.legend()\nfig.suptitle('Multinomial vs One-vs-Rest Logistic L1\\n'\n 'Dataset %s' % '20newsgroups')\nfig.tight_layout()\nfig.subplots_adjust(top=0.85)\nrun_time = timeit.default_timer() - t0\nprint('Example run in %.3f s' % run_time)\nplt.show()"
 ]
 }
 ],
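The notebook change above is a title typo fix; the code cell it sits next to compares multi_class='multinomial' against multi_class='ovr', both with penalty='l1' and the saga solver. Below is a minimal sketch of that comparison, not part of the commit: the synthetic dataset, C value, and iteration count are illustrative stand-ins for the real 20newsgroups setup.

# Minimal sketch of the multinomial-vs-OvR L1 comparison from the example
# above. Synthetic data and hyperparameters are illustrative assumptions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Small synthetic stand-in for the sparse 20newsgroups features.
X, y = make_classification(n_samples=500, n_features=100, n_informative=10,
                           n_classes=4, random_state=42)

for multi_class in ('multinomial', 'ovr'):
    # The multi_class parameter matches the sklearn versions this example
    # targets; newer releases deprecate it in favour of multinomial-only.
    lr = LogisticRegression(solver='saga', penalty='l1', C=0.5,
                            multi_class=multi_class, max_iter=200,
                            random_state=42)
    lr.fit(X, y)
    # Fraction of non-zero weights shows how aggressively L1 prunes features.
    density = np.mean(lr.coef_ != 0) * 100
    print('%-11s accuracy=%.3f, density=%.1f%%'
          % (multi_class, lr.score(X, y), density))

On the real data the notebook additionally records per-epoch accuracy and training time to plot the two variants against each other, but the estimator configuration is the same.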

dev/_downloads/e6b0467dcb3937291eefb6297ca0db77/plot_sparse_logistic_regression_20newsgroups.py

Lines changed: 3 additions & 4 deletions
@@ -1,7 +1,7 @@
 """
-=====================================================
-Multiclass sparse logisitic regression on newgroups20
-=====================================================
+====================================================
+Multiclass sparse logistic regression on 20newgroups
+====================================================
 
 Comparison of multinomial logistic L1 vs one-versus-rest L1 logistic regression
 to classify documents from the newgroups20 dataset. Multinomial logistic
@@ -42,7 +42,6 @@
 # Turn down for faster run time
 n_samples = 10000
 
-# Memorized fetch_rcv1 for faster access
 X, y = fetch_20newsgroups_vectorized('all', return_X_y=True)
 X = X[:n_samples]
 y = y[:n_samples]
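The second hunk drops a stale comment: it mentioned fetch_rcv1 even though the script fetches 20newsgroups. For reference, a minimal sketch of the loading pattern the example uses (assumes scikit-learn >= 0.20 for return_X_y, and network access for the first download):

# Sketch of the data-loading pattern shown in the diff above; n_samples
# mirrors the example's "turn down for faster run time" setting.
from sklearn.datasets import fetch_20newsgroups_vectorized

n_samples = 10000

# Returns the pre-vectorized 20newsgroups corpus directly as (X, y).
X, y = fetch_20newsgroups_vectorized(subset='all', return_X_y=True)
X, y = X[:n_samples], y[:n_samples]

# X is a sparse matrix of term features; y holds the 20 class labels.
print(X.shape, len(set(y)))

Since X comes back as a sparse matrix, slicing the first n_samples rows keeps memory use low without re-vectorizing the corpus.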

dev/_downloads/scikit-learn-docs.pdf

Binary file changed (-33.2 KB); contents not shown.

dev/_images/iris.png

Binary image changed (-32 Bytes).