
Commit ad3d029 (1 parent: 6c7ed15)

Pushing the docs to dev/ for branch: master, commit 8f9a027b384e3365131c6fa1575557921251f2d1

703 files changed: +1549, -1558 lines changed


dev/_downloads/388641d133587cc11aa26f2dbef4b950/plot_document_classification_20newsgroups.py

Lines changed: 1 addition & 1 deletion
@@ -248,7 +248,7 @@ def benchmark(clf):
 for clf, name in (
         (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
         (Perceptron(max_iter=50), "Perceptron"),
-        (PassiveAggressiveClassifier(max_iter=50, tol=1e-3),
+        (PassiveAggressiveClassifier(max_iter=50),
          "Passive-Aggressive"),
         (KNeighborsClassifier(n_neighbors=10), "kNN"),
         (RandomForestClassifier(), "Random forest")):
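The only change here is dropping the explicit tol=1e-3 from PassiveAggressiveClassifier. A minimal sketch to check that this matches the estimator's built-in default on the installed scikit-learn version (the 1e-3 value is an assumption about recent releases, not something stated in the diff):

    from sklearn.linear_model import PassiveAggressiveClassifier

    # Inspect the stopping tolerance used when tol is not passed explicitly;
    # on recent scikit-learn releases this is expected to print 0.001.
    clf = PassiveAggressiveClassifier(max_iter=50)
    print(clf.get_params()["tol"])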

dev/_downloads/3b31bf37034a6ece04667cd422e5ff79/plot_document_classification_20newsgroups.ipynb

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dev/_downloads/58e661bc0f16de8cbd6dd7f2c1237745/plot_svm_scale_c.py

Lines changed: 1 addition & 2 deletions
@@ -112,8 +112,7 @@
 clf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
                        tol=1e-3),
              np.logspace(-2.3, -1.3, 10), X_1, y_1),
-            (LinearSVC(penalty='l2', loss='squared_hinge', dual=True,
-                       tol=1e-4),
+            (LinearSVC(penalty='l2', loss='squared_hinge', dual=True),
              np.logspace(-4.5, -2, 10), X_2, y_2)]

 colors = ['navy', 'cyan', 'darkorange']
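The same pattern applies to the l2-penalized LinearSVC: the removed tol=1e-4 is assumed to equal the estimator's default, so the fitted models in this example should be unchanged. A quick sketch to verify that default on your version:

    from sklearn.svm import LinearSVC

    # With tol omitted, the constructor default is stored on the estimator;
    # on recent scikit-learn releases this is expected to print 0.0001.
    clf = LinearSVC(penalty='l2', loss='squared_hinge', dual=True)
    print(clf.tol)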

dev/_downloads/5ba9a28ce0b812d50bf8133494f3bf19/plot_svm_scale_c.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"print(__doc__)\n\n\n# Author: Andreas Mueller <[email protected]>\n# Jaques Grobler <[email protected]>\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.svm import LinearSVC\nfrom sklearn.model_selection import ShuffleSplit\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.utils import check_random_state\nfrom sklearn import datasets\n\nrnd = check_random_state(1)\n\n# set up dataset\nn_samples = 100\nn_features = 300\n\n# l1 data (only 5 informative features)\nX_1, y_1 = datasets.make_classification(n_samples=n_samples,\n n_features=n_features, n_informative=5,\n random_state=1)\n\n# l2 data: non sparse, but less features\ny_2 = np.sign(.5 - rnd.rand(n_samples))\nX_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]\nX_2 += 5 * rnd.randn(n_samples, n_features // 5)\n\nclf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,\n tol=1e-3),\n np.logspace(-2.3, -1.3, 10), X_1, y_1),\n (LinearSVC(penalty='l2', loss='squared_hinge', dual=True,\n tol=1e-4),\n np.logspace(-4.5, -2, 10), X_2, y_2)]\n\ncolors = ['navy', 'cyan', 'darkorange']\nlw = 2\n\nfor clf, cs, X, y in clf_sets:\n # set up the plot for each regressor\n fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))\n\n for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):\n param_grid = dict(C=cs)\n # To get nice curve, we need a large number of iterations to\n # reduce the variance\n grid = GridSearchCV(clf, refit=False, param_grid=param_grid,\n cv=ShuffleSplit(train_size=train_size,\n test_size=.3,\n n_splits=250, random_state=1))\n grid.fit(X, y)\n scores = grid.cv_results_['mean_test_score']\n\n scales = [(1, 'No scaling'),\n ((n_samples * train_size), '1/n_samples'),\n ]\n\n for ax, (scaler, name) in zip(axes, scales):\n ax.set_xlabel('C')\n ax.set_ylabel('CV Score')\n grid_cs = cs * float(scaler) # scale the C's\n ax.semilogx(grid_cs, scores, label=\"fraction %.2f\" %\n train_size, color=colors[k], lw=lw)\n ax.set_title('scaling=%s, penalty=%s, loss=%s' %\n (name, clf.penalty, clf.loss))\n\n plt.legend(loc=\"best\")\nplt.show()"
+
"print(__doc__)\n\n\n# Author: Andreas Mueller <[email protected]>\n# Jaques Grobler <[email protected]>\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.svm import LinearSVC\nfrom sklearn.model_selection import ShuffleSplit\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.utils import check_random_state\nfrom sklearn import datasets\n\nrnd = check_random_state(1)\n\n# set up dataset\nn_samples = 100\nn_features = 300\n\n# l1 data (only 5 informative features)\nX_1, y_1 = datasets.make_classification(n_samples=n_samples,\n n_features=n_features, n_informative=5,\n random_state=1)\n\n# l2 data: non sparse, but less features\ny_2 = np.sign(.5 - rnd.rand(n_samples))\nX_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]\nX_2 += 5 * rnd.randn(n_samples, n_features // 5)\n\nclf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,\n tol=1e-3),\n np.logspace(-2.3, -1.3, 10), X_1, y_1),\n (LinearSVC(penalty='l2', loss='squared_hinge', dual=True),\n np.logspace(-4.5, -2, 10), X_2, y_2)]\n\ncolors = ['navy', 'cyan', 'darkorange']\nlw = 2\n\nfor clf, cs, X, y in clf_sets:\n # set up the plot for each regressor\n fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))\n\n for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):\n param_grid = dict(C=cs)\n # To get nice curve, we need a large number of iterations to\n # reduce the variance\n grid = GridSearchCV(clf, refit=False, param_grid=param_grid,\n cv=ShuffleSplit(train_size=train_size,\n test_size=.3,\n n_splits=250, random_state=1))\n grid.fit(X, y)\n scores = grid.cv_results_['mean_test_score']\n\n scales = [(1, 'No scaling'),\n ((n_samples * train_size), '1/n_samples'),\n ]\n\n for ax, (scaler, name) in zip(axes, scales):\n ax.set_xlabel('C')\n ax.set_ylabel('CV Score')\n grid_cs = cs * float(scaler) # scale the C's\n ax.semilogx(grid_cs, scores, label=\"fraction %.2f\" %\n train_size, color=colors[k], lw=lw)\n ax.set_title('scaling=%s, penalty=%s, loss=%s' %\n (name, clf.penalty, clf.loss))\n\n plt.legend(loc=\"best\")\nplt.show()"
 ]
 }
 ],

dev/_downloads/64866fb2a9398ff657578febcb91d430/plot_mnist_filters.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-
"import matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.neural_network import MLPClassifier\n\nprint(__doc__)\n\n# Load data from https://www.openml.org/d/554\nX, y = fetch_openml('mnist_784', version=1, return_X_y=True)\nX = X / 255.\n\n# rescale the data, use the traditional train/test split\nX_train, X_test = X[:60000], X[60000:]\ny_train, y_test = y[:60000], y[60000:]\n\n# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,\n# solver='sgd', verbose=10, tol=1e-4, random_state=1)\nmlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,\n solver='sgd', verbose=10, tol=1e-4, random_state=1,\n learning_rate_init=.1)\n\nmlp.fit(X_train, y_train)\nprint(\"Training set score: %f\" % mlp.score(X_train, y_train))\nprint(\"Test set score: %f\" % mlp.score(X_test, y_test))\n\nfig, axes = plt.subplots(4, 4)\n# use global min / max to ensure all weights are shown on the same scale\nvmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()\nfor coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):\n ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,\n vmax=.5 * vmax)\n ax.set_xticks(())\n ax.set_yticks(())\n\nplt.show()"
+
"import matplotlib.pyplot as plt\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.neural_network import MLPClassifier\n\nprint(__doc__)\n\n# Load data from https://www.openml.org/d/554\nX, y = fetch_openml('mnist_784', version=1, return_X_y=True)\nX = X / 255.\n\n# rescale the data, use the traditional train/test split\nX_train, X_test = X[:60000], X[60000:]\ny_train, y_test = y[:60000], y[60000:]\n\nmlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,\n solver='sgd', verbose=10, random_state=1,\n learning_rate_init=.1)\n\nmlp.fit(X_train, y_train)\nprint(\"Training set score: %f\" % mlp.score(X_train, y_train))\nprint(\"Test set score: %f\" % mlp.score(X_test, y_test))\n\nfig, axes = plt.subplots(4, 4)\n# use global min / max to ensure all weights are shown on the same scale\nvmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()\nfor coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):\n ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,\n vmax=.5 * vmax)\n ax.set_xticks(())\n ax.set_yticks(())\n\nplt.show()"
 ]
 }
 ],

dev/_downloads/80692cf167e9ea27b27e5bd144159c82/plot_out_of_core_classification.py

Lines changed: 1 addition & 1 deletion
@@ -210,7 +210,7 @@ def progress(blocknum, bs, size):
     'SGD': SGDClassifier(max_iter=5),
     'Perceptron': Perceptron(),
     'NB Multinomial': MultinomialNB(alpha=0.01),
-    'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),
+    'Passive-Aggressive': PassiveAggressiveClassifier(),
 }

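In this example the classifier is trained incrementally with partial_fit on mini-batches rather than with fit, and the tol-based stopping criterion applies to fit's epoch loop rather than to single partial_fit calls, so dropping tol=1e-3 should not affect the out-of-core run. A toy sketch of that incremental API on random data (not the Reuters stream used by the example):

    import numpy as np
    from sklearn.linear_model import PassiveAggressiveClassifier

    rng = np.random.RandomState(0)
    clf = PassiveAggressiveClassifier()
    classes = np.array([0, 1])

    # Each partial_fit call performs a single pass over the mini-batch,
    # so no tol-based early stopping is involved.
    for _ in range(3):
        X_batch = rng.randn(100, 20)
        y_batch = rng.randint(0, 2, size=100)
        clf.partial_fit(X_batch, y_batch, classes=classes)

    print(clf.score(X_batch, y_batch))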
dev/_downloads/9b3be64651591413a73d3848e0317ffd/plot_mnist_filters.py

Lines changed: 1 addition & 3 deletions
@@ -34,10 +34,8 @@
 X_train, X_test = X[:60000], X[60000:]
 y_train, y_test = y[:60000], y[60000:]

-# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
-#                     solver='sgd', verbose=10, tol=1e-4, random_state=1)
 mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
-                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
+                    solver='sgd', verbose=10, random_state=1,
                     learning_rate_init=.1)

 mlp.fit(X_train, y_train)
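Besides dropping the commented-out alternative configuration, the only change is removing the explicit tol=1e-4, assumed here to match MLPClassifier's default. A minimal sketch to check that default on the installed version:

    from sklearn.neural_network import MLPClassifier

    # With tol omitted, the constructor default applies; on recent
    # scikit-learn releases this is expected to print 0.0001.
    mlp = MLPClassifier(hidden_layer_sizes=(50,), solver='sgd')
    print(mlp.tol)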

dev/_downloads/b86db3a111b621a7beeaa9d099608e5b/plot_out_of_core_classification.ipynb

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@
 },
 "outputs": [],
 "source": [
-
"vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,\n alternate_sign=False)\n\n\n# Iterator over parsed Reuters SGML files.\ndata_stream = stream_reuters_documents()\n\n# We learn a binary classification between the \"acq\" class and all the others.\n# \"acq\" was chosen as it is more or less evenly distributed in the Reuters\n# files. For other datasets, one should take care of creating a test set with\n# a realistic portion of positive instances.\nall_classes = np.array([0, 1])\npositive_class = 'acq'\n\n# Here are some classifiers that support the `partial_fit` method\npartial_fit_classifiers = {\n 'SGD': SGDClassifier(max_iter=5),\n 'Perceptron': Perceptron(),\n 'NB Multinomial': MultinomialNB(alpha=0.01),\n 'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),\n}\n\n\ndef get_minibatch(doc_iter, size, pos_class=positive_class):\n \"\"\"Extract a minibatch of examples, return a tuple X_text, y.\n\n Note: size is before excluding invalid docs with no topics assigned.\n\n \"\"\"\n data = [('{title}\\n\\n{body}'.format(**doc), pos_class in doc['topics'])\n for doc in itertools.islice(doc_iter, size)\n if doc['topics']]\n if not len(data):\n return np.asarray([], dtype=int), np.asarray([], dtype=int)\n X_text, y = zip(*data)\n return X_text, np.asarray(y, dtype=int)\n\n\ndef iter_minibatches(doc_iter, minibatch_size):\n \"\"\"Generator of minibatches.\"\"\"\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n while len(X_text):\n yield X_text, y\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n\n\n# test data statistics\ntest_stats = {'n_test': 0, 'n_test_pos': 0}\n\n# First we hold out a number of examples to estimate accuracy\nn_test_documents = 1000\ntick = time.time()\nX_test_text, y_test = get_minibatch(data_stream, 1000)\nparsing_time = time.time() - tick\ntick = time.time()\nX_test = vectorizer.transform(X_test_text)\nvectorizing_time = time.time() - tick\ntest_stats['n_test'] += len(y_test)\ntest_stats['n_test_pos'] += sum(y_test)\nprint(\"Test set is %d documents (%d positive)\" % (len(y_test), sum(y_test)))\n\n\ndef progress(cls_name, stats):\n \"\"\"Report progress information, return a string.\"\"\"\n duration = time.time() - stats['t0']\n s = \"%20s classifier : \\t\" % cls_name\n s += \"%(n_train)6d train docs (%(n_train_pos)6d positive) \" % stats\n s += \"%(n_test)6d test docs (%(n_test_pos)6d positive) \" % test_stats\n s += \"accuracy: %(accuracy).3f \" % stats\n s += \"in %.2fs (%5d docs/s)\" % (duration, stats['n_train'] / duration)\n return s\n\n\ncls_stats = {}\n\nfor cls_name in partial_fit_classifiers:\n stats = {'n_train': 0, 'n_train_pos': 0,\n 'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),\n 'runtime_history': [(0, 0)], 'total_fit_time': 0.0}\n cls_stats[cls_name] = stats\n\nget_minibatch(data_stream, n_test_documents)\n# Discard test set\n\n# We will feed the classifier with mini-batches of 1000 documents; this means\n# we have at most 1000 docs in memory at any time. 
The smaller the document\n# batch, the bigger the relative overhead of the partial fit methods.\nminibatch_size = 1000\n\n# Create the data_stream that parses Reuters SGML files and iterates on\n# documents as a stream.\nminibatch_iterators = iter_minibatches(data_stream, minibatch_size)\ntotal_vect_time = 0.0\n\n# Main loop : iterate on mini-batches of examples\nfor i, (X_train_text, y_train) in enumerate(minibatch_iterators):\n\n tick = time.time()\n X_train = vectorizer.transform(X_train_text)\n total_vect_time += time.time() - tick\n\n for cls_name, cls in partial_fit_classifiers.items():\n tick = time.time()\n # update estimator with examples in the current mini-batch\n cls.partial_fit(X_train, y_train, classes=all_classes)\n\n # accumulate test accuracy stats\n cls_stats[cls_name]['total_fit_time'] += time.time() - tick\n cls_stats[cls_name]['n_train'] += X_train.shape[0]\n cls_stats[cls_name]['n_train_pos'] += sum(y_train)\n tick = time.time()\n cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)\n cls_stats[cls_name]['prediction_time'] = time.time() - tick\n acc_history = (cls_stats[cls_name]['accuracy'],\n cls_stats[cls_name]['n_train'])\n cls_stats[cls_name]['accuracy_history'].append(acc_history)\n run_history = (cls_stats[cls_name]['accuracy'],\n total_vect_time + cls_stats[cls_name]['total_fit_time'])\n cls_stats[cls_name]['runtime_history'].append(run_history)\n\n if i % 3 == 0:\n print(progress(cls_name, cls_stats[cls_name]))\n if i % 3 == 0:\n print('\\n')"
+
"vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,\n alternate_sign=False)\n\n\n# Iterator over parsed Reuters SGML files.\ndata_stream = stream_reuters_documents()\n\n# We learn a binary classification between the \"acq\" class and all the others.\n# \"acq\" was chosen as it is more or less evenly distributed in the Reuters\n# files. For other datasets, one should take care of creating a test set with\n# a realistic portion of positive instances.\nall_classes = np.array([0, 1])\npositive_class = 'acq'\n\n# Here are some classifiers that support the `partial_fit` method\npartial_fit_classifiers = {\n 'SGD': SGDClassifier(max_iter=5),\n 'Perceptron': Perceptron(),\n 'NB Multinomial': MultinomialNB(alpha=0.01),\n 'Passive-Aggressive': PassiveAggressiveClassifier(),\n}\n\n\ndef get_minibatch(doc_iter, size, pos_class=positive_class):\n \"\"\"Extract a minibatch of examples, return a tuple X_text, y.\n\n Note: size is before excluding invalid docs with no topics assigned.\n\n \"\"\"\n data = [('{title}\\n\\n{body}'.format(**doc), pos_class in doc['topics'])\n for doc in itertools.islice(doc_iter, size)\n if doc['topics']]\n if not len(data):\n return np.asarray([], dtype=int), np.asarray([], dtype=int)\n X_text, y = zip(*data)\n return X_text, np.asarray(y, dtype=int)\n\n\ndef iter_minibatches(doc_iter, minibatch_size):\n \"\"\"Generator of minibatches.\"\"\"\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n while len(X_text):\n yield X_text, y\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n\n\n# test data statistics\ntest_stats = {'n_test': 0, 'n_test_pos': 0}\n\n# First we hold out a number of examples to estimate accuracy\nn_test_documents = 1000\ntick = time.time()\nX_test_text, y_test = get_minibatch(data_stream, 1000)\nparsing_time = time.time() - tick\ntick = time.time()\nX_test = vectorizer.transform(X_test_text)\nvectorizing_time = time.time() - tick\ntest_stats['n_test'] += len(y_test)\ntest_stats['n_test_pos'] += sum(y_test)\nprint(\"Test set is %d documents (%d positive)\" % (len(y_test), sum(y_test)))\n\n\ndef progress(cls_name, stats):\n \"\"\"Report progress information, return a string.\"\"\"\n duration = time.time() - stats['t0']\n s = \"%20s classifier : \\t\" % cls_name\n s += \"%(n_train)6d train docs (%(n_train_pos)6d positive) \" % stats\n s += \"%(n_test)6d test docs (%(n_test_pos)6d positive) \" % test_stats\n s += \"accuracy: %(accuracy).3f \" % stats\n s += \"in %.2fs (%5d docs/s)\" % (duration, stats['n_train'] / duration)\n return s\n\n\ncls_stats = {}\n\nfor cls_name in partial_fit_classifiers:\n stats = {'n_train': 0, 'n_train_pos': 0,\n 'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),\n 'runtime_history': [(0, 0)], 'total_fit_time': 0.0}\n cls_stats[cls_name] = stats\n\nget_minibatch(data_stream, n_test_documents)\n# Discard test set\n\n# We will feed the classifier with mini-batches of 1000 documents; this means\n# we have at most 1000 docs in memory at any time. 
The smaller the document\n# batch, the bigger the relative overhead of the partial fit methods.\nminibatch_size = 1000\n\n# Create the data_stream that parses Reuters SGML files and iterates on\n# documents as a stream.\nminibatch_iterators = iter_minibatches(data_stream, minibatch_size)\ntotal_vect_time = 0.0\n\n# Main loop : iterate on mini-batches of examples\nfor i, (X_train_text, y_train) in enumerate(minibatch_iterators):\n\n tick = time.time()\n X_train = vectorizer.transform(X_train_text)\n total_vect_time += time.time() - tick\n\n for cls_name, cls in partial_fit_classifiers.items():\n tick = time.time()\n # update estimator with examples in the current mini-batch\n cls.partial_fit(X_train, y_train, classes=all_classes)\n\n # accumulate test accuracy stats\n cls_stats[cls_name]['total_fit_time'] += time.time() - tick\n cls_stats[cls_name]['n_train'] += X_train.shape[0]\n cls_stats[cls_name]['n_train_pos'] += sum(y_train)\n tick = time.time()\n cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)\n cls_stats[cls_name]['prediction_time'] = time.time() - tick\n acc_history = (cls_stats[cls_name]['accuracy'],\n cls_stats[cls_name]['n_train'])\n cls_stats[cls_name]['accuracy_history'].append(acc_history)\n run_history = (cls_stats[cls_name]['accuracy'],\n total_vect_time + cls_stats[cls_name]['total_fit_time'])\n cls_stats[cls_name]['runtime_history'].append(run_history)\n\n if i % 3 == 0:\n print(progress(cls_name, cls_stats[cls_name]))\n if i % 3 == 0:\n print('\\n')"
 ]
 },
 {