- "from __future__ import print_function\n\nfrom collections import defaultdict\nimport operator\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster.bicluster import SpectralCoclustering\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.datasets.twenty_newsgroups import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\nprint(__doc__)\n\n\ndef number_normalizer(tokens):\n \"\"\" Map all numeric tokens to a placeholder.\n\n For many applications, tokens that begin with a number are not directly\n useful, but the fact that such a token exists can be relevant. By applying\n this form of dimensionality reduction, some methods may perform better.\n \"\"\"\n return (\"#NUMBER\" if token[0].isdigit() else token for token in tokens)\n\n\nclass NumberNormalizingVectorizer(TfidfVectorizer):\n def build_tokenizer(self):\n tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()\n return lambda doc: list(number_normalizer(tokenize(doc)))\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = ['alt.atheism', 'comp.graphics',\n 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n 'comp.windows.x', 'misc.forsale', 'rec.autos',\n 'rec.motorcycles', 'rec.sport.baseball',\n 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',\n 'sci.med', 'sci.space', 'soc.religion.christian',\n 'talk.politics.guns', 'talk.politics.mideast',\n 'talk.politics.misc', 'talk.religion.misc']\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)\ncocluster = SpectralCoclustering(n_clusters=len(categories),\n svd_method='arpack', random_state=0)\nkmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,\n random_state=0)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\"Done in {:.2f}s. V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_cocluster, y_true)))\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\"Done in {:.2f}s. 
V-measure: {:.4f}\".format(\n time() - start_time,\n v_measure_score(y_kmeans, y_true)))\n\nfeature_names = vectorizer.get_feature_names()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n rows, cols = cocluster.get_indices(i)\n if not (np.any(rows) and np.any(cols)):\n import sys\n return sys.float_info.max\n row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n # Note: the following is identical to X[rows[:, np.newaxis],\n # cols].sum() but much faster in scipy <= 0.16\n weight = X[rows][:, cols].sum()\n cut = (X[row_complement][:, cols].sum() +\n X[rows][:, col_complement].sum())\n return cut / weight\n\n\ndef most_common(d):\n \"\"\"Items of a defaultdict(int) with the highest values.\n\n Like Counter.most_common in Python >=2.7.\n \"\"\"\n return sorted(d.items(), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i)\n for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n n_rows, n_cols = cocluster.get_shape(cluster)\n cluster_docs, cluster_words = cocluster.get_indices(cluster)\n if not len(cluster_docs) or not len(cluster_words):\n continue\n\n # categories\n counter = defaultdict(int)\n for i in cluster_docs:\n counter[document_names[i]] += 1\n cat_string = \", \".join(\"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n for name, c in most_common(counter)[:3])\n\n # words\n out_of_cluster_docs = cocluster.row_labels_ != cluster\n out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n word_col = X[:, cluster_words]\n word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -\n word_col[out_of_cluster_docs, :].sum(axis=0))\n word_scores = word_scores.ravel()\n important_words = list(feature_names[cluster_words[i]]\n for i in word_scores.argsort()[:-11:-1])\n\n print(\"bicluster {} : {} documents, {} words\".format(\n idx, n_rows, n_cols))\n print(\"categories : {}\".format(cat_string))\n print(\"words : {}\\n\".format(', '.join(important_words)))"
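
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original example): a quick check of
# what the number-normalizing tokenizer does to a sample sentence.  The
# sentence and the demo_* names are made up for demonstration only; the
# expected output assumes TfidfVectorizer's default token pattern, which
# keeps tokens of two or more word characters and does not lowercase at the
# tokenizer stage.
# ---------------------------------------------------------------------------
demo_vectorizer = NumberNormalizingVectorizer()
demo_tokenize = demo_vectorizer.build_tokenizer()
print(demo_tokenize("The 486DX ran at 33 MHz"))
# expected: ['The', '#NUMBER', 'ran', 'at', '#NUMBER', 'MHz']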