"# Author: Olivier Grisel <
[email protected]>\n# License: BSD 3 clause\n\nfrom bz2 import BZ2File\nimport os\nfrom datetime import datetime\nfrom pprint import pprint\nfrom time import time\n\nimport numpy as np\n\nfrom scipy import sparse\n\nfrom joblib import Memory\n\nfrom sklearn.decomposition import randomized_svd\nfrom urllib.request import urlopen\n\n\nprint(__doc__)\n\n# #############################################################################\n# Where to download the data, if not already on disk\nredirects_url = \"http://downloads.dbpedia.org/3.5.1/en/redirects_en.nt.bz2\"\nredirects_filename = redirects_url.rsplit(\"/\", 1)[1]\n\npage_links_url = \"http://downloads.dbpedia.org/3.5.1/en/page_links_en.nt.bz2\"\npage_links_filename = page_links_url.rsplit(\"/\", 1)[1]\n\nresources = [\n (redirects_url, redirects_filename),\n (page_links_url, page_links_filename),\n]\n\nfor url, filename in resources:\n if not os.path.exists(filename):\n print(\"Downloading data from '%s', please wait...\" % url)\n opener = urlopen(url)\n open(filename, 'wb').write(opener.read())\n print()\n\n\n# #############################################################################\n# Loading the redirect files\n\nmemory = Memory(cachedir=\".\")\n\n\ndef index(redirects, index_map, k):\n \"\"\"Find the index of an article name after redirect resolution\"\"\"\n k = redirects.get(k, k)\n return index_map.setdefault(k, len(index_map))\n\n\nDBPEDIA_RESOURCE_PREFIX_LEN = len(\"http://dbpedia.org/resource/\")\nSHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1)\n\n\ndef short_name(nt_uri):\n \"\"\"Remove the < and > URI markers and the common URI prefix\"\"\"\n return nt_uri[SHORTNAME_SLICE]\n\n\ndef get_redirects(redirects_filename):\n \"\"\"Parse the redirections and build a transitively closed map out of it\"\"\"\n redirects = {}\n print(\"Parsing the NT redirect file\")\n for l, line in enumerate(BZ2File(redirects_filename)):\n split = line.split()\n if len(split) != 4:\n print(\"ignoring malformed line: \" + line)\n continue\n redirects[short_name(split[0])] = short_name(split[2])\n if l % 1000000 == 0:\n print(\"[%s] line: %08d\" % (datetime.now().isoformat(), l))\n\n # compute the transitive closure\n print(\"Computing the transitive closure of the redirect relation\")\n for l, source in enumerate(redirects.keys()):\n transitive_target = None\n target = redirects[source]\n seen = {source}\n while True:\n transitive_target = target\n target = redirects.get(target)\n if target is None or target in seen:\n break\n seen.add(target)\n redirects[source] = transitive_target\n if l % 1000000 == 0:\n print(\"[%s] line: %08d\" % (datetime.now().isoformat(), l))\n\n return redirects\n\n\n# disabling joblib as the pickling of large dicts seems much too slow\n#@memory.cache\ndef get_adjacency_matrix(redirects_filename, page_links_filename, limit=None):\n \"\"\"Extract the adjacency graph as a scipy sparse matrix\n\n Redirects are resolved first.\n\n Returns X, the scipy sparse adjacency matrix, redirects as python\n dict from article names to article names and index_map a python dict\n from article names to python int (article indexes).\n \"\"\"\n\n print(\"Computing the redirect map\")\n redirects = get_redirects(redirects_filename)\n\n print(\"Computing the integer index map\")\n index_map = dict()\n links = list()\n for l, line in enumerate(BZ2File(page_links_filename)):\n split = line.split()\n if len(split) != 4:\n print(\"ignoring malformed line: \" + line)\n continue\n i = index(redirects, index_map, short_name(split[0]))\n j = 


def get_redirects(redirects_filename):
    """Parse the redirections and build a transitively closed map out of it"""
    redirects = {}
    print("Parsing the NT redirect file")
    for l, line in enumerate(BZ2File(redirects_filename)):
        split = line.split()
        if len(split) != 4:
            print("ignoring malformed line: %r" % line)
            continue
        redirects[short_name(split[0])] = short_name(split[2])
        if l % 1000000 == 0:
            print("[%s] line: %08d" % (datetime.now().isoformat(), l))

    # compute the transitive closure
    print("Computing the transitive closure of the redirect relation")
    for l, source in enumerate(redirects.keys()):
        transitive_target = None
        target = redirects[source]
        seen = {source}
        while True:
            transitive_target = target
            target = redirects.get(target)
            if target is None or target in seen:
                break
            seen.add(target)
        redirects[source] = transitive_target
        if l % 1000000 == 0:
            print("[%s] line: %08d" % (datetime.now().isoformat(), l))

    return redirects


# disabling joblib as the pickling of large dicts seems much too slow
# @memory.cache
def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None):
    """Extract the adjacency graph as a scipy sparse matrix

    Redirects are resolved first.

    Returns X, the scipy sparse adjacency matrix, redirects as a python
    dict from article names to article names, and index_map, a python dict
    from article names to python int (article indexes).
    """

    print("Computing the redirect map")
    redirects = get_redirects(redirects_filename)

    print("Computing the integer index map")
    index_map = dict()
    links = list()
    for l, line in enumerate(BZ2File(page_links_filename)):
        split = line.split()
        if len(split) != 4:
            print("ignoring malformed line: %r" % line)
            continue
        i = index(redirects, index_map, short_name(split[0]))
        j = index(redirects, index_map, short_name(split[2]))
        links.append((i, j))
        if l % 1000000 == 0:
            print("[%s] line: %08d" % (datetime.now().isoformat(), l))

        if limit is not None and l >= limit - 1:
            break

    print("Computing the adjacency matrix")
    X = sparse.lil_matrix((len(index_map), len(index_map)), dtype=np.float32)
    for i, j in links:
        X[i, j] = 1.0
    del links
    print("Converting to CSR representation")
    X = X.tocsr()
    print("CSR conversion done")
    return X, redirects, index_map


# stop after 5M links to make it possible to work in RAM
X, redirects, index_map = get_adjacency_matrix(
    redirects_filename, page_links_filename, limit=5000000)
names = {i: name for name, i in index_map.items()}

print("Computing the principal singular vectors using randomized_svd")
t0 = time()
U, s, V = randomized_svd(X, 5, n_iter=3)
print("done in %0.3fs" % (time() - t0))

# print the names of the wikipedia related strongest components of the
# principal singular vector which should be similar to the highest eigenvector
print("Top wikipedia pages according to principal singular vectors")
pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]])
pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]])
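
# Note on the call above (based on the documented behaviour of
# sklearn.decomposition.randomized_svd): with n_components=5 it returns
# U with shape (n_pages, 5), the singular values s with shape (5,), and
# V with shape (5, n_pages). The first column of U (U.T[0]) and the first
# row of V therefore hold the strongest left/right singular components,
# which is why they are used for the rankings printed just above.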
[email protected]>\n Dan Schult <
[email protected]>\n Pieter Swart <
[email protected]>\n \"\"\"\n n = X.shape[0]\n X = X.copy()\n incoming_counts = np.asarray(X.sum(axis=1)).ravel()\n\n print(\"Normalizing the graph\")\n for i in incoming_counts.nonzero()[0]:\n X.data[X.indptr[i]:X.indptr[i + 1]] *= 1.0 / incoming_counts[i]\n dangle = np.asarray(np.where(np.isclose(X.sum(axis=1), 0),\n 1.0 / n, 0)).ravel()\n\n scores = np.full(n, 1. / n, dtype=np.float32) # initial guess\n for i in range(max_iter):\n print(\"power iteration #%d\" % i)\n prev_scores = scores\n scores = (alpha * (scores * X + np.dot(dangle, prev_scores))\n + (1 - alpha) * prev_scores.sum() / n)\n # check convergence: normalized l_inf norm\n scores_max = np.abs(scores).max()\n if scores_max == 0.0:\n scores_max = 1.0\n err = np.abs(scores - prev_scores).max() / scores_max\n print(\"error: %0.6f\" % err)\n if err < n * tol:\n return scores\n\n return scores\n\nprint(\"Computing principal eigenvector score using a power iteration method\")\nt0 = time()\nscores = centrality_scores(X, max_iter=100)\nprint(\"done in %0.3fs\" % (time() - t0))\npprint([names[i] for i in np.abs(scores).argsort()[-10:]])"