Commit e62790c

Pushing the docs to _pst_preview/ for branch: new_web_theme, commit 19196ca625ed40aaa370fa75058e18ae77f01cce
1 parent e86b0f3 commit e62790c

File tree

1,279 files changed: +7589 −7087 lines


_pst_preview/.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 6e039326ae97414d0046aae34cf6dd1e
+config: 18d37eb98ca80365ed9983fc53edb534
 tags: 645f666f9bcd5a90fca523b33c5a78b7
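
The "config:" value is a hash of the Sphinx build configuration: as the comment in the file says, when the stored hash no longer matches (or the file is missing), Sphinx falls back to a full rebuild rather than an incremental one. An illustrative sketch of the idea, assuming a simple sorted-key serialization (this is not Sphinx's actual algorithm, and config_hash is a made-up name):

from hashlib import md5

def config_hash(options):
    # Hash a stable serialization of the settings that affect build output;
    # any change produces a new digest, invalidating the incremental cache.
    serialized = ",".join("%s=%r" % (k, options[k]) for k in sorted(options))
    return md5(serialized.encode("utf-8")).hexdigest()

print(config_hash({"html_theme": "pydata_sphinx_theme", "language": "en"}))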

_pst_preview/_downloads/c08598f3ffe66017f7cad294026ee0b9/plot_out_of_core_classification.ipynb

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@
 },
 "outputs": [],
 "source": [
-"class ReutersParser(HTMLParser):\n \"\"\"Utility class to parse a SGML file and yield documents one at a time.\"\"\"\n\n def __init__(self, encoding=\"latin-1\"):\n HTMLParser.__init__(self)\n self._reset()\n self.encoding = encoding\n\n def handle_starttag(self, tag, attrs):\n method = \"start_\" + tag\n getattr(self, method, lambda x: None)(attrs)\n\n def handle_endtag(self, tag):\n method = \"end_\" + tag\n getattr(self, method, lambda: None)()\n\n def _reset(self):\n self.in_title = 0\n self.in_body = 0\n self.in_topics = 0\n self.in_topic_d = 0\n self.title = \"\"\n self.body = \"\"\n self.topics = []\n self.topic_d = \"\"\n\n def parse(self, fd):\n self.docs = []\n for chunk in fd:\n self.feed(chunk.decode(self.encoding))\n for doc in self.docs:\n yield doc\n self.docs = []\n self.close()\n\n def handle_data(self, data):\n if self.in_body:\n self.body += data\n elif self.in_title:\n self.title += data\n elif self.in_topic_d:\n self.topic_d += data\n\n def start_reuters(self, attributes):\n pass\n\n def end_reuters(self):\n self.body = re.sub(r\"\\s+\", r\" \", self.body)\n self.docs.append(\n {\"title\": self.title, \"body\": self.body, \"topics\": self.topics}\n )\n self._reset()\n\n def start_title(self, attributes):\n self.in_title = 1\n\n def end_title(self):\n self.in_title = 0\n\n def start_body(self, attributes):\n self.in_body = 1\n\n def end_body(self):\n self.in_body = 0\n\n def start_topics(self, attributes):\n self.in_topics = 1\n\n def end_topics(self):\n self.in_topics = 0\n\n def start_d(self, attributes):\n self.in_topic_d = 1\n\n def end_d(self):\n self.in_topic_d = 0\n self.topics.append(self.topic_d)\n self.topic_d = \"\"\n\n\ndef stream_reuters_documents(data_path=None):\n \"\"\"Iterate over documents of the Reuters dataset.\n\n The Reuters archive will automatically be downloaded and uncompressed if\n the `data_path` directory does not exist.\n\n Documents are represented as dictionaries with 'body' (str),\n 'title' (str), 'topics' (list(str)) keys.\n\n \"\"\"\n\n DOWNLOAD_URL = (\n \"http://archive.ics.uci.edu/ml/machine-learning-databases/\"\n \"reuters21578-mld/reuters21578.tar.gz\"\n )\n ARCHIVE_SHA256 = \"3bae43c9b14e387f76a61b6d82bf98a4fb5d3ef99ef7e7075ff2ccbcf59f9d30\"\n ARCHIVE_FILENAME = \"reuters21578.tar.gz\"\n\n if data_path is None:\n data_path = Path(get_data_home()) / \"reuters\"\n else:\n data_path = Path(data_path)\n if not data_path.exists():\n \"\"\"Download the dataset.\"\"\"\n print(\"downloading dataset (once and for all) into %s\" % data_path)\n data_path.mkdir(parents=True, exist_ok=True)\n\n def progress(blocknum, bs, size):\n total_sz_mb = \"%.2f MB\" % (size / 1e6)\n current_sz_mb = \"%.2f MB\" % ((blocknum * bs) / 1e6)\n if _not_in_sphinx():\n sys.stdout.write(\"\\rdownloaded %s / %s\" % (current_sz_mb, total_sz_mb))\n\n archive_path = data_path / ARCHIVE_FILENAME\n\n urlretrieve(DOWNLOAD_URL, filename=archive_path, reporthook=progress)\n if _not_in_sphinx():\n sys.stdout.write(\"\\r\")\n\n # Check that the archive was not tampered:\n assert sha256(archive_path.read_bytes()).hexdigest() == ARCHIVE_SHA256\n\n print(\"untarring Reuters dataset...\")\n tarfile.open(archive_path, \"r:gz\").extractall(data_path)\n print(\"done.\")\n\n parser = ReutersParser()\n for filename in data_path.glob(\"*.sgm\"):\n for doc in parser.parse(open(filename, \"rb\")):\n yield doc"
+"class ReutersParser(HTMLParser):\n \"\"\"Utility class to parse a SGML file and yield documents one at a time.\"\"\"\n\n def __init__(self, encoding=\"latin-1\"):\n HTMLParser.__init__(self)\n self._reset()\n self.encoding = encoding\n\n def handle_starttag(self, tag, attrs):\n method = \"start_\" + tag\n getattr(self, method, lambda x: None)(attrs)\n\n def handle_endtag(self, tag):\n method = \"end_\" + tag\n getattr(self, method, lambda: None)()\n\n def _reset(self):\n self.in_title = 0\n self.in_body = 0\n self.in_topics = 0\n self.in_topic_d = 0\n self.title = \"\"\n self.body = \"\"\n self.topics = []\n self.topic_d = \"\"\n\n def parse(self, fd):\n self.docs = []\n for chunk in fd:\n self.feed(chunk.decode(self.encoding))\n for doc in self.docs:\n yield doc\n self.docs = []\n self.close()\n\n def handle_data(self, data):\n if self.in_body:\n self.body += data\n elif self.in_title:\n self.title += data\n elif self.in_topic_d:\n self.topic_d += data\n\n def start_reuters(self, attributes):\n pass\n\n def end_reuters(self):\n self.body = re.sub(r\"\\s+\", r\" \", self.body)\n self.docs.append(\n {\"title\": self.title, \"body\": self.body, \"topics\": self.topics}\n )\n self._reset()\n\n def start_title(self, attributes):\n self.in_title = 1\n\n def end_title(self):\n self.in_title = 0\n\n def start_body(self, attributes):\n self.in_body = 1\n\n def end_body(self):\n self.in_body = 0\n\n def start_topics(self, attributes):\n self.in_topics = 1\n\n def end_topics(self):\n self.in_topics = 0\n\n def start_d(self, attributes):\n self.in_topic_d = 1\n\n def end_d(self):\n self.in_topic_d = 0\n self.topics.append(self.topic_d)\n self.topic_d = \"\"\n\n\ndef stream_reuters_documents(data_path=None):\n \"\"\"Iterate over documents of the Reuters dataset.\n\n The Reuters archive will automatically be downloaded and uncompressed if\n the `data_path` directory does not exist.\n\n Documents are represented as dictionaries with 'body' (str),\n 'title' (str), 'topics' (list(str)) keys.\n\n \"\"\"\n\n DOWNLOAD_URL = (\n \"http://archive.ics.uci.edu/ml/machine-learning-databases/\"\n \"reuters21578-mld/reuters21578.tar.gz\"\n )\n ARCHIVE_SHA256 = \"3bae43c9b14e387f76a61b6d82bf98a4fb5d3ef99ef7e7075ff2ccbcf59f9d30\"\n ARCHIVE_FILENAME = \"reuters21578.tar.gz\"\n\n if data_path is None:\n data_path = Path(get_data_home()) / \"reuters\"\n else:\n data_path = Path(data_path)\n if not data_path.exists():\n \"\"\"Download the dataset.\"\"\"\n print(\"downloading dataset (once and for all) into %s\" % data_path)\n data_path.mkdir(parents=True, exist_ok=True)\n\n def progress(blocknum, bs, size):\n total_sz_mb = \"%.2f MB\" % (size / 1e6)\n current_sz_mb = \"%.2f MB\" % ((blocknum * bs) / 1e6)\n if _not_in_sphinx():\n sys.stdout.write(\"\\rdownloaded %s / %s\" % (current_sz_mb, total_sz_mb))\n\n archive_path = data_path / ARCHIVE_FILENAME\n\n urlretrieve(DOWNLOAD_URL, filename=archive_path, reporthook=progress)\n if _not_in_sphinx():\n sys.stdout.write(\"\\r\")\n\n # Check that the archive was not tampered:\n assert sha256(archive_path.read_bytes()).hexdigest() == ARCHIVE_SHA256\n\n print(\"untarring Reuters dataset...\")\n with tarfile.open(archive_path, \"r:gz\") as fp:\n fp.extractall(data_path, filter=\"data\")\n print(\"done.\")\n\n parser = ReutersParser()\n for filename in data_path.glob(\"*.sgm\"):\n for doc in parser.parse(open(filename, \"rb\")):\n yield doc"
 ]
 },
 {
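
The notebook's only change is the tarfile extraction, shown again in readable form in the .py diff below. Incidentally, the ReutersParser class in this source uses HTMLParser's dynamic-dispatch idiom: handle_starttag and handle_endtag build a method name from the tag ("start_" + tag / "end_" + tag) and look it up with getattr, defaulting to a no-op for tags they do not handle. A minimal self-contained sketch of that idiom (class and tag handlers here are illustrative, not part of the example):

from html.parser import HTMLParser

class TagDispatcher(HTMLParser):
    # Route each tag to an optional start_<tag>/end_<tag> handler;
    # unknown tags fall through to a do-nothing lambda.
    def handle_starttag(self, tag, attrs):
        getattr(self, "start_" + tag, lambda a: None)(attrs)

    def handle_endtag(self, tag):
        getattr(self, "end_" + tag, lambda: None)()

    def start_title(self, attrs):
        print("entering <title>")

    def end_title(self):
        print("leaving </title>")

TagDispatcher().feed("<title>Hello</title><p>ignored</p>")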

_pst_preview/_downloads/f7c999465d2f8d68e0c04bec778aa48e/plot_out_of_core_classification.py

Lines changed: 2 additions & 1 deletion
@@ -175,7 +175,8 @@ def progress(blocknum, bs, size):
     assert sha256(archive_path.read_bytes()).hexdigest() == ARCHIVE_SHA256
 
     print("untarring Reuters dataset...")
-    tarfile.open(archive_path, "r:gz").extractall(data_path)
+    with tarfile.open(archive_path, "r:gz") as fp:
+        fp.extractall(data_path, filter="data")
     print("done.")
 
     parser = ReutersParser()
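
The change itself is worth a note: the old one-liner left closing the archive to the garbage collector and extracted members unconditionally. The new code closes the archive via a context manager and passes filter="data", the extraction filter from PEP 706 (available on Python 3.12+ and backported to bugfix releases of older branches), which rejects unsafe members such as absolute paths, ".." traversal, and device files. A minimal sketch of the same pattern with a fallback for interpreters that predate extraction filters (safe_extract is an illustrative name, not part of the example):

import tarfile

def safe_extract(archive_path, dest):
    # Close the archive deterministically and sanitize members on extract.
    with tarfile.open(archive_path, "r:gz") as tar:
        try:
            tar.extractall(dest, filter="data")  # PEP 706 extraction filter
        except TypeError:
            # Older Python: no `filter` keyword; extract without sanitizing.
            tar.extractall(dest)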

_pst_preview/_images/probabl.png

Binary file changed (19.7 KB); further binary size deltas in this commit: −342 B, −227 B, −74 B, −371 B.
