
Commit 039e6a7

Pushing the docs to dev/ for branch: main, commit 50b818e9fbc5e2345a0610a687b89a8badeaa046
1 parent: c466e92


1,222 files changed: +4,603 additions, −4,234 deletions


dev/_downloads/c08598f3ffe66017f7cad294026ee0b9/plot_out_of_core_classification.ipynb

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Authors: Eustache Diemert <[email protected]>\n# @FedericoV <https://github.com/FedericoV/>\n# License: BSD 3 clause\n\nfrom glob import glob\nimport itertools\nimport os.path\nimport re\nimport tarfile\nimport time\nimport sys\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib import rcParams\n\nfrom html.parser import HTMLParser\nfrom urllib.request import urlretrieve\nfrom sklearn.datasets import get_data_home\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.naive_bayes import MultinomialNB\n\n\ndef _not_in_sphinx():\n    # Hack to detect whether we are running by the sphinx builder\n    return \"__file__\" in globals()"
+"# Authors: Eustache Diemert <[email protected]>\n# @FedericoV <https://github.com/FedericoV/>\n# License: BSD 3 clause\n\nimport itertools\nfrom pathlib import Path\nfrom hashlib import sha256\nimport re\nimport tarfile\nimport time\nimport sys\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib import rcParams\n\nfrom html.parser import HTMLParser\nfrom urllib.request import urlretrieve\nfrom sklearn.datasets import get_data_home\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.naive_bayes import MultinomialNB\n\n\ndef _not_in_sphinx():\n    # Hack to detect whether we are running by the sphinx builder\n    return \"__file__\" in globals()"
 ]
 },
 {
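
The import hunk above swaps the glob/os.path pair for pathlib.Path and adds hashlib.sha256, which the second hunk uses to verify the downloaded archive. A minimal sketch of the equivalent calls before and after (the data_path value is an illustrative placeholder, not from the commit):

from glob import glob
import os.path
from pathlib import Path

data_path = "/tmp/reuters"  # illustrative placeholder, not from the commit

# Old style, as removed by this commit:
archive_old = os.path.join(data_path, "reuters21578.tar.gz")
sgm_old = sorted(glob(os.path.join(data_path, "*.sgm")))

# New style, as introduced by this commit; mkdir(parents=True, exist_ok=True)
# also replaces the stricter os.mkdir, which fails when parents are missing
# or the directory already exists.
data_dir = Path(data_path)
data_dir.mkdir(parents=True, exist_ok=True)
archive_new = data_dir / "reuters21578.tar.gz"
sgm_new = sorted(data_dir.glob("*.sgm"))
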
@@ -44,7 +44,7 @@
 },
 "outputs": [],
 "source": [
-"class ReutersParser(HTMLParser):\n    \"\"\"Utility class to parse a SGML file and yield documents one at a time.\"\"\"\n\n    def __init__(self, encoding=\"latin-1\"):\n        HTMLParser.__init__(self)\n        self._reset()\n        self.encoding = encoding\n\n    def handle_starttag(self, tag, attrs):\n        method = \"start_\" + tag\n        getattr(self, method, lambda x: None)(attrs)\n\n    def handle_endtag(self, tag):\n        method = \"end_\" + tag\n        getattr(self, method, lambda: None)()\n\n    def _reset(self):\n        self.in_title = 0\n        self.in_body = 0\n        self.in_topics = 0\n        self.in_topic_d = 0\n        self.title = \"\"\n        self.body = \"\"\n        self.topics = []\n        self.topic_d = \"\"\n\n    def parse(self, fd):\n        self.docs = []\n        for chunk in fd:\n            self.feed(chunk.decode(self.encoding))\n            for doc in self.docs:\n                yield doc\n            self.docs = []\n        self.close()\n\n    def handle_data(self, data):\n        if self.in_body:\n            self.body += data\n        elif self.in_title:\n            self.title += data\n        elif self.in_topic_d:\n            self.topic_d += data\n\n    def start_reuters(self, attributes):\n        pass\n\n    def end_reuters(self):\n        self.body = re.sub(r\"\\s+\", r\" \", self.body)\n        self.docs.append(\n            {\"title\": self.title, \"body\": self.body, \"topics\": self.topics}\n        )\n        self._reset()\n\n    def start_title(self, attributes):\n        self.in_title = 1\n\n    def end_title(self):\n        self.in_title = 0\n\n    def start_body(self, attributes):\n        self.in_body = 1\n\n    def end_body(self):\n        self.in_body = 0\n\n    def start_topics(self, attributes):\n        self.in_topics = 1\n\n    def end_topics(self):\n        self.in_topics = 0\n\n    def start_d(self, attributes):\n        self.in_topic_d = 1\n\n    def end_d(self):\n        self.in_topic_d = 0\n        self.topics.append(self.topic_d)\n        self.topic_d = \"\"\n\n\ndef stream_reuters_documents(data_path=None):\n    \"\"\"Iterate over documents of the Reuters dataset.\n\n    The Reuters archive will automatically be downloaded and uncompressed if\n    the `data_path` directory does not exist.\n\n    Documents are represented as dictionaries with 'body' (str),\n    'title' (str), 'topics' (list(str)) keys.\n\n    \"\"\"\n\n    DOWNLOAD_URL = (\n        \"http://archive.ics.uci.edu/ml/machine-learning-databases/\"\n        \"reuters21578-mld/reuters21578.tar.gz\"\n    )\n    ARCHIVE_FILENAME = \"reuters21578.tar.gz\"\n\n    if data_path is None:\n        data_path = os.path.join(get_data_home(), \"reuters\")\n    if not os.path.exists(data_path):\n        \"\"\"Download the dataset.\"\"\"\n        print(\"downloading dataset (once and for all) into %s\" % data_path)\n        os.mkdir(data_path)\n\n        def progress(blocknum, bs, size):\n            total_sz_mb = \"%.2f MB\" % (size / 1e6)\n            current_sz_mb = \"%.2f MB\" % ((blocknum * bs) / 1e6)\n            if _not_in_sphinx():\n                sys.stdout.write(\"\\rdownloaded %s / %s\" % (current_sz_mb, total_sz_mb))\n\n        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)\n        urlretrieve(DOWNLOAD_URL, filename=archive_path, reporthook=progress)\n        if _not_in_sphinx():\n            sys.stdout.write(\"\\r\")\n        print(\"untarring Reuters dataset...\")\n        tarfile.open(archive_path, \"r:gz\").extractall(data_path)\n        print(\"done.\")\n\n    parser = ReutersParser()\n    for filename in glob(os.path.join(data_path, \"*.sgm\")):\n        for doc in parser.parse(open(filename, \"rb\")):\n            yield doc"
+"class ReutersParser(HTMLParser):\n    \"\"\"Utility class to parse a SGML file and yield documents one at a time.\"\"\"\n\n    def __init__(self, encoding=\"latin-1\"):\n        HTMLParser.__init__(self)\n        self._reset()\n        self.encoding = encoding\n\n    def handle_starttag(self, tag, attrs):\n        method = \"start_\" + tag\n        getattr(self, method, lambda x: None)(attrs)\n\n    def handle_endtag(self, tag):\n        method = \"end_\" + tag\n        getattr(self, method, lambda: None)()\n\n    def _reset(self):\n        self.in_title = 0\n        self.in_body = 0\n        self.in_topics = 0\n        self.in_topic_d = 0\n        self.title = \"\"\n        self.body = \"\"\n        self.topics = []\n        self.topic_d = \"\"\n\n    def parse(self, fd):\n        self.docs = []\n        for chunk in fd:\n            self.feed(chunk.decode(self.encoding))\n            for doc in self.docs:\n                yield doc\n            self.docs = []\n        self.close()\n\n    def handle_data(self, data):\n        if self.in_body:\n            self.body += data\n        elif self.in_title:\n            self.title += data\n        elif self.in_topic_d:\n            self.topic_d += data\n\n    def start_reuters(self, attributes):\n        pass\n\n    def end_reuters(self):\n        self.body = re.sub(r\"\\s+\", r\" \", self.body)\n        self.docs.append(\n            {\"title\": self.title, \"body\": self.body, \"topics\": self.topics}\n        )\n        self._reset()\n\n    def start_title(self, attributes):\n        self.in_title = 1\n\n    def end_title(self):\n        self.in_title = 0\n\n    def start_body(self, attributes):\n        self.in_body = 1\n\n    def end_body(self):\n        self.in_body = 0\n\n    def start_topics(self, attributes):\n        self.in_topics = 1\n\n    def end_topics(self):\n        self.in_topics = 0\n\n    def start_d(self, attributes):\n        self.in_topic_d = 1\n\n    def end_d(self):\n        self.in_topic_d = 0\n        self.topics.append(self.topic_d)\n        self.topic_d = \"\"\n\n\ndef stream_reuters_documents(data_path=None):\n    \"\"\"Iterate over documents of the Reuters dataset.\n\n    The Reuters archive will automatically be downloaded and uncompressed if\n    the `data_path` directory does not exist.\n\n    Documents are represented as dictionaries with 'body' (str),\n    'title' (str), 'topics' (list(str)) keys.\n\n    \"\"\"\n\n    DOWNLOAD_URL = (\n        \"http://archive.ics.uci.edu/ml/machine-learning-databases/\"\n        \"reuters21578-mld/reuters21578.tar.gz\"\n    )\n    ARCHIVE_SHA256 = \"3bae43c9b14e387f76a61b6d82bf98a4fb5d3ef99ef7e7075ff2ccbcf59f9d30\"\n    ARCHIVE_FILENAME = \"reuters21578.tar.gz\"\n\n    if data_path is None:\n        data_path = Path(get_data_home()) / \"reuters\"\n    else:\n        data_path = Path(data_path)\n    if not data_path.exists():\n        \"\"\"Download the dataset.\"\"\"\n        print(\"downloading dataset (once and for all) into %s\" % data_path)\n        data_path.mkdir(parents=True, exist_ok=True)\n\n        def progress(blocknum, bs, size):\n            total_sz_mb = \"%.2f MB\" % (size / 1e6)\n            current_sz_mb = \"%.2f MB\" % ((blocknum * bs) / 1e6)\n            if _not_in_sphinx():\n                sys.stdout.write(\"\\rdownloaded %s / %s\" % (current_sz_mb, total_sz_mb))\n\n        archive_path = data_path / ARCHIVE_FILENAME\n\n        urlretrieve(DOWNLOAD_URL, filename=archive_path, reporthook=progress)\n        if _not_in_sphinx():\n            sys.stdout.write(\"\\r\")\n\n        # Check that the archive was not tampered:\n        assert sha256(archive_path.read_bytes()).hexdigest() == ARCHIVE_SHA256\n\n        print(\"untarring Reuters dataset...\")\n        tarfile.open(archive_path, \"r:gz\").extractall(data_path)\n        print(\"done.\")\n\n    parser = ReutersParser()\n    for filename in data_path.glob(\"*.sgm\"):\n        for doc in parser.parse(open(filename, \"rb\")):\n            yield doc"
 ]
 },
 {
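
The substantive change in the second hunk is an integrity check: after the download, the archive's SHA-256 digest is compared against the ARCHIVE_SHA256 constant pinned in the source before the tarball is extracted. A minimal sketch of the same pattern, with a placeholder digest (the real value is the one pinned in the diff above):

from hashlib import sha256
from pathlib import Path

# Placeholder digest for illustration only; a real caller would pin the
# known-good value, as the commit does with ARCHIVE_SHA256.
EXPECTED_SHA256 = "0" * 64

def verify_archive(archive_path: Path, expected: str = EXPECTED_SHA256) -> None:
    # Hash the whole file and compare hex digests; raising on mismatch keeps
    # the check active under python -O, which strips assert statements.
    digest = sha256(archive_path.read_bytes()).hexdigest()
    if digest != expected:
        raise ValueError("checksum mismatch for %s: got %s" % (archive_path, digest))

Raising a ValueError, as sketched here, is a defensive variation; the commit itself uses a bare assert.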
