Commit 0631cbf

Created using Colaboratory
1 parent 7cb1431 commit 0631cbf

File tree

1 file changed

+143
-45
lines changed


chapter11_part02_sequence-models.ipynb

Lines changed: 143 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -83,13 +83,13 @@
8383
},
8484
{
8585
"cell_type": "code",
86-
"execution_count": 2,
86+
"execution_count": 1,
8787
"metadata": {
8888
"colab": {
8989
"base_uri": "https://localhost:8080/"
9090
},
9191
"id": "OGvK58hsjdwv",
92-
"outputId": "cd051d4b-566e-480b-8378-f152e899bf6b"
92+
"outputId": "c6039281-6ac8-4f59-d41b-06856df4f919"
9393
},
9494
"outputs": [
9595
{
@@ -98,7 +98,7 @@
9898
"text": [
9999
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
100100
" Dload Upload Total Spent Left Speed\n",
101-
"100 80.2M 100 80.2M 0 0 14.5M 0 0:00:05 0:00:05 --:--:-- 16.4M\n"
101+
"100 80.2M 100 80.2M 0 0 8564k 0 0:00:09 0:00:09 --:--:-- 12.2M\n"
102102
]
103103
}
104104
],
@@ -134,7 +134,7 @@
134134
"base_uri": "https://localhost:8080/"
135135
},
136136
"id": "_JwuTAnfjdww",
137-
"outputId": "fbf0e10a-84fc-4cef-ef61-564d43f25337"
137+
"outputId": "ad3d822a-1e24-46ca-ca03-9dca124027cf"
138138
},
139139
"outputs": [
140140
{
@@ -241,7 +241,7 @@
241241
"base_uri": "https://localhost:8080/"
242242
},
243243
"id": "UJ0MDrEwjdwy",
244-
"outputId": "29ad4cc3-9954-4846-e85c-f485f4add33c"
244+
"outputId": "b2215c6e-806a-4235-8650-ceb5228783cc"
245245
},
246246
"outputs": [
247247
{
@@ -453,28 +453,11 @@
453453
},
454454
{
455455
"cell_type": "code",
456-
"execution_count": 1,
456+
"execution_count": 5,
457457
"metadata": {
458-
"colab": {
459-
"base_uri": "https://localhost:8080/",
460-
"height": 197
461-
},
462-
"id": "9mZ7UQTUjdwz",
463-
"outputId": "71a5c8be-9908-4b32-b0ea-8263b7089934"
458+
"id": "9mZ7UQTUjdwz"
464459
},
465-
"outputs": [
466-
{
467-
"output_type": "error",
468-
"ename": "NameError",
469-
"evalue": "ignored",
470-
"traceback": [
471-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
472-
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
473-
"\u001b[0;32m<ipython-input-1-1417eb60feec>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0membedding_layer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlayers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEmbedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_dim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmax_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_dim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m256\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# The Embedding layer takes at least two arguments: the number of\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;31m# possible tokens and the dimensionality of the embeddings (here, 256).\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
474-
"\u001b[0;31mNameError\u001b[0m: name 'layers' is not defined"
475-
]
476-
}
477-
],
460+
"outputs": [],
478461
"source": [
479462
"embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256) # The Embedding layer takes at least two arguments: the number of\n",
480463
" # possible tokens and the dimensionality of the embeddings (here, 256)."
@@ -525,9 +508,46 @@
525508
"cell_type": "code",
526509
"execution_count": null,
527510
"metadata": {
528-
"id": "XPW1QVqijdwz"
511+
"colab": {
512+
"base_uri": "https://localhost:8080/"
513+
},
514+
"id": "XPW1QVqijdwz",
515+
"outputId": "74a25aff-765a-460d-bbf7-31bf2eae102e"
529516
},
530-
"outputs": [],
517+
"outputs": [
518+
{
519+
"output_type": "stream",
520+
"name": "stdout",
521+
"text": [
522+
"Model: \"model_1\"\n",
523+
"_________________________________________________________________\n",
524+
" Layer (type) Output Shape Param # \n",
525+
"=================================================================\n",
526+
" input_2 (InputLayer) [(None, None)] 0 \n",
527+
" \n",
528+
" embedding_1 (Embedding) (None, None, 256) 5120000 \n",
529+
" \n",
530+
" bidirectional_1 (Bidirectio (None, 64) 73984 \n",
531+
" nal) \n",
532+
" \n",
533+
" dropout_1 (Dropout) (None, 64) 0 \n",
534+
" \n",
535+
" dense_1 (Dense) (None, 1) 65 \n",
536+
" \n",
537+
"=================================================================\n",
538+
"Total params: 5,194,049\n",
539+
"Trainable params: 5,194,049\n",
540+
"Non-trainable params: 0\n",
541+
"_________________________________________________________________\n",
542+
"Epoch 1/10\n",
543+
"625/625 [==============================] - 725s 1s/step - loss: 0.5296 - accuracy: 0.7391 - val_loss: 0.3803 - val_accuracy: 0.8542\n",
544+
"Epoch 2/10\n",
545+
"625/625 [==============================] - 645s 1s/step - loss: 0.3777 - accuracy: 0.8534 - val_loss: 0.3608 - val_accuracy: 0.8566\n",
546+
"Epoch 3/10\n",
547+
"342/625 [===============>..............] - ETA: 4:23 - loss: 0.3036 - accuracy: 0.8893"
548+
]
549+
}
550+
],
531551
"source": [
532552
"inputs = keras.Input(shape=(None,), dtype=\"int64\")\n",
533553
"embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)\n",
@@ -571,18 +591,15 @@
571591
{
572592
"cell_type": "markdown",
573593
"source": [
574-
"One thing that's slightly hurting model performance here is that our input sequences are full of zeros. This comes from our use of the `output_sequence_length=max_length option` in `TextVectorization` (with `max_length` equal to 600): sentences longer than 600 tokens are truncated to a length of 600 tokens, and sentences shorter than 600 tokens are padded with zeros at the end so that they can be concatenated together with other sequences to form contiguous batches.\n",
594+
"One thing that's slightly hurting model performance here is that our <font color='blue'>input sequences</font> are full of <font color='blue'>zeros</font>. This comes from our use of the `output_sequence_length=max_length` option in `TextVectorization` (with `max_length` equal to 600): sentences <font color='blue'>longer</font> than <font color='blue'>600 tokens</font> are <font color='blue'>truncated</font> to a length of 600 tokens, and sentences <font color='blue'>shorter</font> than <font color='blue'>600 tokens</font> are <font color='blue'>padded with zeros</font> at the end so that they can be concatenated together with other sequences to form contiguous batches.\n",
575595
"\n",
576-
"We're using a bidirectional RNN: two RNN layers running in parallel, with one\n",
577-
"processing the tokens in their natural order, and the other processing the same\n",
578-
"tokens in reverse. The RNN that looks at the tokens in their natural order will spend its last iterations seeing only vectors that encode padding—possibly for several hundreds of iterations if the original sentence was short. The information stored in the internal state of the RNN will gradually fade out as it gets exposed to these meaningless inputs.\n",
596+
"We're using a bidirectional RNN: two RNN layers running in <font color='blue'>parallel</font>, with one processing the tokens in their <font color='blue'>natural order</font>, and the other processing the <font color='blue'>same</font> tokens in <font color='blue'>reverse</font>. The RNN that looks at the tokens in their natural order will spend its <font color='blue'>last iterations</font> seeing only vectors that <font color='blue'>encode padding</font>—possibly for several hundreds of iterations if the original sentence was short. The information stored in the internal state of the RNN will gradually fade out as it gets exposed to these meaningless inputs.\n",
579597
"\n",
580598
"We need some way to tell the RNN that it should skip these iterations. There's an API for that: <font color='blue'>masking</font>.\n",
581599
"\n",
582-
"The *Embedding* layer is capable of generating a “mask” that corresponds to its\n",
583-
"input data. This mask is a tensor of ones and zeros (or True/False booleans), of shape `(batch_size, sequence_length)`, where the entry `mask[i, t]` indicates where time step `t` of sample `i` should be skipped or not (the timestep will be skipped if mask`[i, t]` is 0 or False, and processed otherwise).\n",
600+
"The *Embedding* layer is capable of generating a <font color='blue'>mask</font> that corresponds to its input data. This mask is a <font color='blue'>tensor</font> of <font color='blue'>ones</font> and <font color='blue'>zeros</font> (or True/False booleans), of shape `(batch_size, sequence_length)`, where the entry <font color='blue'>mask[i, t]</font> indicates whether time step <font color='blue'>t</font> of sample <font color='blue'>i</font> should be skipped or not (the time step will be skipped if `mask[i, t]` is 0 or False, and processed otherwise).\n",
584601
"\n",
585-
"By default, this option isn't active—you can turn it on by passing `mask_zero=True` to your *Embedding* layer. You can retrieve the mask with the `compute_mask()` method:"
602+
"By default, this option isn't active—you can turn it on by passing <font color='blue'>mask_zero=True</font> to your *Embedding* layer. You can retrieve the mask with the `compute_mask()` method:"
586603
],
587604
"metadata": {
588605
"id": "-rLx8D0Jw-cT"
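The mask described above can be illustrated without running Keras at all: when index 0 is the padding token, the mask the `Embedding` layer computes with `mask_zero=True` is simply `inputs != 0`. A minimal NumPy sketch (toy token IDs, not the notebook's data):

```python
import numpy as np

# Two padded integer sequences; 0 is the padding token index.
inputs = np.array([[4, 3, 2, 1, 0, 0],
                   [5, 4, 3, 2, 1, 0]])

# With mask_zero=True, embedding_layer.compute_mask(inputs) returns exactly
# this boolean tensor: True = process the time step, False = skip it.
mask = inputs != 0
print(mask.shape)  # (batch_size, sequence_length)
```

RNN layers consume this tensor to ignore the trailing padded steps, which is what prevents the state fade-out described above.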
@@ -611,13 +628,10 @@
611628
{
612629
"cell_type": "markdown",
613630
"source": [
614-
"(FIX FROM HERE!)\n",
631+
"In practice, you will almost never have to manage masks by hand. Instead, Keras will <font color='blue'>automatically</font> pass on the mask to <font color='blue'>every layer</font> that is able to process it (as a piece of\n",
632+
"metadata attached to the sequence it represents). This mask will be used by RNN layers to skip masked steps. If your model returns an entire sequence, the mask will also be used by the loss function to skip masked steps in the output sequence.\n",
615633
"\n",
616-
"In practice, you will almost never have to manage masks by hand. Instead, Keras will\n",
617-
"automatically pass on the mask to every layer that is able to process it (as a piece of\n",
618-
"metadata attached to the sequence it represents). This mask will be used by RNN lay-\n",
619-
"ers to skip masked steps. If your model returns an entire sequence, the mask will also\n",
620-
"be used by the loss function to skip masked steps in the output sequence."
634+
"Let’s try retraining our model with masking enabled."
621635
],
622636
"metadata": {
623637
"id": "_GYcZ_HByN4U"
@@ -661,6 +675,15 @@
661675
"print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
662676
]
663677
},
678+
{
679+
"cell_type": "markdown",
680+
"source": [
681+
"This time we get to <font color='blue'>88%</font> test accuracy—a small but noticeable improvement."
682+
],
683+
"metadata": {
684+
"id": "4ddOdbTRKmC9"
685+
}
686+
},
664687
{
665688
"cell_type": "markdown",
666689
"metadata": {
@@ -670,6 +693,27 @@
670693
"#### Using pretrained word embeddings"
671694
]
672695
},
696+
{
697+
"cell_type": "markdown",
698+
"source": [
699+
"Sometimes you have so little training data available that you can't use your data alone to learn an appropriate task-specific embedding of your vocabulary. In such cases, instead of learning word embeddings jointly with the problem you want to solve, you can <font color='blue'>load embedding vectors</font> from a <font color='blue'>precomputed embedding space</font> that you know is highly structured and exhibits useful properties—one that captures generic aspects of language structure. The rationale behind using pretrained word embeddings in natural language processing is much the same as for using pretrained convnets in image classification: you don't have enough data available to learn truly powerful features on your own, but you <font color='blue'>expect</font> that the <font color='blue'>features</font> you need are fairly <font color='blue'>generic</font>—that is, common visual features or semantic features. In this case, it makes sense to <font color='blue'>reuse</font> features learned on a different problem.\n",
700+
"\n",
701+
"Such word embeddings are generally computed using <font color='blue'>word-occurrence</font> statistics (observations about what words co-occur in sentences or documents), using a variety of techniques, some involving neural networks, others not. The idea of a <font color='blue'>dense, low-dimensional</font> embedding space for words, computed in an <font color='blue'>unsupervised</font> way, was initially explored by <font color='blue'>Bengio et al.</font> in the early 2000s, but it only started to take off in research and industry applications after the release of one of the most famous and successful word-embedding schemes: the <font color='blue'>Word2Vec</font> algorithm (https://code.google.com/archive/p/word2vec), developed by Tomas Mikolov at Google in 2013. Word2Vec\n",
702+
"dimensions capture specific semantic properties, such as gender.\n",
703+
"\n",
704+
"There are various precomputed databases of word embeddings that you can download\n",
705+
"and use in a Keras *Embedding* layer. Word2vec is one of them. Another popular one is called <font color='blue'>Global Vectors for Word Representation</font> (GloVe, https://nlp.stanford.edu/projects/glove), which was developed by Stanford researchers in 2014. This\n",
706+
"embedding technique is based on factorizing a matrix of word co-occurrence statistics. Its developers have made available precomputed embeddings for millions of English tokens, obtained from Wikipedia data and Common Crawl data.\n",
707+
"\n",
708+
"Let's look at how you can get started using <font color='blue'>GloVe</font> embeddings in a Keras model. The same method is valid for Word2Vec embeddings or any other word-embedding database. We'll start by downloading the GloVe files and parsing them. We'll then load the word vectors into a Keras Embedding layer, which we'll use to build a new model.\n",
709+
"\n",
710+
"First, let's download the GloVe word embeddings precomputed on the 2014\n",
711+
"English Wikipedia dataset. It's an 822 MB zip file containing 100-dimensional embedding vectors for 400,000 words (or non-word tokens)."
712+
],
713+
"metadata": {
714+
"id": "o_Fk0FChKxAU"
715+
}
716+
},
673717
{
674718
"cell_type": "code",
675719
"execution_count": null,
@@ -682,6 +726,15 @@
682726
"!unzip -q glove.6B.zip"
683727
]
684728
},
729+
{
730+
"cell_type": "markdown",
731+
"source": [
732+
"Let's parse the unzipped file (a .txt file) to build an <font color='blue'>index</font> that <font color='blue'>maps words</font> (as strings) to their <font color='blue'>vector</font> representation."
733+
],
734+
"metadata": {
735+
"id": "-L-N4516MgzO"
736+
}
737+
},
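The parsing cell itself is outside this diff hunk; as a sketch of the step just described (assuming the standard GloVe text layout of one token per line followed by space-separated floats, and using plain Python lists where the notebook would use NumPy arrays):

```python
import os
import tempfile

def parse_glove(path):
    """Build an index mapping each word (a string) to its coefficient vector."""
    embeddings_index = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            word, *coefs = line.rstrip().split(" ")
            embeddings_index[word] = [float(c) for c in coefs]
    return embeddings_index

# Demo on a tiny fake file in the GloVe text format (real files such as
# glove.6B.100d.txt have 100 coefficients per line, not 2).
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("the 0.5 0.25\ncat 0.1 0.75\n")
index = parse_glove(f.name)
os.remove(f.name)
print(len(index))  # 2
```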
685738
{
686739
"cell_type": "markdown",
687740
"metadata": {
@@ -712,6 +765,15 @@
712765
"print(f\"Found {len(embeddings_index)} word vectors.\")"
713766
]
714767
},
768+
{
769+
"cell_type": "markdown",
770+
"source": [
771+
"Next, let's build an embedding matrix that you can load into an *Embedding* layer. It must be a matrix of shape `(max_words, embedding_dim)`, where each entry `i` contains the `embedding_dim`-dimensional vector for the word of index `i` in the reference word index (built during tokenization)."
772+
],
773+
"metadata": {
774+
"id": "TJKhaPdMM8li"
775+
}
776+
},
715777
{
716778
"cell_type": "markdown",
717779
"metadata": {
@@ -731,17 +793,26 @@
731793
"source": [
732794
"embedding_dim = 100\n",
733795
"\n",
734-
"vocabulary = text_vectorization.get_vocabulary()\n",
735-
"word_index = dict(zip(vocabulary, range(len(vocabulary))))\n",
796+
"vocabulary = text_vectorization.get_vocabulary() # Retrieve the vocabulary indexed by our previous TextVectorization layer.\n",
797+
"word_index = dict(zip(vocabulary, range(len(vocabulary)))) # Use it to create a mapping from words to their index in the vocabulary.\n",
736798
"\n",
737-
"embedding_matrix = np.zeros((max_tokens, embedding_dim))\n",
799+
"embedding_matrix = np.zeros((max_tokens, embedding_dim)) # Prepare a matrix that we’ll fill with the GloVe vectors.\n",
738800
"for word, i in word_index.items():\n",
739-
" if i < max_tokens:\n",
740-
" embedding_vector = embeddings_index.get(word)\n",
801+
" if i < max_tokens: # Fill entry i in the matrix with the word vector for index i.\n",
802+
" embedding_vector = embeddings_index.get(word) # Words not found in the embedding index will be all zeros.\n",
741803
" if embedding_vector is not None:\n",
742804
" embedding_matrix[i] = embedding_vector"
743805
]
744806
},
807+
{
808+
"cell_type": "markdown",
809+
"source": [
810+
"Finally, we use a <font color='blue'>Constant</font> initializer to <font color='blue'>load</font> the <font color='blue'>pretrained embeddings</font> into an *Embedding* layer. So as not to disrupt the pretrained representations during training, we <font color='blue'>freeze</font> the layer via `trainable=False`:"
811+
],
812+
"metadata": {
813+
"id": "GJZ3pbusNu2E"
814+
}
815+
},
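In Keras terms this is `layers.Embedding(..., embeddings_initializer=keras.initializers.Constant(embedding_matrix), trainable=False)`. Framework-free, what the frozen layer computes is a plain row lookup into a fixed matrix; a sketch with toy values (not the notebook's GloVe vectors):

```python
import numpy as np

# Toy "pretrained" matrix: row i holds the vector for vocabulary index i.
embedding_matrix = np.array([[0.0, 0.0],   # index 0: padding
                             [0.5, -0.1],  # index 1
                             [0.2, 0.9]])  # index 2

def embed(token_ids, matrix):
    # A frozen Embedding layer is just this lookup; trainable=False means
    # the optimizer never updates the rows.
    return matrix[token_ids]

vectors = embed(np.array([2, 1, 0]), embedding_matrix)
print(vectors.shape)  # (sequence_length, embedding_dim)
```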
745816
{
746817
"cell_type": "code",
747818
"execution_count": null,
@@ -759,6 +830,15 @@
759830
")"
760831
]
761832
},
833+
{
834+
"cell_type": "markdown",
835+
"source": [
836+
"We're now ready to train a new model—identical to our previous model, but leveraging the 100-dimensional pretrained GloVe embeddings instead of 256-dimensional learned embeddings."
837+
],
838+
"metadata": {
839+
"id": "EhOa_ki_N4CR"
840+
}
841+
},
762842
{
763843
"cell_type": "markdown",
764844
"metadata": {
@@ -795,6 +875,24 @@
795875
"model = keras.models.load_model(\"glove_embeddings_sequence_model.keras\")\n",
796876
"print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
797877
]
878+
},
879+
{
880+
"cell_type": "markdown",
881+
"source": [
882+
"You'll find that on this particular task, pretrained embeddings aren't very helpful, because the dataset contains <font color='blue'>enough samples</font> that it is possible to <font color='blue'>learn</font> a specialized enough <font color='blue'>embedding space</font> from <font color='blue'>scratch</font>. However, leveraging pretrained embeddings can be very helpful when you're working with a smaller dataset."
883+
],
884+
"metadata": {
885+
"id": "8u-zMJBxOApe"
886+
}
887+
},
888+
{
889+
"cell_type": "code",
890+
"source": [],
891+
"metadata": {
892+
"id": "Ns3vzk-XOFGf"
893+
},
894+
"execution_count": null,
895+
"outputs": []
798896
}
799897
],
800898
"metadata": {
