Commit 4ec0819

Fix dropout problems:
1. For ShowTellModel and CaptionModel, the RNN uses the official PyTorch implementation, which applies no dropout after the last layer; we manually add dropout before computing the logits.
2. For Att2in models, the state passed to the next time step was also being dropped out; now only the output is dropped out.
1 parent 93ddeb4 commit 4ec0819

File tree

misc/Att2inModel.py
misc/AttModel.py
misc/CaptionModel.py
misc/ShowTellModel.py

4 files changed: +10 -12 lines changed


misc/Att2inModel.py

Lines changed: 1 addition & 3 deletions
@@ -68,9 +68,7 @@ def forward(self, xt, fc_feats, att_feats, p_att_feats, state):
        next_c = forget_gate * state[1][-1] + in_gate * in_transform
        next_h = out_gate * F.tanh(next_c)

-        next_h = self.dropout(next_h)
-
-        output = next_h
+        output = self.dropout(next_h)
        state = (next_h.unsqueeze(0), next_c.unsqueeze(0))
        return output, state


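The point of the Att2in change: the old code applied dropout to next_h before it was packed into state, so the randomly zeroed activations were fed back into the recurrence at the next time step. A minimal sketch of the corrected placement, using a hypothetical single-layer LSTM-style cell (ToyCell is illustrative, not the repo's Att2in core):

import torch
import torch.nn as nn

class ToyCell(nn.Module):
    # Hypothetical cell, only to show where the dropout belongs.
    def __init__(self, rnn_size, drop_prob_lm):
        super(ToyCell, self).__init__()
        self.i2h = nn.Linear(rnn_size, 4 * rnn_size)
        self.h2h = nn.Linear(rnn_size, 4 * rnn_size)
        self.dropout = nn.Dropout(drop_prob_lm)

    def forward(self, xt, state):
        gates = self.i2h(xt) + self.h2h(state[0])
        i, f, o, g = gates.chunk(4, dim=1)
        next_c = torch.sigmoid(f) * state[1] + torch.sigmoid(i) * torch.tanh(g)
        next_h = torch.sigmoid(o) * torch.tanh(next_c)
        output = self.dropout(next_h)   # dropout only on what is emitted
        state = (next_h, next_c)        # the un-dropped state feeds the next step
        return output, state

Dropping next_h itself, as before, would zero random entries of the recurrent state at every step, corrupting the memory the LSTM carries forward rather than just regularizing its output.
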
misc/AttModel.py

Lines changed: 1 addition & 3 deletions
@@ -464,9 +464,7 @@ def forward(self, xt, fc_feats, att_feats, p_att_feats, state):
        next_c = forget_gate * state[1][-1] + in_gate * in_transform
        next_h = out_gate * F.tanh(next_c)

-        next_h = self.dropout(next_h)
-
-        output = next_h
+        output = self.dropout(next_h)
        state = (next_h.unsqueeze(0), next_c.unsqueeze(0))
        return output, state


misc/CaptionModel.py

Lines changed: 4 additions & 3 deletions
@@ -33,6 +33,7 @@ def __init__(self, opt):
        self.linear = nn.Linear(self.fc_feat_size, self.num_layers * self.rnn_size) # feature to rnn_size
        self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
        self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)
+        self.dropout = nn.Dropout(self.drop_prob_lm)

        self.init_weights()

@@ -78,7 +79,7 @@ def forward(self, fc_feats, att_feats, seq):
            xt = self.embed(it)

            output, state = self.core(xt, fc_feats, att_feats, state)
-            output = F.log_softmax(self.logit(output))
+            output = F.log_softmax(self.logit(self.dropout(output)))
            outputs.append(output)

        return torch.cat([_.unsqueeze(1) for _ in outputs], 1)
@@ -165,7 +166,7 @@ def sample_beam(self, fc_feats, att_feats, opt={}):
                state = new_state

                output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, state)
-                logprobs = F.log_softmax(self.logit(output))
+                logprobs = F.log_softmax(self.logit(self.dropout(output)))

            self.done_beams[k] = sorted(self.done_beams[k], key=lambda x: -x['p'])
            seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
@@ -216,7 +217,7 @@ def sample(self, fc_feats, att_feats, opt={}):
                seqLogprobs.append(sampleLogprobs.view(-1))

            output, state = self.core(xt, fc_feats, att_feats, state)
-            logprobs = F.log_softmax(self.logit(output))
+            logprobs = F.log_softmax(self.logit(self.dropout(output)))

        return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)


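For reference, the reason the explicit self.dropout is needed at all: the dropout argument of PyTorch's stacked RNN modules (nn.LSTM/nn.GRU) is applied between layers only, never to the output of the last layer, so the hidden state handed to self.logit was previously unregularized. A minimal sketch with illustrative sizes (not the repo's actual configuration):

import torch
import torch.nn as nn
import torch.nn.functional as F

input_encoding_size, rnn_size, vocab_size, drop_prob_lm = 512, 512, 9000, 0.5

core = nn.LSTM(input_encoding_size, rnn_size, num_layers=2, dropout=drop_prob_lm)
logit = nn.Linear(rnn_size, vocab_size + 1)
dropout = nn.Dropout(drop_prob_lm)  # covers the last layer, which nn.LSTM's dropout skips

xt = torch.randn(1, 16, input_encoding_size)  # (seq_len=1, batch, features)
output, state = core(xt)                      # output: (1, 16, rnn_size)
logprobs = F.log_softmax(logit(dropout(output.squeeze(0))), dim=1)
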
misc/ShowTellModel.py

Lines changed: 4 additions & 3 deletions
@@ -26,6 +26,7 @@ def __init__(self, opt):
        self.core = getattr(nn, self.rnn_type.upper())(self.input_encoding_size, self.rnn_size, self.num_layers, bias=False, dropout=self.drop_prob_lm)
        self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
        self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)
+        self.dropout = nn.Dropout(self.drop_prob_lm)

        self.init_weights()

@@ -73,7 +74,7 @@ def forward(self, fc_feats, att_feats, seq):
            xt = self.embed(it)

            output, state = self.core(xt.unsqueeze(0), state)
-            output = F.log_softmax(self.logit(output.squeeze(0)))
+            output = F.log_softmax(self.logit(self.dropout(output.squeeze(0))))
            outputs.append(output)

        return torch.cat([_.unsqueeze(1) for _ in outputs[1:]], 1).contiguous()
@@ -159,7 +160,7 @@ def sample_beam(self, fc_feats, att_feats, opt={}):
                state = new_state

                output, state = self.core(xt.unsqueeze(0), state)
-                logprobs = F.log_softmax(self.logit(output.squeeze(0)))
+                logprobs = F.log_softmax(self.logit(self.dropout(output.squeeze(0))))

            self.done_beams[k] = sorted(self.done_beams[k], key=lambda x: -x['p'])
            seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
@@ -212,6 +213,6 @@ def sample(self, fc_feats, att_feats, opt={}):
                seqLogprobs.append(sampleLogprobs.view(-1))

            output, state = self.core(xt.unsqueeze(0), state)
-            logprobs = F.log_softmax(self.logit(output.squeeze(0)))
+            logprobs = F.log_softmax(self.logit(self.dropout(output.squeeze(0))))

        return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)

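One consequence worth noting: the added self.dropout also sits on the sample() and sample_beam() paths. This is harmless at inference time because nn.Dropout becomes the identity once the model is put in evaluation mode, e.g.:

import torch
import torch.nn as nn

drop = nn.Dropout(0.5)
x = torch.ones(8)

drop.eval()                     # what model.eval() sets before sampling
assert torch.equal(drop(x), x)  # dropout is the identity in eval mode

drop.train()                    # training mode: zeroed with p=0.5, survivors scaled by 1/(1-p)
print(drop(x))                  # e.g. tensor([2., 0., 2., 2., 0., 2., 0., 2.])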