karpathy · gngdb · Sep 10, 2022 · Sep 10, 2022 · Sep 10, 2022
diff --git a/mingpt/model.py b/mingpt/model.py
@@ -84,12 +84,11 @@ def __init__(self, config):
             act     = NewGELU(),
             dropout = nn.Dropout(config.resid_pdrop),
         ))
-        m = self.mlp
-        self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x)))) # MLP forward
 
     def forward(self, x):
         x = x + self.attn(self.ln_1(x))
-        x = x + self.mlpf(self.ln_2(x))
+        m = self.mlp
+        x = x + m.dropout(m.c_proj(m.act(m.c_fc(self.ln_2(x))))) # MLP forward
         return x
 
 class GPT(nn.Module):

diff --git a/mingpt/trainer.py b/mingpt/trainer.py
@@ -7,6 +7,7 @@
 from collections import defaultdict
 
 import torch
+import torch.nn as nn
 from torch.utils.data.dataloader import DataLoader
 from mingpt.utils import CfgNode as CN
 
@@ -26,6 +27,7 @@ def get_default_config():
         C.betas = (0.9, 0.95)
         C.weight_decay = 0.1 # only applied on matmul weights
         C.grad_norm_clip = 1.0
+        C.data_parallel = True
         return C
 
     def __init__(self, config, model, train_dataset):
@@ -64,6 +66,10 @@ def run(self):
         # setup the optimizer
         self.optimizer = model.configure_optimizers(config)
 
+        if torch.cuda.device_count() > 1 and config.data_parallel:
+            model = nn.DataParallel(model)
+            model.to(self.device)
+
         # setup the dataloader
         train_loader = DataLoader(
             self.train_dataset,
@@ -91,6 +97,8 @@ def run(self):
 
             # forward the model
             logits, self.loss = model(x, y)
+            if self.loss.nelement() > 1:
+                self.loss = self.loss.mean() # DataParallel can return a vector of losses
 
             # backprop and update the parameters
             model.zero_grad(set_to_none=True)