layers.py
import torch
from torch import nn
from einops import rearrange


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a fused QKV projection."""

    def __init__(self, n_heads, embed_dim, d_k, dropout=0.1):
        super().__init__()
        self.n_heads, self.d_k = n_heads, d_k
        # Single linear layer producing Q, K and V for all heads at once.
        self.W_qkv = nn.Linear(embed_dim, 3 * n_heads * d_k)
        self.attn_dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(n_heads * d_k, embed_dim)
        self.resid_dropout = nn.Dropout(dropout)

    def forward(self, X):
        # (b, l, 3*h*d_k) -> (b, h, l, 3*d_k), then split into three (b, h, l, d_k) tensors.
        Q, K, V = rearrange(self.W_qkv(X), "b l (h ddd) -> b h l ddd", h=self.n_heads).split(self.d_k, dim=-1)
        # Scaled dot-product attention scores: (b, h, l_q, l_k).
        attn = torch.einsum("bhqd,bhkd->bhqk", Q, K) / self.d_k**0.5
        attn_weights = self.attn_dropout(torch.softmax(attn, dim=-1))
        attn_out = torch.einsum("bhqk,bhkv->bhqv", attn_weights, V)
        # Concatenate heads and project back to embed_dim.
        proj = self.fc(rearrange(attn_out, "b h l d -> b l (h d)"))
        return self.resid_dropout(proj)
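
# Usage sketch (hypothetical sizes, not taken from this repo): with n_heads=4,
# embed_dim=64 and d_k=16, an input of shape (batch, seq_len, embed_dim) is
# mapped back to the same shape:
#   mha = MultiHeadAttention(n_heads=4, embed_dim=64, d_k=16)
#   out = mha(torch.randn(2, 10, 64))   # -> torch.Size([2, 10, 64])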


class FFN(nn.Module):
    """Position-wise feed-forward network with GELU activations and dropout."""

    def __init__(self, in_dim, hidden_dim, out_dim, num_hiddens=1, dropout=0.1, final_dropout=True):
        super().__init__()
        assert num_hiddens > 0, "Must have > 0 hidden layers."
        # First hidden layer maps in_dim -> hidden_dim.
        self.layers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(in_dim, hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout),
            )
        ])
        # Any additional hidden layers keep the hidden width.
        for _ in range(num_hiddens - 1):
            self.layers.append(nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout),
            ))
        # Final projection to out_dim, optionally followed by dropout.
        self.layers.append(nn.Linear(hidden_dim, out_dim))
        if final_dropout:
            self.layers.append(nn.Dropout(dropout))

    def forward(self, X):
        for layer in self.layers:
            X = layer(X)
        return X
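
# Usage sketch (hypothetical sizes): FFN(in_dim=64, hidden_dim=128, out_dim=64)
# is a single-hidden-layer MLP mapping (batch, seq_len, 64) -> (batch, seq_len, 64),
# with GELU and dropout applied after each hidden layer.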


class PreNormAndAdd(nn.Module):
    """Pre-norm residual wrapper: X + sublayer(LayerNorm(X))."""

    def __init__(self, embed_dim, sublayer):
        super().__init__()
        self.norm = nn.LayerNorm(embed_dim)
        self.sublayer = sublayer

    def forward(self, X):
        return X + self.sublayer(self.norm(X))


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention then feed-forward, each with a residual."""

    def __init__(self, n_heads, embed_dim, d_k, ffn_hidden_dim, dropout):
        super().__init__()
        self.net = nn.Sequential(
            PreNormAndAdd(embed_dim, MultiHeadAttention(n_heads, embed_dim, d_k, dropout)),
            PreNormAndAdd(embed_dim, FFN(embed_dim, ffn_hidden_dim, embed_dim, num_hiddens=1, dropout=dropout)),
        )

    def forward(self, X):
        return self.net(X)
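

# Minimal smoke test (a sketch; the hyperparameters below are illustrative
# assumptions, not values used elsewhere in this repository).
if __name__ == "__main__":
    block = TransformerBlock(n_heads=4, embed_dim=64, d_k=16, ffn_hidden_dim=128, dropout=0.1)
    x = torch.randn(2, 10, 64)        # (batch, seq_len, embed_dim)
    y = block(x)
    assert y.shape == x.shape         # residual connections preserve the input shape
    print(y.shape)                    # torch.Size([2, 10, 64])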