diff --git a/en-wordlist.txt b/en-wordlist.txt
index 56cb35ba58..d06a376751 100644
--- a/en-wordlist.txt
+++ b/en-wordlist.txt
@@ -1,6 +1,7 @@
 ACL
 ADI
+ALiBi
 AOT
 AOTInductor
 APIs
@@ -80,6 +81,7 @@ FX
 FX's
 FairSeq
 Fastpath
+FFN
 FloydHub
 FloydHub's
 Frobenius
@@ -128,7 +130,7 @@ Kihyuk
 Kiuk
 Kubernetes
 Kuei
-KV-Caching
+KV
 LRSchedulers
 LSTM
 LSTMs
@@ -164,6 +166,7 @@ NLP
 NTK
 NUMA
 NaN
+NaNs
 NanoGPT
 Netron
 NeurIPS
@@ -232,6 +235,7 @@ Sigmoid
 SoTA
 Sohn
 Spacy
+SwiGLU
 TCP
 THP
 TIAToolbox
@@ -467,6 +471,7 @@ nheads
 nightlies
 NJT
 NJTs
+NJT's
 num
 numericalize
 numpy
diff --git a/intermediate_source/transformer_building_blocks.py b/intermediate_source/transformer_building_blocks.py
index 95118eb0b9..c18fb22193 100644
--- a/intermediate_source/transformer_building_blocks.py
+++ b/intermediate_source/transformer_building_blocks.py
@@ -344,7 +344,7 @@ def benchmark(func, *args, **kwargs):
 torch.manual_seed(6)
 vanilla_mha_layer = nn.MultiheadAttention(E_q, nheads, dropout=dropout, batch_first=True, bias=bias, device='cuda')
-# nn.MultiheadAttention uses a non conventional initialization for layers, so do this for exact parity :(
+# ``nn.MultiheadAttention`` uses a non conventional initialization for layers, so do this for exact parity :(
 mha_layer.out_proj.weight = nn.Parameter(vanilla_mha_layer.out_proj.weight.clone().detach())
 mha_layer.packed_proj.weight = nn.Parameter(vanilla_mha_layer.in_proj_weight.clone().detach())
 mha_layer.out_proj.bias = nn.Parameter(vanilla_mha_layer.out_proj.bias.clone().detach())
@@ -421,7 +421,7 @@ def benchmark(func, *args, **kwargs):
 # gives equivalent results to an ``nn.TransformerEncoderLayer`` with
 # ``is_causal=True``.
 #
-# We demonstrate examples of implementing the rest of the nn layers
+# We demonstrate examples of implementing the rest of the ``nn`` layers
 # `here `_ but omit that from this
 # tutorial for brevity.
@@ -457,7 +457,7 @@ def benchmark(func, *args, **kwargs):
 # * SwiGLU activation in feed-forward network of Transformer Layer
 #
 # Input projection for MultiheadAttention
-# ----------------------------------------
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # Recall that when doing self-attention, the ``query``, ``key`` and ``value``
 # are the same tensor. Each of these tensors is projected with a
 # ``Linear(E_q, E_total)`` layer. Instead, we can pack this into one layer,
@@ -502,8 +502,9 @@ def forward(self, query):
 (q_out, k_out, v_out), time_packed, _ = benchmark(packed_in_proj, q)
 print(f"InputProjection: {time:5f} s, PackedInputProjection: {time_packed:5f} s, speedup: {time/time_packed:.2f}x")
 
+##################################################
 # SwiGLU feed forward network of Transformer Layer
-# ------------------------------------------------
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # SwiGLU is a non-linear activation function that is increasingly popular in the feed-forward
 # network of the transformer layer (e.g. Llama).
 # A feed-forward network with SwiGLU activation is defined as
@@ -524,6 +525,7 @@ def __init__(self, dim, hidden_dim, multiple_of, ffn_dim_multiplier=None, device
     def forward(self, x):
         return self.w2(F.silu(self.w1(x)) * self.w3(x))
 
+########################################################################
 # An alternative way of implementing this that uses packed projection is
 
 class PackedSwiGLUFFN(nn.Module):
@@ -543,6 +545,7 @@ def forward(self, x):
         x1, x3 = torch.chunk(self.w13(x), 2, dim=-1)
         return self.w2(F.silu(x1) * x3)
 
+################################################################################
 # We can compare the performance of the two implementations as follows
 # Depending on your hardware, you might see different results. On an A100 I see
 # 1.12x speedup for D=128.
@@ -635,20 +638,14 @@ def alibi_mod(score, b, h, q_idx, kv_idx):
 )
 out_flex2 = flex_attention(query, key, value, score_mod=alibi_score_mod)
 
-################################################################################
-# And more
-# --------
-#
-# We intend to update this tutorial to demonstrate more examples of how to use
-# the various performant building blocks such as KV-Caching, Grouped Query Attention
-# etc.
-
 ################################################################################
 # Extended examples
 # -----------------
 #
-# There are several good examples of using various performant building blocks to
+# We intend to update this tutorial to demonstrate more examples of how to use
+# the various performant building blocks such as KV-Caching, Grouped Query Attention
+# etc. Further, there are several good examples of using various performant building blocks to
 # implement various transformer architectures. Some examples include
 #
 # * `gpt-fast `_
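Note (not part of the patch): the packed-projection idea that the touched "Input projection for MultiheadAttention" and ``PackedSwiGLUFFN`` sections rely on can be sketched as below. This is a minimal, illustrative example; the sizes ``E_q``/``E_total`` and the ``allclose`` checks are ours, not taken from the tutorial.

import torch
import torch.nn as nn

torch.manual_seed(0)
E_q, E_total = 64, 64              # illustrative sizes only
x = torch.randn(2, 10, E_q)        # (batch, seq, embed) self-attention input

# Three separate projections, one each for query, key and value.
q_proj = nn.Linear(E_q, E_total, bias=False)
k_proj = nn.Linear(E_q, E_total, bias=False)
v_proj = nn.Linear(E_q, E_total, bias=False)

# Packed projection: a single Linear whose weight stacks the three above,
# followed by a chunk of the output. One larger matmul is typically faster
# than three smaller ones, which is the speedup the tutorial benchmarks.
packed_proj = nn.Linear(E_q, 3 * E_total, bias=False)
packed_proj.weight = nn.Parameter(
    torch.cat([q_proj.weight, k_proj.weight, v_proj.weight], dim=0)
)

q, k, v = torch.chunk(packed_proj(x), 3, dim=-1)
print(torch.allclose(q, q_proj(x), atol=1e-6))  # True
print(torch.allclose(k, k_proj(x), atol=1e-6))  # True
print(torch.allclose(v, v_proj(x), atol=1e-6))  # True

The same packing trick underlies ``PackedSwiGLUFFN`` in the diff above, where ``w1`` and ``w3`` are fused into a single ``w13`` and the result is split with ``torch.chunk(..., 2, dim=-1)``.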