Commit
more spelling + rendering
mikaylagawarecki committed Oct 30, 2024
1 parent d83f14b commit cae22fa
Showing 2 changed files with 16 additions and 14 deletions.
7 changes: 6 additions & 1 deletion en-wordlist.txt
@@ -1,6 +1,7 @@

ACL
ADI
ALiBi
AOT
AOTInductor
APIs
@@ -80,6 +81,7 @@ FX
FX's
FairSeq
Fastpath
FFN
FloydHub
FloydHub's
Frobenius
@@ -128,7 +130,7 @@ Kihyuk
Kiuk
Kubernetes
Kuei
KV-Caching
KV
LRSchedulers
LSTM
LSTMs
@@ -164,6 +166,7 @@ NLP
NTK
NUMA
NaN
NaNs
NanoGPT
Netron
NeurIPS
@@ -232,6 +235,7 @@ Sigmoid
SoTA
Sohn
Spacy
SwiGLU
TCP
THP
TIAToolbox
@@ -467,6 +471,7 @@ nheads
nightlies
NJT
NJTs
NJT's
num
numericalize
numpy
23 changes: 10 additions & 13 deletions intermediate_source/transformer_building_blocks.py
@@ -344,7 +344,7 @@ def benchmark(func, *args, **kwargs):
torch.manual_seed(6)
vanilla_mha_layer = nn.MultiheadAttention(E_q, nheads, dropout=dropout, batch_first=True, bias=bias, device='cuda')

# nn.MultiheadAttention uses a non-conventional initialization for layers, so do this for exact parity :(
# ``nn.MultiheadAttention`` uses a non-conventional initialization for layers, so do this for exact parity :(
mha_layer.out_proj.weight = nn.Parameter(vanilla_mha_layer.out_proj.weight.clone().detach())
mha_layer.packed_proj.weight = nn.Parameter(vanilla_mha_layer.in_proj_weight.clone().detach())
mha_layer.out_proj.bias = nn.Parameter(vanilla_mha_layer.out_proj.bias.clone().detach())
@@ -421,7 +421,7 @@ def benchmark(func, *args, **kwargs):
# gives equivalent results to an ``nn.TransformerEncoderLayer`` with
# ``is_causal=True``.
#
# We demonstrate examples of implementing the rest of the nn layers
# We demonstrate examples of implementing the rest of the ``nn`` layers
# `here <https://github.com/mikaylagawarecki/temp>`_ but omit that from this
# tutorial for brevity.

@@ -457,7 +457,7 @@ def benchmark(func, *args, **kwargs):
# * SwiGLU activation in feed-forward network of Transformer Layer
#
# Input projection for MultiheadAttention
# ----------------------------------------
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Recall that when doing self-attention, the ``query``, ``key`` and ``value``
# are the same tensor. Each of these tensors is projected with a
# ``Linear(E_q, E_total)`` layer. Instead, we can pack this into one layer,
@@ -502,8 +502,9 @@ def forward(self, query):
(q_out, k_out, v_out), time_packed, _ = benchmark(packed_in_proj, q)
print(f"InputProjection: {time:5f} s, PackedInputProjection: {time_packed:5f} s, speedup: {time/time_packed:.2f}x")

##################################################
# SwiGLU feed forward network of Transformer Layer
# ------------------------------------------------
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# SwiGLU is a non-linear activation function that is increasingly popular in the feed-forward
# network of the transformer layer (e.g. Llama). A feed-forward network with SwiGLU activation is defined as

@@ -524,6 +525,7 @@ def __init__(self, dim, hidden_dim, multiple_of, ffn_dim_multiplier=None, device
    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))
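
########################################################################
# In other words (biases omitted), the forward above computes
# ``FFN_SwiGLU(x) = W2(SiLU(W1 x) * (W3 x))``, where ``*`` denotes an
# elementwise product.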

########################################################################
# An alternative way of implementing this that uses packed projection is

class PackedSwiGLUFFN(nn.Module):
@@ -543,6 +545,7 @@ def forward(self, x):
        x1, x3 = torch.chunk(self.w13(x), 2, dim=-1)
        return self.w2(F.silu(x1) * x3)
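
################################################################################
# To see why stacking ``w1`` and ``w3`` into a single ``w13`` is equivalent,
# here is a small, purely illustrative check with standalone bias-free linear
# layers (mirroring the Llama-style feed-forward above). It reuses the ``torch``,
# ``nn`` and ``F`` imports from earlier in this tutorial, and all names below
# exist only for this sketch.

ref_dim, ref_hidden_dim = 256, 1024
ref_w1 = nn.Linear(ref_dim, ref_hidden_dim, bias=False)
ref_w3 = nn.Linear(ref_dim, ref_hidden_dim, bias=False)
ref_w2 = nn.Linear(ref_hidden_dim, ref_dim, bias=False)

# Stack ``w1`` on top of ``w3`` along the output dimension so that
# ``torch.chunk(..., 2, dim=-1)`` recovers them in the same order.
ref_w13 = nn.Linear(ref_dim, 2 * ref_hidden_dim, bias=False)
ref_w13.weight = nn.Parameter(
    torch.cat([ref_w1.weight, ref_w3.weight], dim=0).clone().detach()
)

ref_x = torch.randn(2, 64, ref_dim)
unpacked_out = ref_w2(F.silu(ref_w1(ref_x)) * ref_w3(ref_x))
x1_ref, x3_ref = torch.chunk(ref_w13(ref_x), 2, dim=-1)
packed_out = ref_w2(F.silu(x1_ref) * x3_ref)
torch.testing.assert_close(unpacked_out, packed_out)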

################################################################################
# We can compare the performance of the two implementations as follows.
# Depending on your hardware, you might see different results. On an A100 I see
# a 1.12x speedup for ``D=128``.
@@ -635,20 +638,14 @@ def alibi_mod(score, b, h, q_idx, kv_idx):
)
out_flex2 = flex_attention(query, key, value, score_mod=alibi_score_mod)
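
################################################################################
# ``score_mod`` is a general hook, so other biases and masks can be expressed in
# the same style. As a minimal, purely illustrative sketch (not part of the
# ALiBi example above), a causal variant looks as follows and is passed to
# ``flex_attention`` the same way, reusing ``query``, ``key`` and ``value`` from
# above.

def causal_score_mod(score, b, h, q_idx, kv_idx):
    # Keep the score where the key position does not exceed the query position;
    # mask out future positions with ``-inf`` before the softmax.
    return torch.where(q_idx >= kv_idx, score, -float("inf"))

out_flex_causal = flex_attention(query, key, value, score_mod=causal_score_mod)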

################################################################################
# And more
# --------
#
# We intend to update this tutorial to demonstrate more examples of how to use
# the various performant building blocks such as KV-Caching, Grouped Query Attention
# etc.


################################################################################
# Extended examples
# -----------------
#
# There are several good examples of using various performant building blocks to
# We intend to update this tutorial to demonstrate more examples of how to use
# the various performant building blocks, such as KV-Caching and Grouped Query
# Attention. Further, there are several good examples of using performant building blocks to
# implement various transformer architectures. Some examples include
#
# * `gpt-fast <https://github.com/pytorch-labs/gpt-fast>`_
