<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- Meta tags for social media banners, these should be filled in appropriately as they are your "business card" -->
<!-- Replace the content tag with appropriate information -->
<meta name="description" content="MagicDec: Breaking the Latency-Throughput Tradeoff for Long Contexts with Speculative Decoding">
<meta property="og:title" content="Magicdec"/>
<meta property="og:description" content="MagicDec: Breaking the Latency-Throughput Tradeoff for Long Contexts with Speculative Decoding"/>
<meta property="og:url" content="https://github.com/Infini-AI-Lab/MagicDec/"/>
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x630 -->
<meta property="og:image" content="static/images/icons/MagicDec.png"/>
<meta property="og:image:width" content="1200"/>
<meta property="og:image:height" content="630"/>
<meta name="twitter:title" content="MagicDec">
<meta name="twitter:description" content="MagicDec: Breaking the Latency-Throughput Tradeoff for Long Contexts with Speculative Decoding">
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x600 -->
<meta name="twitter:image" content="static/images/icons/MagicDec.png">
<meta name="twitter:card" content="summary_large_image">
<!-- Keywords for your paper to be indexed by-->
<meta name="keywords" content="Speculative Decoding">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>MagicDec: Breaking the Latency-Throughput Tradeoff for Long Contexts with Speculative Decoding</title>
<link rel="icon" type="image/x-icon" href="static/images/icons/MagicDec.png">
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="static/css/bulma.min.css">
<link rel="stylesheet" href="static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="static/css/bulma-slider.min.css">
<link rel="stylesheet" href="static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="static/js/fontawesome.all.min.js"></script>
<script src="static/js/bulma-carousel.min.js"></script>
<script src="static/js/bulma-slider.min.js"></script>
<script src="static/js/index.js"></script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});
</script>
<script type="text/javascript"
src="http://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style>
@font-face {
font-family: 'TriForceFont';
src: url('static/Triforce.ttf') format('truetype');
}
.custom-font {
font-family: 'TriForceFont', sans-serif !important;
font-size: 3.0rem;
}
body {
background-color: #f5f5f5; /* Adjust this to match your page's gray color */
}
.image-container {
background-color: #f5f5f5; /* Same as body background */
display: inline-block; /* Or 'block' depending on your layout needs */
}
.image-container img {
mix-blend-mode: multiply;
max-width: 100%;
height: auto;
}
.container.is-fluid {
margin-left: 15px;
margin-right: 15px;
max-width: none;
}
.hero .hero-body {
padding: 3rem 0;
}
.section {
padding: 3rem 0;
}
.column.is-full-width {
padding: 0 15px;
}
</style>
</head>
<body>
<!-- Section: Header Titlepage -->
<section class="hero">
<div class="hero-body">
<div class="container is-fluid">
<div class="columns is-centered">
<div class="column is-full-width has-text-centered">
<img src="static/images/icons/MagicDec.png" alt="Magic Wand Icon" style="display: inline; height: 3rem; vertical-align: top;">
<h1 class="title is-2 publication-title" style="display: inline;">MagicDec: Breaking the Latency-Throughput Tradeoff for Long Contexts with Speculative Decoding</h1>
<br><br>
<div class="is-size-5 publication-authors">
<span class="author-block"><a href="" target="_blank">Jian Chen</a><sup>*1</sup>,</span>
<span class="author-block"><a href="" target="_blank">Vashisth Tiwari</a><sup>*1</sup>,</span>
<span class="author-block"><a href="" target="_blank">Ranajoy Sadhukhan</a><sup>*1</sup>,</span>
<span class="author-block"><a href="https://dreaming-panda.github.io/" target="_blank">Zhuoming Chen</a><sup>1</sup>,</span>
<span class="author-block"><a href="" target="_blank">Jinyuan Shi</a><sup>2</sup></span>
<br>
<span class="author-block"><a href="" target="_blank">Ian Yan</a><sup>2</sup>,</span>
<span class="author-block"><a href="https://www.andrew.cmu.edu/user/beidic/" target="_blank">Beidi Chen</a><sup>1,3</sup></span>
</div>
<div class="is-size-5 publication-authors">
<span class="affliation">
<small>
<sup>1</sup>Carnegie Mellon University
<sup>2</sup>Moffett AI
<sup>3</sup>Meta AI (FAIR)
</small>
</span>
<span class="eql-cntrb">
<small><br><sup>*</sup>Indicates Equal Contribution</small>
</span>
</div>
<div class="column has-text-centered">
<span class="link-block">
<a href="https://arxiv.org/abs/2404.11912" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon"><i class="ai ai-arxiv"></i></span>
<span>arXiv</span>
</a>
</span>
<span class="link-block">
<a href="https://github.com/Infini-AI-Lab/MagicDec/tree/main" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon"><i class="fab fa-github"></i></span>
<span>Code</span>
</a>
</span>
<span class="link-block">
<a href="https://youtu.be/vRAaAyjr6Jo" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon"><i class="fab fa-youtube"></i></span>
<span>Video</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Section: Paper abstract -->
<section class="section hero is-light">
<div class="container is-fluid">
<div class="columns is-centered">
<div class="column is-full-width">
<h2 class="title is-3" style="text-align: center;">
<img src="static/images/icons/Llama.png" style="height: 43px; display: inline; vertical-align:text-top;"/>
Introduction
</h2>
<div class="content has-text-justified">
<p>
As Large Language Models (LLMs) become more prevalent in applications like interactive chatbots, efficient serving of long-context requests has gained significant attention. However, balancing throughput and latency when serving long-context requests remains a challenge, with improvements in one often coming at the expense of the other. Our method <b>MagicDec</b> uses speculative decoding to address this issue for large batches of long sequences without sacrificing output quality. We observe and theoretically motivate a critical sequence length beyond which speculative decoding becomes increasingly beneficial at larger batch sizes, highlighting the broad applicability of MagicDec to long-context serving. For moderate to long sequences, we demonstrate up to <b>2x speedup for Llama-2-7B-32K and 1.84x speedup for Llama-3.1-8B</b> when serving batch sizes ranging from 32 to 256 on 8 NVIDIA A100 GPUs.
</p>
</div>
</div>
</div>
</div>
</section>
<!-- Section: Paper abstract -->
<section class="section hero is-light">
<div class="container is-fluid">
<div class="columns is-centered">
<div class="column is-full-width">
<h2 class="title is-3" style="text-align: center;">
<img src="static/images/icons/timepassing.png" style="height: 50px; display: inline; vertical-align: middle;"/>
MagicDec: Improvements for Large Batches and Long Sequences
</h2>
<div class="content has-text-justified">
<p>
To demonstrate how speedups improve with increasing batch size, we have chosen two kinds of draft models: (1) a
standalone draft model with GQA (<a style="color: #209CEE" href="https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T">TinyLlama-1.1B</a> speculating for <a style="color: #209CEE" href="https://huggingface.co/togethercomputer/LLaMA-2-7B-32K">Llama-2-7B-32K</a>) and (2)
self-speculation for Llama-2-7B-32K and <a style="color: #209CEE" href="https://huggingface.co/meta-llama/Meta-Llama-3.1-8B">Llama-3.1-8B</a>. In both cases, we use StreamingLLM-based
drafting to address the KV cache bottleneck. Performance was evaluated on A100, H100, and L40 GPUs using the <a style="color: #209CEE" href="https://huggingface.co/datasets/emozilla/pg19-test">PG-19</a> dataset.
</p>
<h6>Key observations:</h6>
<p><strong>1. Speculative Decoding achieves a better throughput-latency trade-off for moderate to long sequences.</strong></p>
<div style="display: flex; flex-wrap: wrap; justify-content: center;">
<div style="width: 45%; min-width: 150px; margin: 5px;">
<canvas id="chart1"></canvas>
</div>
<div style="width: 45%; min-width: 150px; margin: 5px;">
<canvas id="chart2"></canvas>
</div>
</div>
<script src="static/js/plots/throughput_latency_smaller.js"></script>
<p><strong>2. For every model and hardware pair, there exists a critical sequence length, beyond which speedup increases with increasing batch size—the longer the sequence, the better the speedup scaling.</strong></p>
<p><strong>3. Interestingly, the optimal speculation length also increases with batch size for sufficiently long sequences.</strong></p>
</div>
<br>
<div class="image-container">
<img src="static/images/speedups_budget512.png" alt="Results" height="350" />
<figcaption>
<strong>Figure 1: End-to-end Speculative Decoding Speedups for Various Target-Draft pairs on 8 NVIDIA A100s.</strong>
</figcaption>
</div>
<br>
<br>
<style>
table {
border-collapse: collapse;
width: 100%;
}
th, td {
text-align: left;
padding: 12px;
}
tr td {
border-bottom: 1px solid black;
}
th {
border-bottom: 2px solid black;
}
tr:nth-child(3) td,
tr:nth-child(5) td {
border-bottom: 2px solid black;
}
</style>
<table>
<caption style="caption-side: top; text-align: center; font-weight: bold; margin-bottom: 10px;">
End-to-end Speculative Decoding Speedups for Various Target-Draft pairs on H100
</caption>
<tr>
<th>Target</th>
<th>Draft</th>
<th>Prefill (tokens)</th>
<th>Batch Size</th>
<th>Optimal Spec Len</th>
<th>Speedup</th>
</tr>
<tr>
<td>Llama3.1-8B</td>
<td>Selfspec</td>
<td>32000</td>
<td>16</td>
<td>2</td>
<td>1.24</td>
</tr>
<tr>
<td>Llama3.1-8B</td>
<td>Selfspec</td>
<td>32000</td>
<td>32</td>
<td>3</td>
<td>1.42</td>
</tr>
<tr>
<td>Llama2-7B</td>
<td>Selfspec</td>
<td>8000</td>
<td>32</td>
<td>3</td>
<td>1.44</td>
</tr>
<tr>
<td>Llama2-7B</td>
<td>Selfspec</td>
<td>8000</td>
<td>64</td>
<td>4</td>
<td>1.68</td>
</tr>
<tr>
<td>Llama2-7B</td>
<td>Tinyllama1.1B</td>
<td>8000</td>
<td>32</td>
<td>3</td>
<td>1.60</td>
</tr>
<tr>
<td>Llama2-7B</td>
<td>Tinyllama1.1B</td>
<td>8000</td>
<td>64</td>
<td>4</td>
<td>1.84</td>
</tr>
</table>
<br>
</div>
</div>
</div>
</section>
<!-- Section: Motivation -->
<section class="section hero is-light">
<div class="container is-fluid">
<div class="columns is-centered">
<div class="column is-full-width">
<h2 class="title is-3" style="text-align: center;">
<img src="static/images/icons/Idea.png" style="height: 50px; display: inline; vertical-align: middle;"/>
Motivation
</h2>
<div class="content has-text-justified">
<p>
Speculative decoding leverages underutilized GPU compute during memory-bound autoregressive decoding. However, prior research [cite hao, deepspeed, etc.] has shown that speculative decoding becomes less effective as batch sizes increase and exhaust the available compute. As a result, speculative decoding has seen limited use in large-batch serving systems.
</p>
<h4 class="title is-5">
<img src="static/images/icons/Switch.png" style="height: 36px; display: inline; vertical-align: middle;"/>
Shift in Bottleneck from Compute to Memory
</h4>
<p>
As context length increases, the KV cache becomes the bottleneck, since its size scales linearly with batch size. Beyond a critical sequence length, KV loading time overtakes computation time as the dominant cost (Figure 2). This transition allows speculative decoding's speedup to scale with batch size.
Our analysis for a batch size of 256 with a fixed StreamingLLM budget of 512 (Figure 2) shows that as sequences grow, the bottleneck shifts from computation to memory; for instance, with a 4K sequence length for Llama-2, KV load time starts to dominate as batch size increases.
</p>
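<p>
As a rough, back-of-envelope sketch (our own notation, not the paper's exact cost model): for batch size $B$, context length $S$, parameter count $P$ with FP16 weights, and a per-token KV cache footprint of $c_{kv}$ bytes, one decoding step performs roughly $2PB$ FLOPs while moving about $2P + B \cdot S \cdot c_{kv}$ bytes, giving an arithmetic intensity of
$$\text{Intensity} \approx \frac{2PB}{2P + B \cdot S \cdot c_{kv}} \xrightarrow{B \to \infty} \frac{2P}{S \cdot c_{kv}}.$$
Weight loading is amortized over the batch, but KV loading is not; since the large-batch limit shrinks as $S$ grows, beyond a critical sequence length decoding remains memory-bound no matter how large the batch is, which is precisely the regime where speculative decoding keeps paying off.
</p>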
<div class="image-container" style="display: flex; flex-direction: column; align-items: center;">
<div style="display: flex; justify-content: space-around; width: 100%;">
<img src="static/images/kvloadtime.png" alt="KV Load Time" style="height: 250px; width: auto; margin: 0 20px;" />
<img src="static/images/arithmetic_intensity.png" alt="Arithmetic Intensity" style="height: 250px; width: auto; margin: 0 20px;" />
</div>
<figcaption style="margin-top: 10px; text-align: center;">
<strong>Figure 2: (Left) KV, weights, activations, and compute times for different batch sizes </strong>(Self Speculation LLaMA-2-7B with Budget 512, Prefill 4096)
<br>
<strong>(Right) Theoretical Arithmetic Intensity vs prefill length </strong>
(Self Speculation for LLaMA-2-7B and LLaMA-3.1-8B with Budget 512, Batch Size 256)
</figcaption>
</div>
<br>
<h4 class="title is-5">
<img src="static/images/Verification.png" style="height: 36px; display: inline; vertical-align: middle;"/>
Verification vs Target Latency
</h4>
<div style="display: flex; align-items: top; gap: 10px;">
<div style="flex: 1;">
<p>
A verification-to-target latency ratio close to 1 is ideal. While this holds for smaller batch sizes, verification becomes expensive for short sequences as the batch size grows. However, for sufficiently long sequences, as inference becomes memory-bound, the verification-to-target ratio approaches 1 again with increasing batch size.
</p>
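<p>
Under the same back-of-envelope model, verifying $\gamma$ speculated tokens per sequence loads the weights and KV cache once, just like a single target decoding step, so in the memory-bound regime $T_{\text{verify}}(\gamma) / T_{\text{target}} \approx 1$, whereas in the compute-bound regime the ratio grows roughly like $\gamma$. This is an illustrative approximation rather than a measured result.
</p>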
<div class="image-container" style="display: flex; flex-direction: column; align-items: center;">
<div style="display: flex; justify-content: space-around; width: 100%;">
<img src="static/images/verification_ratio_llama2self.png" alt="Ratio of Verification vs Target Time" style="height: 250px; width: auto; margin: 0 20px;" />
</div>
<figcaption style="margin-top: 10px; text-align: center;">
<strong>Figure 3: Theoretical Ratio of Verification to Target Time vs Batch Size</strong>
(Self Speculation LLaMA-2-7B with Budget 512, Prefill 4096)
</figcaption>
</div>
</div>
</div>
<br>
<h4 class="title is-5">
<img src="static/images/icons/Drafting.png" style="height: 36px; display: inline; vertical-align: middle;"/>
Draft vs Target Latency
</h4>
<p>
A draft-to-target cost ratio close to 0 is ideal. However, as sequence lengths and batch sizes grow, the target's KV cache scales linearly. In this regime, drafting with StreamingLLM and a fixed KV budget is particularly effective, as the ratio of draft cost to target autoregressive cost approaches 0.
</p>
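<p>
In the same sketch, a StreamingLLM draft with a fixed KV budget of $b$ tokens and parameter count $P_d \ll P$ moves roughly $2P_d + B \cdot b \cdot c_{kv}$ bytes per step, while the target moves $2P + B \cdot S \cdot c_{kv}$ bytes. With $b$ held fixed (e.g., 512) while $S$ grows, this ratio tends toward 0, which is the behavior described above.
</p>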
</div>
</div>
</div>
</div>
</section>
<!-- Section: Conclusion and Future Work -->
<section class="section hero is-light">
<div class="container is-fluid">
<div class="columns is-centered">
<div class="column is-full-width">
<h2 class="title is-3" style="text-align: center;">
<img src="static/images/icons/Telescope.png" style="height: 50px; display: inline; vertical-align: middle;"/>
Conclusion and Future Work
</h2>
<div class="content has-text-justified">
<p>
This work reassesses the trade-off between throughput and latency in long-context scenarios. We show that <em>speculative decoding can enhance throughput, reduce latency, and maintain accuracy</em>. Our theoretical and empirical analysis reveals that as the sequence length and batch size increase, the workload shifts from compute-bound to memory-bound. This shift enables effective use of speculative decoding for longer sequences, even with large batch sizes, achieving up to <strong>2x</strong> speedup for Llama-2-7B-32K and <strong>1.84x</strong> for Llama-3.1-8B on 8 A100 GPUs. These results highlight the need to integrate speculative decoding into throughput optimization systems as long-context workloads become more common.
</p>
</div>
<div class="has-text-centered">
<img src="static/images/icons/MagicDec.png" alt="<i>TriForce</i>" width="200" height="200" />
</div>
</div>
</div>
</div>
</section>
<!-- Section: References -->
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@article{ref,
title={MagicDec: Breaking the Latency-Throughput Tradeoff for Long Contexts with Speculative Decoding},
author={},
journal={},
year={2024}
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container is-fluid">
<div class="columns is-centered">
<div class="column is-full-width">
<div class="content">
<p>
This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Academic Project Page Template</a>, which was adapted from the <a href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
You are free to borrow the source code of this website; we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
Commons Attribution-ShareAlike 4.0 International License</a>. The icons were created by GPT-4.
</p>
</div>
</div>
</div>
</div>
</footer>