diff --git a/aurelio_sdk/client.py b/aurelio_sdk/client.py
index b567d5d..928f7de 100644
--- a/aurelio_sdk/client.py
+++ b/aurelio_sdk/client.py
@@ -10,7 +10,7 @@
 from requests_toolbelt.multipart.encoder import MultipartEncoder
 
 from aurelio_sdk.const import POLLING_INTERVAL, WAIT_TIME_BEFORE_POLLING
-from aurelio_sdk.exceptions import APIError, APITimeoutError
+from aurelio_sdk.exceptions import ApiError, ApiTimeoutError
 from aurelio_sdk.logger import logger
 from aurelio_sdk.schema import (
     ChunkingOptions,
@@ -105,18 +105,18 @@ def chunk(
                 error_content = response.json()
             except Exception:
                 error_content = response.text
-            raise APIError(
+            raise ApiError(
                 message=error_content,
                 status_code=response.status_code,
                 base_url=self.base_url,
             )
         except Exception as e:
-            raise APIError(message=str(e), base_url=self.base_url) from e
+            raise ApiError(message=str(e), base_url=self.base_url) from e
 
     def extract_file(
         self,
         file: Optional[Union[IO[bytes], bytes]] = None,
-        file_path: Optional[str] = None,
+        file_path: Optional[Union[str, pathlib.Path]] = None,
         quality: Literal["low", "high"] = "low",
         chunk: bool = True,
         wait: int = 30,
@@ -188,7 +188,7 @@ def extract_file(
                 error_content = response.json()
             except Exception:
                 error_content = response.text
-            raise APIError(
+            raise ApiError(
                 message=error_content,
                 status_code=response.status_code,
                 base_url=self.base_url,
@@ -210,11 +210,11 @@ def extract_file(
                 document_id=document_id, wait=wait, polling_interval=polling_interval
             )
         except requests.exceptions.Timeout:
-            raise APITimeoutError(
+            raise ApiTimeoutError(
                 timeout=session_timeout, base_url=self.base_url
             ) from None
         except Exception as e:
-            raise APIError(message=str(e), base_url=self.base_url) from e
+            raise ApiError(message=str(e), base_url=self.base_url) from e
 
     def extract_url(
         self,
@@ -273,7 +273,7 @@ def extract_url(
                 error_content = response.json()
             except Exception:
                 error_content = response.text
-            raise APIError(
+            raise ApiError(
                 message=error_content,
                 status_code=response.status_code,
                 base_url=self.base_url,
@@ -295,11 +295,11 @@ def extract_url(
                 document_id=document_id, wait=wait, polling_interval=polling_interval
             )
         except requests.exceptions.Timeout:
-            raise APITimeoutError(
+            raise ApiTimeoutError(
                 timeout=session_timeout, base_url=self.base_url
             ) from None
         except Exception as e:
-            raise APIError(
+            raise ApiError(
                 message=str(e),
                 base_url=self.base_url,
             ) from e
@@ -324,15 +324,15 @@ def get_document(self, document_id: str, timeout: int = 30) -> ExtractResponse:
                 error_content = response.json()
             except Exception:
                 error_content = response.text
-            raise APIError(
+            raise ApiError(
                 message=error_content,
                 status_code=response.status_code,
                 base_url=self.base_url,
             )
         except requests.exceptions.Timeout:
-            raise APITimeoutError(timeout=timeout, base_url=self.base_url) from None
+            raise ApiTimeoutError(timeout=timeout, base_url=self.base_url) from None
         except Exception as e:
-            raise APIError(message=str(e), base_url=self.base_url) from e
+            raise ApiError(message=str(e), base_url=self.base_url) from e
 
     def wait_for(
         self,
@@ -412,15 +412,15 @@ def embedding(
                 error_content = response.json()
             except Exception:
                 error_content = response.text
-            raise APIError(
+            raise ApiError(
                 message=error_content,
                 status_code=response.status_code,
                 base_url=self.base_url,
             )
         except requests.exceptions.Timeout:
-            raise APITimeoutError(timeout=timeout, base_url=self.base_url) from None
+            raise ApiTimeoutError(timeout=timeout, base_url=self.base_url) from None
         except Exception as e:
-            raise APIError(
+            raise ApiError(
                 message=str(e),
                 base_url=self.base_url,
             ) from e
diff --git a/aurelio_sdk/client_async.py b/aurelio_sdk/client_async.py
index 5f50584..c4d956c 100644
--- a/aurelio_sdk/client_async.py
+++ b/aurelio_sdk/client_async.py
@@ -15,7 +15,7 @@
     UPLOAD_CHUNK_SIZE,
     WAIT_TIME_BEFORE_POLLING,
 )
-from aurelio_sdk.exceptions import APIError, APITimeoutError
+from aurelio_sdk.exceptions import ApiError, ApiTimeoutError
 from aurelio_sdk.logger import logger
 from aurelio_sdk.schema import (
     ChunkingOptions,
@@ -120,24 +120,24 @@ async def chunk(
                         error_content = await response.json()
                     except Exception:
                         error_content = await response.text()
-                    raise APIError(
+                    raise ApiError(
                         message=error_content,
                         status_code=response.status,
                     )
         except asyncio.TimeoutError:
-            raise APITimeoutError(
+            raise ApiTimeoutError(
                 timeout=timeout,
                 base_url=self.base_url,
             ) from None
         except Exception as e:
-            raise APIError(message=str(e), base_url=self.base_url) from e
+            raise ApiError(message=str(e), base_url=self.base_url) from e
 
     async def extract_file(
         self,
         quality: Literal["low", "high"],
         chunk: bool,
         file: Optional[Union[IO[bytes], bytes]] = None,
-        file_path: Optional[str] = None,
+        file_path: Optional[Union[str, Path]] = None,
         wait: int = 30,
         polling_interval: int = POLLING_INTERVAL,
     ) -> ExtractResponse:
@@ -169,6 +169,13 @@ async def extract_file(
 
         client_url = f"{self.base_url}/v1/extract/file"
 
+        # Form data
+        data = aiohttp.FormData()
+        data.add_field("quality", quality)
+        data.add_field("chunk", str(chunk))
+        initial_wait = WAIT_TIME_BEFORE_POLLING if polling_interval > 0 else wait
+        data.add_field("wait", str(initial_wait))
+
         # Handle file from path, convert to AsyncGenerator
         if file_path:
             logger.debug(f"Uploading file from path, {file_path}")
@@ -176,30 +183,19 @@ async def extract_file(
                 raise FileNotFoundError(f"File not found: {file_path}")
             file_stream = _file_stream_generator(file_path)
             filename = Path(file_path).name
-        else:
-            filename = None
-
-        # Add file field
-        data = aiohttp.FormData()
-        if file_stream:
-            logger.debug("Uploading using stream")
             # Wrap the AsyncGenerator with an AsyncIterablePayload
             file_payload = aiohttp.payload.AsyncIterablePayload(value=file_stream)
+            file_payload.content_type
             data.add_field(
                 name="file",
                 value=file_payload,
                 filename=filename,
-                content_type="application/octet-stream",
+                content_type=file_payload.content_type,
             )
         else:
             logger.debug("Uploading file bytes")
-            data.add_field("file", file, filename=filename)
-
-        # Add other fields
-        data.add_field("quality", quality)
-        data.add_field("chunk", str(chunk))
-        initial_wait = WAIT_TIME_BEFORE_POLLING if polling_interval > 0 else wait
-        data.add_field("wait", str(initial_wait))
+            data.add_field("file", file)
 
         if wait <= 0:
             session_timeout = None
@@ -242,12 +238,12 @@ async def extract_file(
                 document_id=document_id, wait=wait, polling_interval=polling_interval
            )
         except asyncio.TimeoutError:
-            raise APITimeoutError(
+            raise ApiTimeoutError(
                 base_url=self.base_url,
                 timeout=session_timeout.total if session_timeout else None,
             ) from None
         except Exception as e:
-            raise APIError(
+            raise ApiError(
                 message=str(e), base_url=self.base_url, status_code=status_code
             ) from e
@@ -312,7 +308,7 @@ async def extract_url(
                     error_content = await response.json()
                 except Exception:
                     error_content = await response.text()
-                raise APIError(
+                raise ApiError(
                     message=error_content,
                     status_code=response.status,
                 )
@@ -333,12 +329,12 @@ async def extract_url(
                 document_id=document_id, wait=wait, polling_interval=polling_interval
             )
         except asyncio.TimeoutError:
-            raise APITimeoutError(
+            raise ApiTimeoutError(
                 base_url=self.base_url,
                 timeout=session_timeout.total if session_timeout else None,
             ) from None
         except Exception as e:
-            raise APIError(
+            raise ApiError(
                 message=str(e),
                 base_url=self.base_url,
             ) from e
@@ -370,12 +366,12 @@ async def get_document(
                     error_content = await response.json()
                 except Exception:
                     error_content = await response.text()
-                raise APIError(
+                raise ApiError(
                     message=error_content,
                     status_code=response.status,
                 )
         except aiohttp.ConnectionTimeoutError as e:
-            raise APITimeoutError(
+            raise ApiTimeoutError(
                 base_url=self.base_url,
                 timeout=session_timeout.total if session_timeout else None,
             ) from e
@@ -461,17 +457,17 @@ async def embedding(
                     error_content = await response.json()
                 except Exception:
                     error_content = await response.text()
-                raise APIError(
+                raise ApiError(
                     message=error_content,
                     status_code=response.status,
                 )
         except asyncio.TimeoutError:
-            raise APITimeoutError(
+            raise ApiTimeoutError(
                 base_url=self.base_url,
                 timeout=session_timeout.total if session_timeout else None,
             ) from None
         except Exception as e:
-            raise APIError(message=str(e), base_url=self.base_url) from e
+            raise ApiError(message=str(e), base_url=self.base_url) from e
 
 
 async def _file_stream_generator(
diff --git a/aurelio_sdk/exceptions.py b/aurelio_sdk/exceptions.py
index 5ddd8ee..5c7270b 100644
--- a/aurelio_sdk/exceptions.py
+++ b/aurelio_sdk/exceptions.py
@@ -2,7 +2,7 @@
 from typing import Optional, Union
 
 
-class APIError(Exception):
+class ApiError(Exception):
     """
     Exception for API errors.
     """
@@ -30,7 +30,7 @@ def __init__(
         super().__init__(full_message)
 
 
-class APITimeoutError(TimeoutError):
+class ApiTimeoutError(TimeoutError):
     """
     Exception for timeout errors.
     """
@@ -47,10 +47,3 @@ def __init__(
             message += f" Base URL: {base_url}"
 
         super().__init__(message)
-
-class FileNotFoundError(Exception):
-    """
-    Exception for file not found errors.
- """ - - pass diff --git a/examples/01_chunk_async.ipynb b/examples/01_chunk_async.ipynb index 683ce56..b3150e0 100644 --- a/examples/01_chunk_async.ipynb +++ b/examples/01_chunk_async.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -49,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -69,7 +69,7 @@ "})" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -112,9 +112,20 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "ChunkResponse(status=, usage=Usage(tokens=42937, pages=None, seconds=None), message=None, processing_options=ChunkingOptions(max_chunk_length=400, chunker_type='regex', window_size=1, delimiters=[]), document=ResponseDocument(id='doc_106b2d3b-b3b3-41c1-a6a5-745a9b364c31', content='# Mamba: Linear-Time Sequence Modeling with Selective State Spaces\\n# Albert Gu*1 and Tri Dao*2\\n1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me\\n# Abstract\\nFoundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâ\\x80\\x99 computational ineï¬\\x83ciency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token. Second, even though this change prevents the use of eï¬\\x83cient convolutions, we design a hardware-aware parallel algorithm in recurrent mode. We integrate these selective SSMs into a simpliï¬\\x81ed end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5Ã\\x97 higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.\\n# 1 Introduction\\nFoundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an eï¬\\x80ective paradigm in modern machine learning. 
The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014). While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015) The eï¬\\x83cacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data. However, this property brings fundamental drawbacks: an inability to model anything outside of a ï¬\\x81nite window, and quadratic scaling with respect to the window length. An enormous body of research has appeared on more eï¬\\x83cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬\\x80ective. As of yet, none of these variants have been shown to be empirically eï¬\\x80ective at scale across domains.\\nRecently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling. These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960). This class of models can be computed very eï¬\\x83ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length. Additionally, they have principled\\nEqual contribution.\\n1\\nmechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021). Many ï¬\\x82avors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 2022; Saon, Gupta, and Cui 2023). However, they have been less eï¬\\x80ective at modeling discrete and information-dense data such as text.\\nWe propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.\\nSelection Mechanism. First, we identify a key limitation of prior models: the ability to eï¬\\x83ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs). Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input. This allows the model to ï¬\\x81lter out irrelevant information and remember relevant information indeï¬\\x81nitely.\\nHardware-aware Algorithm. This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬\\x83cient. 
We overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬\\x80erent levels of the GPU memory hierarchy. The resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3Ã\\x97 faster on A100 GPUs).\\nArchitecture. We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces.\\nSelective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬\\x83ciency together yield performance improvements on real data up to sequence length 1M.\\nWe empirically validate Mambaâ\\x80\\x99s potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬\\x81c task performance, on several types of modalities and settings:\\nâ\\x80¢ Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬\\x81nitely long (>1M tokens).\\nâ\\x80¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences.\\nâ\\x80¢ Language Modeling. Mamba is the ï¬\\x81rst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). Our Mamba language model has 5Ã\\x97 generation throughput compared to Transformers of similar size, and Mamba-3Bâ\\x80\\x99s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B).\\nModel code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.\\n2\\n# Selective State Space Model\\n# with Hardware-aware State Expansion\\n# A\\nvuvy GPU SRAM Selection Mechanism es\\nSelection Mechanism\\nFigure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð\\x9d\\x90· = 5) of an input ð\\x9d\\x91¥ to output ð\\x9d\\x91¦ through a higher dimensional latent state â\\x84\\x8e (e.g. ð\\x9d\\x91\\x81 = 4). 
Prior SSMs avoid materializing this large effective state (ð\\x9d\\x90·ð\\x9d\\x91\\x81, times batch size ð\\x9d\\x90µ and sequence length ð\\x9d\\x90¿) through clever alternate computation paths requiring time-invariance: the (â\\x88\\x86, A, B, C) parameters are constant across time. Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy.\\n# 2 State Space Models\\nStructured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð\\x9d\\x91¥(ð\\x9d\\x91¡) â\\x88\\x88 â\\x84\\x9d â\\x86¦ ð\\x9d\\x91¦(ð\\x9d\\x91¡) â\\x88\\x88 â\\x84\\x9d through an implicit latent state â\\x84\\x8e(ð\\x9d\\x91¡) â\\x88\\x88 â\\x84\\x9dð\\x9d\\x91\\x81. Concretely, S4 models are deï¬\\x81ned with four parameters (â\\x88\\x86, A, B, C), which deï¬\\x81ne a sequence-to-sequence trans- formation in two stages.\\nâ\\x84\\x8eâ\\x80²(ð\\x9d\\x91¡) = Aâ\\x84\\x8e(ð\\x9d\\x91¡) + Bð\\x9d\\x91¥(ð\\x9d\\x91¡) ð\\x9d\\x91¦(ð\\x9d\\x91¡) = Câ\\x84\\x8e(ð\\x9d\\x91¡)\\n(1a) (1b) â\\x84\\x8eð\\x9d\\x91¡ = Aâ\\x84\\x8eð\\x9d\\x91¡â\\x88\\x921 + Bð\\x9d\\x91¥ð\\x9d\\x91¡ ð\\x9d\\x91¦ð\\x9d\\x91¡ = Câ\\x84\\x8eð\\x9d\\x91¡ (2a) (2b) ð\\x9d\\x91\\x98 ð\\x9d\\x91² = (Cð\\x9d\\x91©, Cð\\x9d\\x91¨ð\\x9d\\x91©, â\\x80¦ , Cð\\x9d\\x91¨ ð\\x9d\\x91¦ = ð\\x9d\\x91¥ â\\x88\\x97 ð\\x9d\\x91² ð\\x9d\\x91©, â\\x80¦ ) (3a) (3b)\\nDiscretization. The ï¬\\x81rst stage transforms the â\\x80\\x9ccontinuous parametersâ\\x80\\x9d (â\\x88\\x86, A, B) to â\\x80\\x9cdiscrete parametersâ\\x80\\x9d (A, B) through ï¬\\x81xed formulas A = ð\\x9d\\x91\\x93ð\\x9d\\x90´(â\\x88\\x86, A) and B = ð\\x9d\\x91\\x93ð\\x9d\\x90µ(â\\x88\\x86, A, B), where the pair (ð\\x9d\\x91\\x93ð\\x9d\\x90´, ð\\x9d\\x91\\x93ð\\x9d\\x90µ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬\\x81ned in equation (4).\\nA = exp(â\\x88\\x86A) B = (â\\x88\\x86A)â\\x88\\x921(exp(â\\x88\\x86A) â\\x88\\x92 I) â\\x8b\\x85 â\\x88\\x86B (4)\\nDiscretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023). It also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5. However, from a mechanical point of view discretization can simply be viewed as the ï¬\\x81rst step of the computation graph in the forward pass of an SSM. Alternate ï¬\\x82avors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about.\\nComputation. 
After the parameters have been transformed from (â\\x88\\x86, A, B, C) â\\x86¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3).\\n3\\nCommonly, the model uses the convolutional mode (3) for eï¬\\x83cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬\\x83cient autoregressive inference (where the inputs are seen one timestep at a time).\\nLinear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ\\x80\\x99s dynamics are constant through time. In other words (â\\x88\\x86, A, B, C), and consequently (A, B) as well, are ï¬\\x81xed for all time-steps. This property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models.\\nThus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental eï¬\\x83ciency constraints, discussed in Section 3.3. However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬\\x83ciency bottlenecks.\\nStructure and Dimensions. Finally, we note that structured SSMs are so named because computing them eï¬\\x83ciently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use. In this case, the A â\\x88\\x88 â\\x84\\x9dð\\x9d\\x91\\x81Ã\\x97ð\\x9d\\x91\\x81, B â\\x88\\x88 â\\x84\\x9dð\\x9d\\x91\\x81Ã\\x971, C â\\x88\\x88 â\\x84\\x9d1Ã\\x97ð\\x9d\\x91\\x81 matrices can all be represented by ð\\x9d\\x91\\x81 numbers. To operate over an input sequence ð\\x9d\\x91¥ of batch size ð\\x9d\\x90µ and length ð\\x9d\\x90¿ with ð\\x9d\\x90· channels, the SSM is applied independently to each channel. Note that in this case, the total hidden state has dimension ð\\x9d\\x90·ð\\x9d\\x91\\x81 per input, and computing it over the sequence length requires ð\\x9d\\x91\\x82(ð\\x9d\\x90µð\\x9d\\x90¿ð\\x9d\\x90·ð\\x9d\\x91\\x81) time and memory; this is the root of the fundamental eï¬\\x83ciency bottleneck addressed in Section 3.3.\\nGeneral State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state. It has been used to refer to many disparate concepts in diï¬\\x80erent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬\\x81lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning).\\nThroughout this entire paper we use the term â\\x80\\x9cSSMâ\\x80\\x9d to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. 
For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary.\\nSSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\\nâ\\x80¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM.\\nâ\\x80¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer.\\nâ\\x80¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021).\\nâ\\x80¢ RetNet (Y. Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions.\\n4\\nâ\\x80¢ RWKV (B. Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)). Its main â\\x80\\x9cWKVâ\\x80\\x9d mechanism involves LTI recurrences and can be viewed as the ratio of two SSMs.\\nOther closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM.\\n# 3 Selective State Space Models\\nWe motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\\x83ciently. We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5).\\n# 3.1 Motivation: Selection as a Means of Compression\\nWe argue that a fundamental problem of sequence modeling is compressing context into a smaller state. In fact, we can view the tradeoï¬\\x80s of popular sequence models from this point of view. For example, attention is both eï¬\\x80ective and ineï¬\\x83cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬\\x83cient because they have a ï¬\\x81nite state, implying constant-time inference and linear-time training. 
However, their eï¬\\x80ectiveness is limited by how well this state has compressed the context.\\nTo understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\\nâ\\x80¢ The Selective Copying task modiï¬\\x81es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬\\x81lter out the irrelevant ones (white).\\nâ\\x80¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black).\\nThese tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬\\x80ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬\\x83culty with the Selective Copying task because of lack of content-awareness (Figure 2). More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\\nIn summary, the eï¬\\x83ciency vs. eï¬\\x80ectiveness tradeoï¬\\x80 of sequence models is characterized by how well they compress their state: eï¬\\x83cient models must have a small state, while eï¬\\x80ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬\\x81lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).\\n# Improving SSMs with Selection\\nOne method of incorporating a selection mechanism into models is by letting their parameters that aï¬\\x80ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the convolution kernel of a CNN) be input-dependent.\\n5\\nCopying Output noo am > mt HE nee Tt Solution\\n# Tetons\\n|\\n# oO S lective Copying\\n# aoe\\n# i)\\n# [coe\\n# Induction Heads\\n# EES\\n>\\n# fo\\nPerfectly solved by LTI (e.g. convolutional) models that do not need to look at the actual inputs\\nHi i Hl ] Bw H a H > BH\\nFigure 2: (Left) The standard version of the Copying task involves constant spacing between input and output elements and is easily solved by time-invariant models such as linear recurrences and global convolutions. (Right Top) The Selective Copying task has random spacing in between inputs and requires time-varying models that can selectively remember or ignore inputs depending on their content. 
(Right Bottom) The Induction Heads task is an example of associative recall that requires retrieving an answer based on context, a key ability for LLMs.\\nAlgorithm 2 SSM + Selection (S6) Input: ð\\x9d\\x91¥ â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³) Output: ð\\x9d\\x91¦ â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³) 1: A â\\x88¶ (ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b â\\x8a³ Represents structured ð\\x9d\\x91\\x81 Ã\\x97 ð\\x9d\\x91\\x81 matrix â\\x8a³ Represents structured ð\\x9d\\x91\\x81 Ã\\x97 ð\\x9d\\x91\\x81 matrix 2: B â\\x88¶ (ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b 3: C â\\x88¶ (ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b 4: â\\x88\\x86 â\\x88¶ (ð\\x9d\\x99³) â\\x86\\x90 ð\\x9d\\x9c\\x8fâ\\x88\\x86(ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b) 5: A, B â\\x88¶ (ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96½ð\\x9d\\x97\\x82ð\\x9d\\x97\\x8cð\\x9d\\x96¼ð\\x9d\\x97\\x8bð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x97\\x82ð\\x9d\\x97\\x93ð\\x9d\\x96¾(â\\x88\\x86, A, B) 6: ð\\x9d\\x91¦ â\\x86\\x90 ð\\x9d\\x96²ð\\x9d\\x96²ð\\x9d\\x96¬(A, B, C)(ð\\x9d\\x91¥) 2: B â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x91\\xa0ð\\x9d\\x90µ(ð\\x9d\\x91¥) 3: C â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x91\\xa0ð\\x9d\\x90¶(ð\\x9d\\x91¥) 4: â\\x88\\x86 â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³) â\\x86\\x90 ð\\x9d\\x9c\\x8fâ\\x88\\x86(ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b+ð\\x9d\\x91\\xa0â\\x88\\x86(ð\\x9d\\x91¥)) 5: A, B â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96½ð\\x9d\\x97\\x82ð\\x9d\\x97\\x8cð\\x9d\\x96¼ð\\x9d\\x97\\x8bð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x97\\x82ð\\x9d\\x97\\x93ð\\x9d\\x96¾(â\\x88\\x86, A, B) 6: ð\\x9d\\x91¦ â\\x86\\x90 ð\\x9d\\x96²ð\\x9d\\x96²ð\\x9d\\x96¬(A, B, C)(ð\\x9d\\x91¥) â\\x8a³ Time-invariant: recurrence or convolution â\\x8a³ Time-varying: recurrence (scan) only 7: return ð\\x9d\\x91¦ 7: return ð\\x9d\\x91¦\\nAlgorithms 1 and 2 illustrates the main selection mechanism that we use. The main diï¬\\x80erence is simply making several parameters â\\x88\\x86, B, C functions of the input, along with the associated changes to tensor shapes throughout. In particular, we highlight that these parameters now have a length dimension ð\\x9d\\x90¿, meaning that the model has changed from time-invariant to time-varying. (Note that shape annotations were described in Section 2). 
This loses the equivalence to convolutions (3) with implications for its eï¬\\x83ciency, discussed next.\\nWe speciï¬\\x81cally choose ð\\x9d\\x91\\xa0ð\\x9d\\x90µ(ð\\x9d\\x91¥) = ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x91\\x81(ð\\x9d\\x91¥), ð\\x9d\\x91\\xa0ð\\x9d\\x90¶(ð\\x9d\\x91¥) = ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x91\\x81(ð\\x9d\\x91¥), ð\\x9d\\x91\\xa0â\\x88\\x86(ð\\x9d\\x91¥) = ð\\x9d\\x96¡ð\\x9d\\x97\\x8bð\\x9d\\x97\\x88ð\\x9d\\x96ºð\\x9d\\x96½ð\\x9d\\x96¼ð\\x9d\\x96ºð\\x9d\\x97\\x8cð\\x9d\\x97\\x8dð\\x9d\\x90·(ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8b1(ð\\x9d\\x91¥)), and ð\\x9d\\x9c\\x8fâ\\x88\\x86 = ð\\x9d\\x97\\x8cð\\x9d\\x97\\x88ð\\x9d\\x96¿ð\\x9d\\x97\\x8dð\\x9d\\x97\\x89ð\\x9d\\x97\\x85ð\\x9d\\x97\\x8eð\\x9d\\x97\\x8c, where ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x91\\x91 is a parameterized projection to dimension ð\\x9d\\x91\\x91. The choice of ð\\x9d\\x91\\xa0â\\x88\\x86 and ð\\x9d\\x9c\\x8fâ\\x88\\x86 is due to a connection to RNN gating mechanisms explained in Section 3.5.\\n# 3.3 Efficient Implementation of Selective SSMs\\nHardware-friendly architectures such as convolutions (Krizhevsky, Sutskever, and Hinton 2012) and Transform- ers (Vaswani et al. 2017) enjoy widespread application. Here we aim to make selective SSMs eï¬\\x83cient on modern hardware (GPU) as well. The selection mechanism is quite natural, and earlier works attempted to incorporate special cases of selection, such as letting â\\x88\\x86 vary over time in recurrent SSMs (Gu, Dao, et al. 2020). However, as previously mentioned a core limitation in the usage of SSMs is their computational eï¬\\x83ciency, which was why S4 and all derivatives used LTI (non-selective) models, most commonly in the form of global convolutions.\\n# 3.3.1 Motivation of Prior Models\\nWe ï¬\\x81rst revisit this motivation and overview our approach to overcome limitations of prior methods.\\nâ\\x80¢ At a high level, recurrent models such as SSMs always balance a tradeoï¬\\x80 between expressivity and speed: as discussed in Section 3.1, models with larger hidden state dimension should be more eï¬\\x80ective but slower. Thus\\n6\\nwe want to maximize hidden state dimension without paying speed and memory costs.\\nâ\\x80¢ Note that the recurrent mode is more ï¬\\x82exible than the convolution mode, since the latter (3) is derived from expanding the former (2) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021). However, this would require computing and materializing the latent state â\\x84\\x8e with shape (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³, ð\\x9d\\x99½), much larger (by a factor of ð\\x9d\\x91\\x81, the SSM state dimension) than the input ð\\x9d\\x91¥ and output ð\\x9d\\x91¦ of shape (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³). 
Thus the more eï¬\\x83cient convolution mode was introduced which could bypass the state computation and materializes a convolution kernel (3a) of only (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³).\\nâ\\x80¢ Prior LTI SSMs leverage the dual recurrent-convolutional forms to increase the eï¬\\x80ective state dimension by a factor of ð\\x9d\\x91\\x81 (â\\x89\\x88 10 â\\x88\\x92 100), much larger than traditional RNNs, without eï¬\\x83ciency penalties.\\n# 3.3.2 Overview of Selective Scan: Hardware-Aware State Expansion\\nThe selection mechanism is designed to overcome the limitations of LTI models; at the same time, we therefore need to revisit the computation problem of SSMs. We address this with three classical techniques: kernel fusion, parallel scan, and recomputation. We make two main observations:\\nâ\\x80¢ The naive recurrent computation uses ð\\x9d\\x91\\x82(ð\\x9d\\x90µð\\x9d\\x90¿ð\\x9d\\x90·ð\\x9d\\x91\\x81) FLOPs while the convolutional computation uses ð\\x9d\\x91\\x82(ð\\x9d\\x90µð\\x9d\\x90¿ð\\x9d\\x90· log(ð\\x9d\\x90¿)) FLOPs, and the former has a lower constant factor. Thus for long sequences and not-too-large state dimension ð\\x9d\\x91\\x81, the recurrent mode can actually use fewer FLOPs.\\nâ\\x80¢ The two challenges are the sequential nature of recurrence, and the large memory usage. To address the latter, just like the convolutional mode, we can attempt to not actually materialize the full state â\\x84\\x8e.\\nThe main idea is to leverage properties of modern accelerators (GPUs) to materialize the state â\\x84\\x8e only in more eï¬\\x83cient levels of the memory hierarchy. In particular, most operations (except matrix multiplication) are bounded by memory bandwidth (Dao, Fu, Ermon, et al. 2022; Ivanov et al. 2021; Williams, Waterman, and Patterson 2009). This includes our scan operation, and we use kernel fusion to reduce the amount of memory IOs, leading to a signiï¬\\x81cant speedup compared to a standard implementation.\\nConcretely, instead of preparing the scan input (A, B) of size (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³, ð\\x9d\\x99½) in GPU HBM (high-bandwidth memory), we load the SSM parameters (â\\x88\\x86, A, B, C) directly from slow HBM to fast SRAM, perform the discretization and recurrence in SRAM, and then write the ï¬\\x81nal outputs of size (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³) back to HBM.\\nTo avoid the sequential recurrence, we observe that despite not being linear it can still be parallelized with a work-eï¬\\x83cient parallel scan algorithm (Blelloch 1990; Martin and Cundy 2018; Smith, Warrington, and Linderman 2023).\\nFinally, we must also avoid saving the intermediate states, which are necessary for backpropagation. We carefully apply the classic technique of recomputation to reduce the memory requirements: the intermediate states are not stored but recomputed in the backward pass when the inputs are loaded from HBM to SRAM. As a result, the fused selective scan layer has the same memory requirements as an optimized transformer implementation with FlashAttention.\\nDetails of the fused kernel and recomputation are in Appendix D. The full Selective SSM layer and algorithm is illustrated in Figure 1.\\n# 3.4 A Simplified SSM Architecture\\nAs with structured SSMs, selective SSMs are standalone sequence transformations that can be ï¬\\x82exibly incorporated into neural networks. 
The H3 architecture is the basis for the most well-known SSM architectures (Section 2), which are generally comprised of a block inspired by linear attention interleaved with an MLP (multi-layer perceptron) block. We simplify this architecture by combining these two components into one, which is stacked homogenously (Figure 3). This is inspired by the gated attention unit (GAU) (Hua et al. 2022), which did something similar for attention.\\nThis architecture involves expanding the model dimension ð\\x9d\\x90· by a controllable expansion factor ð\\x9d\\x90¸. For each block, most of the parameters (3ð\\x9d\\x90¸ð\\x9d\\x90·2) are in the linear projections (2ð\\x9d\\x90¸ð\\x9d\\x90·2 for input projections, ð\\x9d\\x90¸ð\\x9d\\x90·2 for output projection) while the inner SSM contributes less. The number of SSM parameters (projections for â\\x88\\x86, B, C, and\\n7\\nLinear projection Sequence transformation Nonlinearity (activation multiplication) H3 ®@ Gated MLP â\\x80\\x94 Mamba\\n# or\\nFigure 3: (Architecture.) Our simplified block design combines the H3 block, which is the basis of most SSM architectures, with the ubiquitous MLP block of modern neural networks. Instead of interleaving these two blocks, we simply repeat the Mamba block homogenously. Compared to the H3 block, Mamba replaces the first multiplicative gate with an activation function. Compared to the MLP block, Mamba adds an SSM to the main branch. For ð\\x9d\\x9c\\x8e we use the SiLU / Swish activation (Hendrycks and Gimpel 2016; Ramachandran, Zoph, and Quoc V Le 2017).\\nthe matrix A) are much smaller in comparison. We repeat this block, interleaved with standard normalization and residual connections, to form the Mamba architecture. We always ï¬\\x81x to ð\\x9d\\x90¸ = 2 in our experiments and use two stacks of the block to match the 12ð\\x9d\\x90·2 parameters of a Transformerâ\\x80\\x99s interleaved MHA (multi-head attention) and MLP blocks. We use the SiLU / Swish activation function (Hendrycks and Gimpel 2016; Ramachandran, Zoph, and Quoc V Le 2017), motivated so that the Gated MLP becomes the popular â\\x80\\x9cSwiGLUâ\\x80\\x9d variant (Chowdhery et al. 2023; Shazeer 2020; Touvron et al. 2023). Finally, we additionally use an optional normalization layer (we choose LayerNorm (J. L. Ba, Kiros, and Hinton 2016)), motivated by RetNetâ\\x80\\x99s usage of a normalization layer in a similar location (Y. Sun et al. 2023).\\n# 3.5 Properties of Selection Mechanisms\\nThe selection mechanism is a broader concept that can be applied in diï¬\\x80erent ways, such as to more traditional RNNs or CNNs, to diï¬\\x80erent parameters (e.g. A in Algorithm 2), or using diï¬\\x80erent transformations ð\\x9d\\x91\\xa0(ð\\x9d\\x91¥).\\n# 3.5.1 Connection to Gating Mechanisms\\nWe highlight the most important connection: the classical gating mechanism of RNNs is an instance of our selection mechanism for SSMs. We note that the connection between RNN gating and the discretization of continuous-time systems is well established (Funahashi and Nakamura 1993; Tallec and Ollivier 2018). In fact, Theorem 1 is an improvement of Gu, Johnson, Goel, et al. (2021, Lemma 3.1) generalizing to the ZOH discretization and input-dependent gates (proof in Appendix C). More broadly, â\\x88\\x86 in SSMs can be seen to play a generalized role of the RNN gating mechanism. In line with prior work, we adopt the view that discretization of SSMs is the principled foundation of heuristic gating mechanisms.\\nTheorem 1. 
When ð\\x9d\\x91\\x81 = 1, A = â\\x88\\x921, B = 1, ð\\x9d\\x91\\xa0â\\x88\\x86 = ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8b(ð\\x9d\\x91¥), and ð\\x9d\\x9c\\x8fâ\\x88\\x86 = ð\\x9d\\x97\\x8cð\\x9d\\x97\\x88ð\\x9d\\x96¿ð\\x9d\\x97\\x8dð\\x9d\\x97\\x89ð\\x9d\\x97\\x85ð\\x9d\\x97\\x8eð\\x9d\\x97\\x8c, then the selective SSM recurrence (Algorithm 2) takes the form\\nð\\x9d\\x91\\x94ð\\x9d\\x91¡ = ð\\x9d\\x9c\\x8e(ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8b(ð\\x9d\\x91¥ð\\x9d\\x91¡)) â\\x84\\x8eð\\x9d\\x91¡ = (1 â\\x88\\x92 ð\\x9d\\x91\\x94ð\\x9d\\x91¡)â\\x84\\x8eð\\x9d\\x91¡â\\x88\\x921 + ð\\x9d\\x91\\x94ð\\x9d\\x91¡ð\\x9d\\x91¥ð\\x9d\\x91¡. (5)\\nAs mentioned in Section 3.2, our speciï¬\\x81c choices of ð\\x9d\\x91\\xa0â\\x88\\x86, ð\\x9d\\x9c\\x8fâ\\x88\\x86 is from this connection. In particular, note that if a given input ð\\x9d\\x91¥ð\\x9d\\x91¡ should be completely ignored (as necessary in the synthetic tasks), all ð\\x9d\\x90· channels should ignore it, and so we project the input down to 1 dimension before repeating/broadcasting with â\\x88\\x86.\\n8\\n# Interpretation of Selection Mechanisms\\nWe elaborate on two particular mechanistic eï¬\\x80ects of selection.\\nVariable Spacing. Selectivity allows ï¬\\x81ltering out irrelevant noise tokens that may occur between inputs of interest. This is exempliï¬\\x81ed by the Selective Copying task, but occurs ubiquitously in common data modalities, particularly for discrete data â\\x80\\x93 for example the presence of language ï¬\\x81llers such as â\\x80\\x9cumâ\\x80\\x9d. This property arises because the model can mechanistically ï¬\\x81lter out any particular input ð\\x9d\\x91¥ð\\x9d\\x91¡, for example in the gated RNN case (Theorem 1) when ð\\x9d\\x91\\x94ð\\x9d\\x91¡ â\\x86\\x92 0.\\nIt has been empirically observed that many sequence models do not improve with longer Filtering Context. context (F. Shi et al. 2023), despite the principle that more context should lead to strictly better performance. An explanation is that many sequence models cannot eï¬\\x80ectively ignore irrelevant context when necessary; an intuitive example are global convolutions (and general LTI models). On the other hand, selective models can simply reset their state at any time to remove extraneous history, and thus their performance in principle improves monotonicly with context length (e.g. Section 4.3.2).\\nIn settings where multiple independent sequences are stitched together, Transformers Boundary Resetting. can keep them separate by instantiating a particular attention mask, while LTI models will bleed information between the sequences. Selective SSMs can also reset their state at boundaries (e.g. â\\x88\\x86ð\\x9d\\x91¡ â\\x86\\x92 â\\x88\\x9e or Theorem 1 when ð\\x9d\\x91\\x94ð\\x9d\\x91¡ â\\x86\\x92 1). These settings may occur artiï¬\\x81cially (e.g. packing documents together to improve hardware utilization) or naturally (e.g. episode boundaries in reinforcement learning (Lu et al. 2023)).\\nAdditionally, we elaborate on eï¬\\x80ects of each selective parameter.\\nIn general, â\\x88\\x86 controls the balance between how much to focus or ignore the current input Interpretation of â\\x88\\x86. ð\\x9d\\x91¥ð\\x9d\\x91¡. It generalizes RNN gates (e.g. ð\\x9d\\x91\\x94ð\\x9d\\x91¡ in Theorem 1), mechanically, a large â\\x88\\x86 resets the state â\\x84\\x8e and focuses on the current input ð\\x9d\\x91¥, while a small â\\x88\\x86 persists the state and ignores the current input. 
SSMs (1)-(2) can be interpreted as a continuous system discretized by a timestep â\\x88\\x86, and in this context the intuition is that large â\\x88\\x86 â\\x86\\x92 â\\x88\\x9e represents the system focusing on the current input for longer (thus â\\x80\\x9cselectingâ\\x80\\x9d it and forgetting its current state) while a small â\\x88\\x86 â\\x86\\x92 0 represents a transient input that is ignored.\\nInterpretation of A. We remark that while the A parameter could also be selective, it ultimately aï¬\\x80ects the model only through its interaction with â\\x88\\x86 via A = exp(â\\x88\\x86A) (the discretization (4)). Thus selectivity in â\\x88\\x86 is enough to ensure selectivity in (A, B), and is the main source of improvement. We hypothesize that making A selective in addition to (or instead of) â\\x88\\x86 would have similar performance, and leave it out for simplicity.\\nInterpretation of B and C. As discussed in Section 3.1, the most important property of selectivity is ï¬\\x81ltering out irrelevant information so that a sequence modelâ\\x80\\x99s context can be compressed into an eï¬\\x83cient state. In an SSM, modifying B and C to be selective allows ï¬\\x81ner-grained control over whether to let an input ð\\x9d\\x91¥ð\\x9d\\x91¡ into the state â\\x84\\x8eð\\x9d\\x91¡ or the state into the output ð\\x9d\\x91¦ð\\x9d\\x91¡. These can be interpreted as allowing the model to modulate the recurrent dynamics based on content (input) and context (hidden states) respectively.\\n3.6 Additional Model Details Real vs. Complex. Most prior SSMs use complex numbers in their state â\\x84\\x8e, which is necessary for strong performance on many tasks (Gu, Goel, and Ré 2022). However, it has been empirically observed that completely real-valued SSMs seem to work ï¬\\x81ne, and possibly even better, in some settings (Ma et al. 2023). We use real values as the default, which work well for all but one of our tasks; we hypothesize that the complex-real tradeoï¬\\x80 is related to the continuous-discrete spectrum in data modalities, where complex numbers are helpful for continuous modalities (e.g. audio, video) but not discrete (e.g. text, DNA).\\n9\\nInitialization. Most prior SSMs also suggest special initializations, particularly in the complex-valued case, which can help in several settings such as low-data regimes. Our default initialization for the complex case is S4D-Lin and for the real case is S4D-Real (Gu, Gupta, et al. 2022), which is based on the HIPPO theory (Gu, Dao, et al. 2020). These deï¬\\x81ne the ð\\x9d\\x91\\x9b-th element of A as â\\x88\\x921â\\x88\\x952 + ð\\x9d\\x91\\x9bð\\x9d\\x91\\x96 and â\\x88\\x92(ð\\x9d\\x91\\x9b + 1) respectively. However, we expect many initializations to work ï¬\\x81ne, particularly in the large-data and real-valued SSM regimes; some ablations are considered in Section 4.6.\\nParameterization of â\\x88\\x86. We deï¬\\x81ned the selective adjustment to â\\x88\\x86 as ð\\x9d\\x91\\xa0â\\x88\\x86(ð\\x9d\\x91¥) = ð\\x9d\\x96¡ð\\x9d\\x97\\x8bð\\x9d\\x97\\x88ð\\x9d\\x96ºð\\x9d\\x96½ð\\x9d\\x96¼ð\\x9d\\x96ºð\\x9d\\x97\\x8cð\\x9d\\x97\\x8dð\\x9d\\x90·(ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8b1(ð\\x9d\\x91¥)), which was motivated by the mechanics of â\\x88\\x86 (Section 3.5). We observe that it can be generalized from dimension 1 to a larger dimension ð\\x9d\\x9a\\x81. We set this to be a small fraction of ð\\x9d\\x99³, which uses a negligible number of parameters compared to the main Linear projections in the block. 
We additionally note that the broadcasting operation can instead be viewed as another Linear projection, initialized to a speciï¬\\x81c pattern of 1â\\x80\\x99s and 0â\\x80\\x99s; if this projection is trainable, this leads to the alternative ð\\x9d\\x91\\xa0â\\x88\\x86(ð\\x9d\\x91¥) = ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x90·(ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x91\\x85(ð\\x9d\\x91¥)), which can be viewed as a low-rank projection. In our experiments, the â\\x88\\x86 parameter (which can be viewed as a bias term) is initialized to ð\\x9d\\x9c\\x8fâ\\x88\\x921 â\\x88\\x86 following prior work on SSMs (Gu, Johnson, Timalsina, et al. 2023).\\nRemark 3.1. For brevity in our experimental results, we sometimes abbreviate selective SSMs as S6 models, because they are S4 models with a selection mechanism and computed with a scan.\\n# 4 Empirical Evaluation\\nIn Section 4.1 we test Mambaâ\\x80\\x99s ability to solve the two synthetic tasks motivated in Section 3.1. We then evaluate on three domains, each evaluated on autoregressive pretraining as well as downstream tasks.\\nSection 4.2: language model pretraining (scaling laws), and zero-shot downstream evaluation.\\nSection 4.3: DNA sequence pretraining, and ï¬\\x81ne-tuning on a long-sequence classiï¬\\x81cation task.\\nSection 4.4: audio waveform pretraining, and the quality of autoregressively generated speech clips.\\nFinally, Section 4.5 shows Mambaâ\\x80\\x99s computational eï¬\\x83ciency at both training and inference time, and Section 4.6 ablates various components of the architecture and selective SSMs.\\n# 4.1 Synthetic Tasks\\nFull experiment details for these tasks including task details and training protocol are in Appendix E.1.\\n# 4.1.1 Selective Copying\\nThe Copying task is one of the most well-studied synthetic tasks for sequence modeling, originally designed to test the memorization abilities of recurrent models. As discussed in Section 3.1, LTI SSMs (linear recurrences and global convolutions) can easily solve this task by only keeping track of time instead of reasoning about the data; for example, by constructing a convolution kernel of exactly the right length (Figure 2). This was explicitly validated in earlier work on global convolutions (Romero et al. 2021). The Selective Copying task prevents this shortcut by randomizing the spacing between tokens. Note that this task has been introduced before as the Denoising task (Jing et al. 2019).\\nNote that many previous works argue that adding architecture gating (multiplicative interactions) can endow models with â\\x80\\x9cdata-dependenceâ\\x80\\x9d and solve related tasks (Dao, Fu, Saab, et al. 2023; Poli et al. 2023). However, we ï¬\\x81nd this explanation insuï¬\\x83cient intuitively because such gating does not interact along the sequence axis, and cannot aï¬\\x80ect the spacing between tokens. In particular architecture gating is not an instance of a selection mechanism (Appendix A).\\nTable 1 conï¬\\x81rms that gated architectures such as H3 and Mamba only partially improve performance, while the selection mechanism (modifying S4 to S6) easily solves this task, particularly when combined with these more powerful architectures.\\n10\\nModel Arch. Layer Acc. 
S4 - No gate No gate S4 S6 18.3 97.0 H3 Hyena - H3 H3 H3 S4 Hyena S6 57.0 30.1 99.7 - - Mamba Mamba Mamba Mamba Hyena S4 S6 56.4 28.4 99.8\\nInduction Heads Extrapolation\\nExtrapolation 1.05 \\' â\\x80\\x94â\\x80\\x94 Mua-Absotute 08] ; â\\x80\\x94â\\x80\\x94 MHA-RoPE i =~ MHA-xPos 6) i â\\x80\\x94 HB oa = byena \\' Random 1 ran benath 0.0 , ; ; : , 10° 10° 108 10° 10° Test Sequence Length\\n> g 8\\nTable 1: (Selective Copying.) Accuracy for combinations of architectures and inner sequence layers.\\nTable 2: (Induction Heads.) Models are trained on sequence length 28 = 256, and tested on increasing sequence lengths of 26 = 64 up to 220 = 1048576. Full numbers in Table 11.\\n# 4.1.2 Induction Heads\\nInduction heads (Olsson et al. 2022) is a simple task from the mechanistic interpretability lens (Elhage et al. 2021) that is surprisingly predictive of the in-context learning ability of LLMs. It requires models to perform associative recall and copy: for example, if the model has seen a bigram such as â\\x80\\x9cHarry Potterâ\\x80\\x9d in the sequence, then the next time â\\x80\\x9cHarryâ\\x80\\x9d appears in the same sequence, the model should be able to predict â\\x80\\x9cPotterâ\\x80\\x9d by copying from history.\\nDataset. We train a 2-layer model on the induction heads task at sequence length 256, with a vocab size of 16, which is comparable to prior work on this task (Dao, Fu, Saab, et al. 2023) but with longer sequences. We additionally investigate generalization and extrapolation abilities by evaluating on a range of sequence lengths from 26 = 64 up to 220 = 1048576 at test time.\\nModels. Following established work on induction heads, we use 2 layer models, which allows attention to mechanistically solve the induction heads task (Olsson et al. 2022). We test both multi-head attention (8 heads, with various positional encodings) and SSM variants. We use a model dimension ð\\x9d\\x90· of 64 for Mamba and 128 for the other models.\\nResults. Table 2 shows that Mambaâ\\x80\\x94or more precisely, its selective SSM layerâ\\x80\\x94has the ability to solve the task perfectly because of its ability to selectively remember the relevant token while ignoring everything else in between. It generalizes perfectly to million-length sequences, or 4000Ã\\x97 longer than it saw during training, while no other method goes beyond 2Ã\\x97.\\nOut of positional encoding variants for attention models, xPos (which was designed for length extrapolation) is slightly better than the others; also note that all attention models were only tested up to sequence length 214 = 16384 due to memory limitations. Out of other SSMs, H3 and Hyena are similar, contrary to the ï¬\\x81ndings in Poli et al. (2023).\\n# 4.2 Language Modeling\\nWe evaluate the Mamba architecture on standard autoregressive language modeling against other architectures, on both pretraining metrics (perplexity) and zero-shot evaluations. We set the model sizes (depth and width) to mirror GPT3 speciï¬\\x81cations. We use the Pile dataset (L. Gao, Biderman, et al. 2020), and follow the training recipe described in Brown et al. (2020). 
All training details are in Appendix E.2.

# 4.2.1 Scaling Laws

For baselines, we compare against the standard Transformer architecture (GPT3 architecture), as well as the strongest Transformer recipe we know of (here referred to as Transformer++), based on the PaLM and LLaMa architectures (e.g. rotary embedding, SwiGLU MLP, RMSNorm instead of LayerNorm, no linear bias, and higher learning rates). We also compare against other recent subquadratic architectures (Figure 4). All model details are in Appendix E.2.

[Figure: two panels, "Scaling Laws on The Pile (Sequence Length 2048)" and "Scaling Laws on The Pile (Sequence Length 8192)", plotting perplexity against FLOPs (log scale) for Hyena, RWKV, Transformer, RetNet, H3, Transformer++, and Mamba.]

Figure 4: (Scaling Laws.) Models of size ≈ 125M to ≈ 1.3B parameters, trained on the Pile. Mamba scales better than all other attention-free models and is the first to match the performance of a very strong "Transformer++" recipe that has now become standard, particularly as the sequence length grows.

Figure 4 shows scaling laws under the standard Chinchilla (Hoffmann et al. 2022) protocol, on models from ≈ 125M to ≈ 1.3B parameters. Mamba is the first attention-free model to match the performance of a very strong Transformer recipe (Transformer++) that has now become standard, particularly as the sequence length grows. We note that full results on context length 8k are missing for the RWKV and RetNet baselines, prior strong recurrent models that can also be interpreted as SSMs, due to a lack of efficient implementation leading to out-of-memory or unrealistic computation requirements.

# 4.2.2 Downstream Evaluations

Table 3 shows the performance of Mamba on a range of popular downstream zero-shot evaluation tasks. We compare against the most well-known open source models at these sizes, most importantly Pythia (Biderman et al. 2023) and RWKV (B. Peng et al. 2023) which were trained with the same tokenizer, dataset, and training length (300B tokens) as our models. (Note that Mamba and Pythia are trained with context length 2048, while RWKV was trained with context length 1024.)

# 4.3 DNA Modeling

Motivated by the success of large language models, there has been recent exploration into using the foundation model paradigm for genomics. DNA has been likened to language in that it consists of sequences of discrete tokens with a finite vocab. It is also known for requiring long-range dependencies to model (Avsec et al. 2021). We investigate Mamba as a FM backbone for pretraining and fine-tuning in the same setting as recent works on long-sequence models for DNA (Nguyen, Poli, et al. 2023). In particular, we focus on two explorations of scaling laws across model size and sequence length (Figure 5), and a difficult downstream synthetic classification task requiring long context (Figure 6).

For pretraining, we largely follow a standard causal language modeling (next token prediction) setup for the training and model details (see also Appendix E.2).
For the dataset, we largely follow the setup of HyenaDNA (Nguyen, Poli, et al. 2023), which uses the HG38 dataset for pretraining consisting of a single human genome with about 4.5 billion tokens (DNA base pairs) in the training split.

# 4.3.1 Scaling: Model Size

In this experiment, we investigate the scaling properties of genomics foundation models with various model backbones (Figure 5 Left).

Training. To advantage the baselines, we train on a short sequence length of 1024; as shown in Section 4.3.2, we expect results to favor Mamba even more at longer sequence lengths. We fix a global batch size of 1024, for a total of 2^20 ≈ 1M tokens per batch. Models were trained for 10K gradient steps for a total of 10B tokens.

Table 3: (Zero-shot Evaluations.) Best results for each size in bold. We compare against open source LMs with various tokenizers, trained for up to 300B tokens. Pile refers to the validation split, comparing only against models trained on the same dataset and tokenizer (GPT-NeoX-20B). For each model size, Mamba is best-in-class on every single evaluation result, and generally matches baselines at twice the model size.

| Model | Token. | Pile ppl ↓ | LAMBADA ppl ↓ | LAMBADA acc ↑ | HellaSwag acc ↑ | PIQA acc ↑ | Arc-E acc ↑ | Arc-C acc ↑ | WinoGrande acc ↑ | Average acc ↑ |
|---|---|---|---|---|---|---|---|---|---|---|
| Hybrid H3-130M | GPT2 | - | 89.48 | 25.77 | 31.7 | 64.2 | 44.4 | 24.2 | 50.6 | 40.1 |
| Pythia-160M | NeoX | 29.64 | 38.10 | 33.0 | 30.2 | 61.4 | 43.2 | 24.1 | 51.9 | 40.6 |
| Mamba-130M | NeoX | 10.56 | 16.07 | 44.3 | 35.3 | 64.5 | 48.0 | 24.3 | 51.9 | 44.7 |
| Hybrid H3-360M | GPT2 | - | 12.58 | 48.0 | 41.5 | 68.1 | 51.4 | 24.7 | 54.1 | 48.0 |
| Pythia-410M | NeoX | 9.95 | 10.84 | 51.4 | 40.6 | 66.9 | 52.1 | 24.6 | 53.8 | 48.2 |
| Mamba-370M | NeoX | 8.28 | 8.14 | 55.6 | 46.5 | 69.5 | 55.1 | 28.0 | 55.3 | 50.0 |
| Pythia-1B | NeoX | 7.82 | 7.92 | 56.1 | 47.2 | 70.7 | 57.0 | 27.1 | 53.5 | 51.9 |
| Mamba-790M | NeoX | 7.33 | 6.02 | 62.7 | 55.1 | 72.1 | 61.2 | 29.5 | 56.1 | 57.1 |
| GPT-Neo 1.3B | GPT2 | - | 7.50 | 57.2 | 48.9 | 71.1 | 56.2 | 25.9 | 54.9 | 52.4 |
| Hybrid H3-1.3B | GPT2 | - | 11.25 | 49.6 | 52.6 | 71.3 | 59.2 | 28.1 | 56.9 | 53.0 |
| OPT-1.3B | OPT | - | 6.64 | 58.0 | 53.7 | 72.4 | 56.7 | 29.6 | 59.5 | 55.0 |
| Pythia-1.4B | NeoX | 7.51 | 6.08 | 61.7 | 52.1 | 71.0 | 60.5 | 28.5 | 57.2 | 55.2 |
| RWKV-1.5B | NeoX | 7.70 | 7.04 | 56.4 | 52.5 | 72.4 | 60.5 | 29.4 | 54.6 | 54.3 |
| Mamba-1.4B | NeoX | 6.80 | 5.04 | 64.9 | 59.1 | 74.2 | 65.5 | 32.8 | 61.5 | 59.7 |
| GPT-Neo 2.7B | GPT2 | - | 5.63 | 62.2 | 55.8 | 72.1 | 61.1 | 30.2 | 57.6 | 56.5 |
| Hybrid H3-2.7B | GPT2 | - | 7.92 | 55.7 | 59.7 | 73.3 | 65.6 | 32.3 | 61.4 | 58.0 |
| OPT-2.7B | OPT | - | 5.12 | 63.6 | 60.6 | 74.8 | 60.8 | 31.3 | 61.0 | 58.7 |
| Pythia-2.8B | NeoX | 6.73 | 5.04 | 64.7 | 59.3 | 74.0 | 64.1 | 32.9 | 59.7 | 59.1 |
| RWKV-3B | NeoX | 7.00 | 5.24 | 63.9 | 59.6 | 73.7 | 67.8 | 33.1 | 59.6 | 59.6 |
| Mamba-2.8B | NeoX | 6.22 | 4.23 | 69.2 | 66.1 | 75.2 | 69.7 | 36.3 | 63.5 | 63.3 |
| GPT-J-6B | GPT2 | - | 4.10 | 68.3 | 66.3 | 75.4 | 67.0 | 36.6 | 64.1 | 63.0 |
| OPT-6.7B | OPT | - | 4.25 | 67.7 | 67.2 | 76.3 | 65.6 | 34.9 | 65.5 | 62.9 |
| Pythia-6.9B | NeoX | 6.51 | 4.45 | 67.1 | 64.0 | 75.2 | 67.3 | 35.5 | 61.3 | 61.7 |
| RWKV-7.4B | NeoX | 6.31 | 4.38 | 67.2 | 65.5 | 76.1 | 67.8 | 37.5 | 61.0 | 62.5 |

Results. Figure 5 (Left) shows that Mamba's pretraining perplexity improves smoothly with model size, and that Mamba scales better than both HyenaDNA and Transformer++. For example, at the largest model size of ≈ 40M parameters, the curve shows that Mamba can match the Transformer++ and HyenaDNA models with roughly 3× to 4× fewer parameters.

# 4.3.2 Scaling: Context Length

In the next DNA experiment, we investigate the scaling properties of models with respect to sequence length. We only compare the HyenaDNA and Mamba models, as quadratic attention becomes prohibitively expensive at longer sequence lengths.
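The token-budget bookkeeping behind these DNA runs is simple enough to spell out; the sketch below reproduces the arithmetic for the model-size scaling runs above, plus an assumed batch-size adjustment for holding tokens/batch fixed as the sequence length grows in the next experiment.

```python
# Token-budget arithmetic for the DNA pretraining runs (illustrative sketch).

# Model-size scaling (Section 4.3.1): short context, fixed global batch.
seq_len = 1024                              # 2**10 base pairs per sequence
global_batch = 1024                         # sequences per gradient step
tokens_per_batch = global_batch * seq_len   # = 2**20, i.e. ~1M tokens
total_tokens = tokens_per_batch * 10_000    # 10K steps -> ~10B tokens

# Context-length scaling (Section 4.3.2) holds tokens/batch and total training
# tokens fixed, so the number of sequences per batch must shrink as the
# sequence length grows (assumed bookkeeping, not a quoted configuration).
def sequences_per_batch(seq_len: int, tokens_per_batch: int) -> int:
    return max(tokens_per_batch // seq_len, 1)

assert tokens_per_batch == 2**20
assert sequences_per_batch(2**20, tokens_per_batch=2**20) == 1
```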
We pretrain models on sequence lengths 2^10 = 1024, 2^12 = 4096, 2^14 = 16384, 2^16 = 65536, 2^18 = 262144, 2^20 = 1048576. We fix a model size of 6 layers by width 128 (about 1.3M-1.4M parameters). Models were trained for 20K gradient steps for a total of ≈ 330B tokens. The longer sequence lengths used sequence length warmup similar to (Nguyen, Poli, et al. 2023).

Results. Figure 5 (Right) shows that Mamba is able to make use of longer context even up to extremely long sequences of length 1M, and its pretraining perplexity improves as the context increases. On the other hand, the HyenaDNA model gets worse with sequence length. This is intuitive from the discussion in Section 3.5 on properties of the selection mechanism. In particular, LTI models cannot selectively ignore information; from a convolutional perspective, a very long convolution kernel is aggregating all information across a long sequence which may be very noisy. Note that while HyenaDNA claims to improve with longer context, their results do not control for computation time.

[Figure: left panel "Scaling Laws on the Human Genome (HG38)" plots perplexity against parameter count for HyenaDNA, Transformer++, and Mamba; right panel "Scaling Laws - Sequence Length (HG38)" plots perplexity against sequence length for HyenaDNA 1.4M, Mamba 1.4M, and Mamba 7M.]

Figure 5: (DNA Scaling Laws.) Pretraining on the HG38 (human genome) dataset. (Left) Fixing short context length 2^10 = 1024 and increasing size from ≈ 200K to ≈ 40M parameters, Mamba scales better than baselines. (Right) Fixing model size and increasing sequence lengths while keeping tokens/batch and total training tokens fixed. Unlike baselines, the selection mechanism of Mamba facilitates better performance with increasing context length.

[Figure: "Finetuning Accuracy (Species DNA Classification)" plots accuracy against sequence length for HyenaDNA 1.4M, Mamba 1.4M, Mamba 7M, and a random baseline; "Scaling Laws - Sequence Length (YouTubeMix)" plots bits per byte against sequence length for the SaShiMi baseline and Mamba.]

Figure 6: (Great Apes DNA Classification.) Accuracy after fine-tuning on sequences of length 2^10 = 1024 up to 2^20 = 1048576 using pretrained models of the same context length. Numerical results in Table 13.

Figure 7: (Audio Pretraining.) Mamba improves performance over prior state-of-the-art (SaShiMi) in autoregressive audio modeling, while improving up to minute-long context or million-length sequences (controlling for computation).

# 4.3.3 Synthetic Species Classification

We evaluate models on a downstream task of classifying between 5 different species by randomly sampling a contiguous segment of their DNA. This task is adapted from HyenaDNA, which used the species {human, lemur, mouse, pig, hippo}.
We modify the task to be significantly more challenging by classifying between the five great apes species {human, chimpanzee, gorilla, orangutan, bonobo}, which are known to share 99% of their DNA.

# 4.4 Audio Modeling and Generation

For the audio waveform modality, we compare primarily to the SaShiMi architecture and training protocols (Goel et al. 2022). This model comprises

1. a U-Net backbone with two stages of pooling by a factor p that doubles the model dimension D per stage,
2. alternating S4 and MLP blocks in each stage.

We consider replacing the S4+MLP blocks with Mamba blocks. Experiment details are in Appendix E.4.

# 4.4.1 Long-Context Autoregressive Pretraining

We evaluate pretraining quality (autoregressive next-sample prediction) on YouTubeMix (DeepSound 2017), a standard piano music dataset used by prior work consisting of 4 hours of solo piano music, sampled at a rate of 16000 Hz. Pretraining details largely follow the standard language modeling setup (Section 4.2). Figure 7 evaluates the effect of increasing training sequence lengths from 2^13 = 8192 to 2^20 ≈ 10^6, while keeping computation fixed. (There are some slight edge cases to the way the data is curated, which may lead to kinks in the scaling curves. For example, only minute-long clips were available so the maximum sequence length is actually bounded by 60s · 16000Hz = 960000.)

Both Mamba and the SaShiMi (S4+MLP) baseline improve consistently with longer context lengths; Mamba is better throughout, and the gap widens at longer lengths. The main metric is bits per byte (BPB), which is a constant factor log(2) of the standard negative log-likelihood (NLL) loss for pretraining other modalities.

We note one important detail: this is the only experiment in this paper in which we switched from the real parameterization to complex (Section 3.6). We show additional ablations in Appendix E.4.

# 4.4.2 Autoregressive Speech Generation

SC09 is a benchmark speech generation dataset (Donahue, McAuley, and Puckette 2019; Warden 2018), consisting of 1-second clips sampled at 16000 Hz of the digits "zero" through "nine" with highly variable characteristics. We largely follow the autoregressive training setup and generation protocol of Goel et al. (2022).

Table 4 shows automated metrics of the Mamba-UNet model compared to a variety of baselines from Goel et al. (2022): WaveNet (Oord et al. 2016), SampleRNN (Mehri et al. 2017), WaveGAN (Donahue, McAuley, and Puckette 2019), DiffWave (Z. Kong et al. 2021), and SaShiMi. A small Mamba model outperforms the state-of-the-art (and much larger) GAN- and diffusion-based models. A larger model parameter-matched to the baselines further improves on fidelity metrics dramatically.

Table 5 takes the small Mamba model and investigates combinations of different architectures for the outer stages and center stage. It shows that Mamba is consistently better than S4+MLP in the outer blocks, and Mamba > S4+MLP > MHA+MLP in the center blocks.

Table 4: (SC09) Automated metrics for unconditional generation on a challenging dataset of fixed-length speech clips. (Top to Bottom) Autoregressive baselines, non-autoregressive baselines, Mamba, and dataset metrics.

Table 5: (SC09 Model Ablations) Models with 6M parameters.
In SaShiMi's U-Net backbone, there are 8 center blocks operating on sequence length 1000, sandwiched on each side by 8 outer blocks on sequence length 4000, sandwiched by 8 outer blocks on sequence length 16000 (40 blocks total). The architecture of the 8 center blocks are ablated independently of the rest. Note that Transformers (MHA+MLP) were not tested in the more important outer blocks because of efficiency constraints.

| Model | Params | NLL ↓ | FID ↓ | IS ↑ | mIS ↑ | AM ↓ |
|---|---|---|---|---|---|---|
| SampleRNN | 35.0M | 2.042 | 8.96 | 1.71 | 3.02 | 1.76 |
| WaveNet | 4.2M | 1.925 | 5.08 | 2.27 | 5.80 | 1.47 |
| SaShiMi | 5.8M | 1.873 | 1.99 | 5.13 | 42.57 | 0.74 |
| WaveGAN | 19.1M | - | 2.03 | 4.90 | 36.10 | 0.80 |
| DiffWave | 24.1M | - | 1.92 | 5.26 | 51.21 | 0.68 |
| + SaShiMi | 23.0M | - | 1.42 | 5.94 | 69.17 | 0.59 |
| Mamba | 6.1M | 1.852 | 0.94 | 6.26 | 88.54 | 0.52 |
| Mamba | 24.3M | 1.860 | 0.67 | 7.33 | 144.9 | 0.36 |
| Train | - | - | 0.00 | 8.56 | 292.5 | 0.16 |
| Test | - | - | 0.02 | 8.33 | 257.6 | 0.19 |

| Outer | Center | NLL ↓ | FID ↓ | IS ↑ | mIS ↑ | AM ↓ |
|---|---|---|---|---|---|---|
| S4+MLP | MHA+MLP | 1.859 | 1.45 | 5.06 | 47.03 | 0.70 |
| S4+MLP | S4+MLP | 1.867 | 1.43 | 5.42 | 53.54 | 0.65 |
| S4+MLP | Mamba | 1.859 | 1.42 | 5.71 | 56.51 | 0.64 |
| Mamba | MHA+MLP | 1.850 | 1.37 | 5.63 | 58.23 | 0.62 |
| Mamba | S4+MLP | 1.853 | 1.07 | 6.05 | 73.34 | 0.55 |
| Mamba | Mamba | 1.852 | 0.94 | 6.26 | 88.54 | 0.52 |

# 4.5 Speed and Memory Benchmarks

We benchmark the speed of the SSM scan operation (state expansion N = 16), as well as the end-to-end inference throughput of Mamba, in Figure 8. Our efficient SSM scan is faster than the best attention implementation that we know of (FlashAttention-2 (Dao 2023)) beyond sequence length 2K, and up to 20-40× faster than a standard scan implementation in PyTorch. Mamba achieves 4-5× higher inference throughput than a Transformer of similar size, since without the KV cache it can use much higher batch sizes. For example, a Mamba-6.9B (untrained) would have higher inference throughput than a 5× smaller Transformer-1.3B. Details in Appendix E.5, which additionally includes a benchmark of memory consumption.

[Figure: left panel "Scan vs Convolution vs Attention time (A100 80GB PCIe)" plots runtime against sequence length for FlashAttention-2, convolution, a standard PyTorch scan, and our scan; right panel "Inference throughput on A100 80GB (prompt length 2048)" plots throughput against batch size for Mamba-6.9B and Transformer-6.7B.]

Figure 8: (Efficiency Benchmarks.) (Left) Training: our efficient scan is 40× faster than a standard implementation. (Right) Inference: as a recurrent model, Mamba can achieve 5× higher throughput than Transformers.

# 4.6 Model Ablations

We perform a series of detailed ablations on components of our model, focusing on the setting of language modeling with size ≈ 350M models at Chinchilla token counts (same setting as Figure 4).

# 4.6.1 Architecture

Table 6 investigates the effects of the architecture (block) and its inner SSM layer (Figure 3).
We find that

• Among previous non-selective (LTI) SSMs, which are equivalent to global convolutions, performance is very similar.
• Replacing the complex-valued S4 variant from previous work with a real-valued one does not affect performance much, suggesting that (at least for LM) real-valued SSMs may be a better choice when accounting for hardware efficiency.
• Replacing any of these with a selective SSM (S6) significantly improves performance, validating the motivation of Section 3.
• The Mamba architecture performs similarly to the H3 architecture (and seems slightly better when using a selective layer).

We also investigate interleaving the Mamba block with other blocks such as MLP (a traditional architecture) and MHA (a hybrid attention architecture) in Appendix E.2.2.

# 4.6.2 Selective SSM

Table 7 ablates the selective SSM layer by considering different combinations of selective ∆, B, and C parameters (Algorithm 2), showing that ∆ is the most important parameter due to its connection to RNN gating (Theorem 1).

Table 8 considers different initializations of the SSM, which have been shown to make a large difference in some data modalities and settings (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022). On language modeling, we find that simpler real-valued diagonal initializations (S4D-Real, row 3) instead of more standard complex-valued parameterizations (S4D-Lin, row 1) perform better. Random initializations also work well, consistent with findings from prior work (Mehta et al. 2023).

Table 9 and Table 10 consider varying the dimension of the ∆ and (B, C) projections respectively. Changing them from static to selective provides the most benefit, while increasing the dimensions further generally improves performance modestly with a small increase in parameter count.

Of particular note is the dramatic improvement of the selective SSM when the state size N is increased, with over a 1.0 perplexity improvement for a cost of only 1% additional parameters. This validates our core motivation in Sections 3.1 and 3.3.

Table 6: (Ablations: Architecture and SSM layer.) The Mamba block performs similarly to H3 while being simpler. In the inner layer, there is little difference among different parameterizations of LTI models, while selective SSMs (S6) provide a large improvement. More specifically, the S4 (real) variant is S4D-Real and the S4 (complex) variant is S4D-Lin.

| Model | Arch. | SSM Layer | Perplexity |
|---|---|---|---|
| Hyena | H3 | Hyena | 10.24 |
| H3 | H3 | S4 (complex) | 10.30 |
| - | H3 | S4 (real) | 10.34 |
| - | H3 | S6 | 8.95 |

| Model | Arch. | SSM Layer | Perplexity |
|---|---|---|---|
| - | Mamba | Hyena | 10.75 |
| - | Mamba | S4 (complex) | 10.54 |
| - | Mamba | S4 (real) | 10.56 |
| Mamba | Mamba | S6 | 8.69 |

Table 7: (Ablations: Selective parameters.) ∆ is the most important parameter (Theorem 1), but using multiple selective parameters together synergizes.

Table 8: (Ablations: Parameterization of A.) The more standard initializations based on S4D-Lin (Gu, Gupta, et al. 2022) perform worse than S4D-Real or a random initialization, when the SSM is selective.
| Selective ∆ | Selective B | Selective C | Perplexity |
|---|---|---|---|
| ✗ | ✗ | ✗ | 10.93 |
| ✗ | ✓ | ✗ | 10.15 |
| ✗ | ✗ | ✓ | 9.98 |
| ✓ | ✗ | ✗ | 9.81 |
| ✓ | ✓ | ✓ | 8.71 |

| A_n Initialization | Field | Perplexity |
|---|---|---|
| A_n = −1/2 + n i | Complex | 9.16 |
| A_n = −1/2 | Real | 8.85 |
| A_n = −(n + 1) | Real | 8.71 |
| A_n ∼ exp(N(0, 1)) | Real | 8.71 |

Table 9: (Ablations: Expressivity of ∆.) The selection mechanism of ∆ constructs it with a projection of the input. Projecting it even to dim. 1 provides a large increase in performance; increasing it further provides further improvements at the cost of a modest increase in parameters. State size fixed to N = 16.

| Size of ∆ proj. | - | 1 | 2 | 4 | 8 | 16 | 32 | 64 |
|---|---|---|---|---|---|---|---|---|
| Params (M) | 358.9 | 359.1 | 359.3 | 359.7 | 360.5 | 362.1 | 365.2 | 371.5 |
| Perplexity | 9.12 | 8.97 | 8.97 | 8.91 | 8.83 | 8.84 | 8.80 | 8.71 |

Table 10: (Ablations: SSM state dimension.) (Top) Constant B and C (Bottom) Selective B and C. Increasing the SSM state dimension N, which can be viewed as an expansion factor on the dimension of the recurrent state, can significantly improve performance for a negligible cost in parameters/FLOPs, but only when B and C are also selective. Size of ∆ projection fixed to 64.

(Top) Constant B and C:

| State dimension N | Params (M) | Perplexity |
|---|---|---|
| 1 | 367.1 | 9.88 |
| 2 | 367.4 | 9.86 |
| 4 | 368.0 | 9.82 |
| 8 | 369.1 | 9.82 |
| 16 | 371.5 | 9.81 |

(Bottom) Selective B and C:

| State dimension N | Params (M) | Perplexity |
|---|---|---|
| 1 | 367.1 | 9.73 |
| 2 | 367.4 | 9.40 |
| 4 | 368.0 | 9.09 |
| 8 | 369.1 | 8.84 |
| 16 | 371.5 | 8.71 |

# 5 Discussion

We discuss related work, limitations, and some future directions.

Related Work. Appendix A discusses how the selection mechanism relates to similar concepts. Appendix B has an extended related work of SSMs and other related models.

No Free Lunch: Continuous-Discrete Spectrum. Structured SSMs were originally defined as discretizations of continuous systems (1), and have had a strong inductive bias toward continuous-time data modalities such as perceptual signals (e.g. audio, video). As discussed in Sections 3.1 and 3.5, the selection mechanism overcomes their weaknesses on discrete modalities such as text and DNA; but this conversely can impede their performance on data that LTI SSMs excel on. Our ablations on audio waveforms examine this tradeoff in more detail.

Downstream Affordances. Transformer-based foundation models (particularly LLMs) have a rich ecosystem of properties and modes of interaction with pretrained models, such as fine-tuning, adaptation, prompting, in-context learning, instruction tuning, RLHF, quantization, and so on. We are particularly interested in whether Transformer alternatives such as SSMs have similar properties and affordances.

Scaling. Our empirical evaluation is limited to small model sizes, below the threshold of most strong open source LLMs (e.g. Llama (Touvron et al. 2023)) as well as other recurrent models such as RWKV (B. Peng et al. 2023) and RetNet (Y. Sun et al. 2023), which have been evaluated at the 7B parameter scale and beyond. It remains to assess whether Mamba still compares favorably at these larger sizes.
We also note that scaling SSMs may involve further engineering challenges and adjustments to the model that are not discussed in this paper.

# 6 Conclusion

We introduce a selection mechanism to structured state space models, allowing them to perform context-dependent reasoning while scaling linearly in sequence length. When incorporated into a simple attention-free architecture, Mamba achieves state-of-the-art results on a diverse set of domains, where it matches or exceeds the performance of strong Transformer models. We are excited about the broad applications of selective state space models to build foundation models for different domains, especially in emerging modalities requiring long context such as genomics, audio, and video. Our results suggest that Mamba is a strong candidate to be a general sequence model backbone.

# Acknowledgments

We thank Karan Goel, Arjun Desai, and Kush Bhatia for helpful feedback on the draft.

# References

[1] Martin Arjovsky, Amar Shah, and Yoshua Bengio. "Unitary Evolution Recurrent Neural Networks". In: The International Conference on Machine Learning (ICML). 2016, pp. 1120–1128.

[2] Žiga Avsec, Vikram Agarwal, Daniel Visentin, Joseph R Ledsam, Agnieszka Grabska-Barwinska, Kyle R Taylor, Yannis Assael, John Jumper, Pushmeet Kohli, and David R Kelley. "Effective Gene Expression Prediction from Sequence by Integrating Long-range Interactions". In: Nature Methods 18.10 (2021), pp. 1196–1203.

[3] Jimmy Ba, Geoffrey E Hinton, Volodymyr Mnih, Joel Z Leibo, and Catalin Ionescu. "Using Fast Weights to Attend to the Recent Past". In: Advances in Neural Information Processing Systems (NeurIPS) 29 (2016).

[4] Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. "Layer Normalization". In: arXiv preprint arXiv:1607.06450 (2016).

[5] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. "Neural Machine Translation by Jointly Learning to Align and Translate". In: The International Conference on Learning Representations (ICLR). 2015.

[6] David Balduzzi and Muhammad Ghifary. "Strongly-typed Recurrent Neural Networks". In: International Conference on Machine Learning. PMLR. 2016, pp. 1292–1300.

[7] Stella Biderman, Hailey Schoelkopf, Quentin Gregory Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, et al. "Pythia: A Suite for Analyzing Large Language Models across Training and Scaling". In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 2397–2430.

[8] Yonatan Bisk, Rowan Zellers, Jianfeng Gao, Yejin Choi, et al. "PIQA: Reasoning about Physical Commonsense in Natural Language". In: Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 34. 05. 2020, pp. 7432–7439.

[9] Guy E Blelloch. "Prefix Sums and Their Applications". In: (1990).

[10] James Bradbury, Stephen Merity, Caiming Xiong, and Richard Socher. "Quasi-recurrent Neural Networks". In: arXiv preprint arXiv:1611.01576 (2016).

[11] Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. "Language Models are Few-shot Learners". In: Advances in Neural Information Processing Systems (NeurIPS) 33 (2020), pp. 1877–1901.
[12] Aydar Bulatov, Yuri Kuratov, and Mikhail S Burtsev. "Scaling Transformer to 1M tokens and Beyond with RMT". In: arXiv preprint arXiv:2304.11062 (2023).

[13] Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. "Generating Long Sequences with Sparse Transformers". In: arXiv preprint arXiv:1904.10509 (2019).

[14] Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, et al. "Rethinking Attention with Performers". In: The International Conference on Learning Representations (ICLR). 2021.

[15] Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. "PaLM: Scaling Language Modeling with Pathways". In: Journal of Machine Learning Research 24.240 (2023), pp. 1–113. url: http://jmlr.org/papers/v24/22-1144.html.

[16] Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. "Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling". In: arXiv preprint arXiv:1412.3555 (2014).

[17] Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. "Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge". In: arXiv preprint arXiv:1803.05457 (2018).

[18] Tri Dao. "FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning". In: (2023).

[19] Tri Dao, Daniel Y Fu, Stefano Ermon, Atri Rudra, and Christopher Ré. "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness". In: Advances in Neural Information Processing Systems (NeurIPS). 2022.

[20] Tri Dao, Daniel Y Fu, Khaled K Saab, Armin W Thomas, Atri Rudra, and Christopher Ré. "Hungry Hungry Hippos: Towards Language Modeling with State Space Models". In: The International Conference on Learning Representations (ICLR). 2023.

[21] Yann N Dauphin, Angela Fan, Michael Auli, and David Grangier. "Language Modeling with Gated Convolutional Networks". In: The International Conference on Machine Learning (ICML). PMLR. 2017, pp. 933–941.

[22] DeepSound. SampleRNN. https://github.com/deepsound-project/samplernn-pytorch. 2017.

[23] Jiayu Ding, Shuming Ma, Li Dong, Xingxing Zhang, Shaohan Huang, Wenhui Wang, and Furu Wei. "LongNet: Scaling Transformers to 1,000,000,000 Tokens". In: arXiv preprint arXiv:2307.02486 (2023).

[24] Chris Donahue, Julian McAuley, and Miller Puckette. "Adversarial Audio Synthesis". In: The International Conference on Learning Representations (ICLR). 2019.

[25] Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". In: The International Conference on Learning Representations (ICLR). 2020.
[26] Nelson Elhage, Neel Nanda, Catherine Olsson, Tom Henighan, Nicholas Joseph, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Nova DasSarma, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, and Chris Olah. "A Mathematical Framework for Transformer Circuits". In: Transformer Circuits Thread (2021). https://transformer-circuits.pub/2021/framework/index.html.

[27] Mahan Fathi, Jonathan Pilault, Pierre-Luc Bacon, Christopher Pal, Orhan Firat, and Ross Goroshin. "Block-State Transformer". In: arXiv preprint arXiv:2306.09539 (2023).

[28] Yassir Fathullah, Chunyang Wu, Yuan Shangguan, Junteng Jia, Wenhan Xiong, Jay Mahadeokar, Chunxi Liu, Yangyang Shi, Ozlem Kalinli, Mike Seltzer, et al. "Multi-Head State Space Model for Sequence Modeling". In: INTERSPEECH. 2023.

[29] Karl J Friston, Lee Harrison, and Will Penny. "Dynamic Causal Modelling". In: Neuroimage 19.4 (2003), pp. 1273–1302.

[30] Daniel Y Fu, Elliot L Epstein, Eric Nguyen, Armin W Thomas, Michael Zhang, Tri Dao, Atri Rudra, and Christopher Ré. "Simple Hardware-efficient Long Convolutions for Sequence Modeling". In: The International Conference on Machine Learning (ICML) (2023).

[31] Ken-ichi Funahashi and Yuichi Nakamura. "Approximation of Dynamical Systems by Continuous Time Recurrent Neural Networks". In: Neural Networks 6.6 (1993), pp. 801–806.

[32] Leo Gao, Stella Biderman, Sid Black, Laurence Golding, Travis Hoppe, Charles Foster, Jason Phang, Horace He, Anish Thite, Noa Nabeshima, Shawn Presser, and Connor Leahy. "The Pile: An 800GB Dataset of Diverse Text for Language Modeling". In: arXiv preprint arXiv:2101.00027 (2020).

[33] Leo Gao, Jonathan Tow, Stella Biderman, Sid Black, Anthony DiPofi, Charles Foster, Laurence Golding, Jeffrey Hsu, Kyle McDonell, Niklas Muennighoff, Jason Phang, Laria Reynolds, Eric Tang, Anish Thite, Ben Wang, Kevin Wang, and Andy Zou. A Framework for Few-shot Language Model Evaluation. Version v0.0.1. Sept. 2021. doi: 10.5281/zenodo.5371628. url: https://doi.org/10.5281/zenodo.5371628.

[34] Karan Goel, Albert Gu, Chris Donahue, and Christopher Ré. "It's Raw! Audio Generation with State-Space Models". In: The International Conference on Machine Learning (ICML). 2022.

[35] Albert Gu, Tri Dao, Stefano Ermon, Atri Rudra, and Christopher Ré. "HIPPO: Recurrent Memory with Optimal Polynomial Projections". In: Advances in Neural Information Processing Systems (NeurIPS). 2020.

[36] Albert Gu, Karan Goel, and Christopher Ré. "Efficiently Modeling Long Sequences with Structured State Spaces". In: The International Conference on Learning Representations (ICLR). 2022.

[37] Albert Gu, Caglar Gulcehre, Tom Le Paine, Matt Hoffman, and Razvan Pascanu. "Improving the Gating Mechanism of Recurrent Neural Networks". In: The International Conference on Machine Learning (ICML). 2020.

[38] Albert Gu, Ankit Gupta, Karan Goel, and Christopher Ré. "On the Parameterization and Initialization of Diagonal State Space Models". In: Advances in Neural Information Processing Systems (NeurIPS). 2022.
[39] Albert Gu, Isys Johnson, Karan Goel, Khaled Saab, Tri Dao, Atri Rudra, and Christopher Ré. "Combining Recurrent, Convolutional, and Continuous-time Models with the Linear State Space Layer". In: Advances in Neural Information Processing Systems (NeurIPS). 2021.

[40] Albert Gu, Isys Johnson, Aman Timalsina, Atri Rudra, and Christopher Ré. "How to Train Your HIPPO: State Space Models with Generalized Basis Projections". In: The International Conference on Learning Representations (ICLR). 2023.

[41] Ankit Gupta, Albert Gu, and Jonathan Berant. "Diagonal State Spaces are as Effective as Structured State Spaces". In: Advances in Neural Information Processing Systems 35 (2022), pp. 22982–22994.

[42] David Ha, Andrew Dai, and Quoc V. Le. "HyperNetworks". In: The International Conference on Learning Representations (ICLR). 2017.

[43] Danijar Hafner, Timothy Lillicrap, Jimmy Ba, and Mohammad Norouzi. "Dream to Control: Learning Behaviors by Latent Imagination". In: The International Conference on Learning Representations (ICLR). 2020.

[44] Ramin Hasani, Mathias Lechner, Tsun-Hsuan Wang, Makram Chahine, Alexander Amini, and Daniela Rus. "Liquid Structural State-Space Models". In: The International Conference on Learning Representations (ICLR). 2023.

[45] Mikael Henaff, Arthur Szlam, and Yann LeCun. "Recurrent Orthogonal Networks and Long-Memory Tasks". In: The International Conference on Machine Learning (ICML). 2016.

[46] Dan Hendrycks and Kevin Gimpel. "Gaussian Error Linear Units (GELUs)". In: arXiv preprint arXiv:1606.08415 (2016).

[47] Sepp Hochreiter and Jürgen Schmidhuber. "Long Short-Term Memory". In: Neural Computation 9.8 (1997), pp. 1735–1780.

[48] Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. "An Empirical Analysis of Compute-Optimal Large Language Model Training". In: Advances in Neural Information Processing Systems (NeurIPS) 35 (2022), pp. 30016–30030.

[49] Weizhe Hua, Zihang Dai, Hanxiao Liu, and Quoc Le. "Transformer Quality in Linear Time". In: The International Conference on Machine Learning (ICML). PMLR. 2022, pp. 9099–9117.

[50] Hassan Ismail Fawaz, Germain Forestier, Jonathan Weber, Lhassane Idoumghar, and Pierre-Alain Muller. "Deep Learning for Time Series Classification: A Review". In: Data Mining and Knowledge Discovery 33.4 (2019), pp. 917–963.

[51] Andrei Ivanov, Nikoli Dryden, Tal Ben-Nun, Shigang Li, and Torsten Hoefler. "Data Movement is All You Need: A Case Study on Optimizing Transformers". In: Proceedings of Machine Learning and Systems 3 (2021), pp. 711–732.

[52] Li Jing, Caglar Gulcehre, John Peurifoy, Yichen Shen, Max Tegmark, Marin Soljacic, and Yoshua Bengio. "Gated Orthogonal Recurrent Units: On Learning to Forget". In: Neural Computation 31.4 (2019), pp. 765–783.

[53] Rudolph Emil Kalman. "A New Approach to Linear Filtering and Prediction Problems". In: (1960).

[54] Angelos Katharopoulos, Apoorv Vyas, Nikolaos Pappas, and François Fleuret. "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention". In: International Conference on Machine Learning. PMLR. 2020, pp. 5156–5165.
[55] Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. "DiffWave: A Versatile Diffusion Model for Audio Synthesis". In: International Conference on Learning Representations. 2021.

[56] Chrysoula Kosma, Giannis Nikolentzos, and Michalis Vazirgiannis. "Time-Parameterized Convolutional Neural Networks for Irregularly Sampled Time Series". In: arXiv preprint arXiv:2308.03210 (2023).

[57] Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. "ImageNet Classification with Deep Convolutional Neural Networks". In: Advances in Neural Information Processing Systems (NeurIPS) 25 (2012).

[58] Tao Lei. "When Attention Meets Fast Recurrence: Training Language Models with Reduced Compute". In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing. 2021, pp. 7633–7648.

[59] Tao Lei, Yu Zhang, Sida I Wang, Hui Dai, and Yoav Artzi. "Simple Recurrent Units for Highly Parallelizable Recurrence". In: arXiv preprint arXiv:1709.02755 (2017).

[60] Mario Lezcano-Casado and David Martínez-Rubio. "Cheap Orthogonal Constraints in Neural Networks: A Simple Parametrization of the Orthogonal and Unitary Group". In: The International Conference on Machine Learning (ICML). 2019.

[61] Yuhong Li, Tianle Cai, Yi Zhang, Deming Chen, and Debadeepta Dey. "What Makes Convolutional Models Great on Long Sequence Modeling?" In: The International Conference on Learning Representations (ICLR). 2023.

[62] Vasileios Lioutas and Yuhong Guo. "Time-aware Large Kernel Convolutions". In: The International Conference on Machine Learning (ICML). PMLR. 2020, pp. 6172–6183.

[63] Chris Lu, Yannick Schroecker, Albert Gu, Emilio Parisotto, Jakob Foerster, Satinder Singh, and Feryal Behbahani. "Structured State Space Models for In-Context Reinforcement Learning". In: Advances in Neural Information Processing Systems (NeurIPS). 2023.

[64] Shahar Lutati, Itamar Zimerman, and Lior Wolf. "Focus Your Attention (with Adaptive IIR Filters)". In: arXiv preprint arXiv:2305.14952 (2023).

[65] Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. "Mega: Moving Average Equipped Gated Attention". In: The International Conference on Learning Representations (ICLR). 2023.

[66] Eric Martin and Chris Cundy. "Parallelizing Linear Recurrent Neural Nets Over Sequence Length". In: The International Conference on Learning Representations (ICLR). 2018.

[67] Soroush Mehri, Kundan Kumar, Ishaan Gulrajani, Rithesh Kumar, Shubham Jain, Jose Sotelo, Aaron Courville, and Yoshua Bengio. "SampleRNN: An Unconditional End-to-End Neural Audio Generation Model". In: The International Conference on Learning Representations (ICLR). 2017.

[68] Harsh Mehta, Ankit Gupta, Ashok Cutkosky, and Behnam Neyshabur. "Long Range Language Modeling via Gated State Spaces". In: The International Conference on Learning Representations (ICLR). 2023.

[69] Zakaria Mhammedi, Andrew Hellicar, Ashfaqur Rahman, and James Bailey. "Efficient Orthogonal Parametrisation of Recurrent Neural Networks using Householder Reflections". In: International Conference on Machine Learning. PMLR. 2017, pp. 2401–2409.
[70] Eric Nguyen, Karan Goel, Albert Gu, Gordon Downs, Preey Shah, Tri Dao, Stephen Baccus, and Christopher Ré. "S4ND: Modeling Images and Videos as Multidimensional Signals with State Spaces". In: Advances in Neural Information Processing Systems (NeurIPS). 2022.

[71] Eric Nguyen, Michael Poli, Marjan Faizi, Armin Thomas, Callum Birch-Sykes, Michael Wornow, Aman Patel, Clayton Rabideau, Stefano Massaroli, Yoshua Bengio, et al. "HyenaDNA: Long-range Genomic Sequence Modeling at Single Nucleotide Resolution". In: Advances in Neural Information Processing Systems (NeurIPS). 2023.

[72] Catherine Olsson, Nelson Elhage, Neel Nanda, Nicholas Joseph, Nova DasSarma, Tom Henighan, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Scott Johnston, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, and Chris Olah. "In-context Learning and Induction Heads". In: Transformer Circuits Thread (2022). https://transformer-circuits.pub/2022/in-context-learning-and-induction-heads/index.html.

[73] Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. "WaveNet: A Generative Model for Raw Audio". In: arXiv preprint arXiv:1609.03499 (2016).

[74] Antonio Orvieto, Samuel L Smith, Albert Gu, Anushan Fernando, Caglar Gulcehre, Razvan Pascanu, and Soham De. "Resurrecting Recurrent Neural Networks for Long Sequences". In: The International Conference on Machine Learning (ICML). 2023.

[75] Denis Paperno, Germán Kruszewski, Angeliki Lazaridou, Ngoc-Quan Pham, Raffaella Bernardi, Sandro Pezzelle, Marco Baroni, Gemma Boleda, and Raquel Fernández. "The LAMBADA Dataset: Word Prediction Requiring a Broad Discourse Context". In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics. 2016, pp. 1525–1534.

[76] Razvan Pascanu, Tomas Mikolov, and Yoshua Bengio. "On the Difficulty of Training Recurrent Neural Networks". In: International Conference on Machine Learning. 2013, pp. 1310–1318.

[77] Bo Peng, Eric Alcaide, Quentin Anthony, Alon Albalak, Samuel Arcadinho, Huanqi Cao, Xin Cheng, Michael Chung, Matteo Grella, Kranthi Kiran GV, et al. "RWKV: Reinventing RNNs for the Transformer Era". In: arXiv preprint arXiv:2305.13048 (2023).

[78] Hao Peng, Nikolaos Pappas, Dani Yogatama, Roy Schwartz, Noah A Smith, and Lingpeng Kong. "Random Feature Attention". In: The International Conference on Learning Representations (ICLR). 2021.

[79] Michael Poli, Stefano Massaroli, Eric Nguyen, Daniel Y Fu, Tri Dao, Stephen Baccus, Yoshua Bengio, Stefano Ermon, and Christopher Ré. "Hyena Hierarchy: Towards Larger Convolutional Language Models". In: The International Conference on Machine Learning (ICML). 2023.

[80] Zhen Qin, Xiaodong Han, Weixuan Sun, Bowen He, Dong Li, Dongxu Li, Yuchao Dai, Lingpeng Kong, and Yiran Zhong. "Toeplitz Neural Network for Sequence Modeling". In: The International Conference on Learning Representations (ICLR). 2023.
[81] Zhen Qin, Xiaodong Han, Weixuan Sun, Dongxu Li, Lingpeng Kong, Nick Barnes, and Yiran Zhong. "The devil in linear transformer". In: arXiv preprint arXiv:2210.10340 (2022).

[82] Zhen Qin, Weixuan Sun, Hui Deng, Dongxu Li, Yunshen Wei, Baohong Lv, Junjie Yan, Lingpeng Kong, and Yiran Zhong. "CosFormer: Rethinking Softmax in Attention". In: The International Conference on Learning Representations (ICLR). 2022.

[83] Ali Rahimi and Benjamin Recht. "Random features for large-scale kernel machines". In: Advances in Neural Information Processing Systems 20 (2007).

[84] Prajit Ramachandran, Barret Zoph, and Quoc V Le. "Swish: A Self-gated Activation Function". In: arXiv preprint arXiv:1710.05941 7.1 (2017), p. 5.

[85] David W Romero, Anna Kuzina, Erik J Bekkers, Jakub M Tomczak, and Mark Hoogendoorn. "CKConv: Continuous Kernel Convolution For Sequential Data". In: arXiv preprint arXiv:2102.02611 (2021).

[86] Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. "Winogrande: An Adversarial Winograd Schema Challenge at Scale". In: Communications of the ACM 64.9 (2021), pp. 99–106.

[87] George Saon, Ankit Gupta, and Xiaodong Cui. "Diagonal State Space Augmented Transformers for Speech Recognition". In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE. 2023, pp. 1–5.

[88] Imanol Schlag, Kazuki Irie, and Jürgen Schmidhuber. "Linear Transformers are Secretly Fast Weight Programmers". In: The International Conference on Machine Learning (ICML). PMLR. 2021, pp. 9355–9366.

[89] Noam Shazeer. "GLU Variants Improve Transformer". In: arXiv preprint arXiv:2002.05202 (2020).

[90] Freda Shi, Xinyun Chen, Kanishka Misra, Nathan Scales, David Dohan, Ed H Chi, Nathanael Schärli, and Denny Zhou. "Large Language Models can be Easily Distracted by Irrelevant Context". In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 31210–31227.

[91] Jiaxin Shi, Ke Alexander Wang, and Emily Fox. "Sequence Modeling with Multiresolution Convolutional Memory". In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 31312–31327.

[92] Jimmy TH Smith, Andrew Warrington, and Scott W Linderman. "Simplified State Space Layers for Sequence Modeling". In: The International Conference on Learning Representations (ICLR). 2023.

[93] Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu. "Roformer: Enhanced Transformer with Rotary Position Embedding". In: arXiv preprint arXiv:2104.09864 (2021).

[94] Yutao Sun, Li Dong, Shaohan Huang, Shuming Ma, Yuqing Xia, Jilong Xue, Jianyong Wang, and Furu Wei. "Retentive network: A successor to transformer for large language models". In: arXiv preprint arXiv:2307.08621 (2023).

[95] Ilya Sutskever, Oriol Vinyals, and Quoc V Le. "Sequence to Sequence Learning with Neural Networks". In: Advances in Neural Information Processing Systems (NeurIPS) 27 (2014).

[96] Corentin Tallec and Yann Ollivier. "Can Recurrent Neural Networks Warp Time?" In: The International Conference on Learning Representations (ICLR). 2018.
[97] Yi Tay, Mostafa Dehghani, Samira Abnar, Yikang Shen, Dara Bahri, Philip Pham, Jinfeng Rao, Liu Yang, Sebastian Ruder, and Donald Metzler. "Long Range Arena: A Benchmark for Efficient Transformers". In: International Conference on Learning Representations (ICLR). 2021.

[98] Yi Tay, Mostafa Dehghani, Dara Bahri, and Donald Metzler. "Efficient Transformers: A Survey". In: ACM Computing Surveys 55.6 (2022), pp. 1–28.

[99] Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, et al. "Llama: Open and Efficient Foundation Language Models". In: arXiv preprint arXiv:2302.13971 (2023).

[100] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. "Attention Is All You Need". In: Advances in Neural Information Processing Systems (NeurIPS). 2017.

[101] Eugene Vorontsov, Chiheb Trabelsi, Samuel Kadoury, and Chris Pal. "On Orthogonality and Learning Recurrent Networks with Long Term Dependencies". In: International Conference on Machine Learning. PMLR. 2017, pp. 3570–3578.

[102] Jue Wang, Wentao Zhu, Pichao Wang, Xiang Yu, Linda Liu, Mohamed Omar, and Raffay Hamid. "Selective Structured State-Spaces for Long-form Video Understanding". In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2023, pp. 6387–6397.

[103] Pete Warden. "Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition". In: ArXiv abs/1804.03209 (2018).

[104] Samuel Williams, Andrew Waterman, and David Patterson. "Roofline: An Insightful Visual Performance Model for Multicore Architectures". In: Communications of the ACM 52.4 (2009), pp. 65–76.

[105] Brandon Yang, Gabriel Bender, Quoc V Le, and Jiquan Ngiam. "CondConv: Conditionally Parameterized Convolutions for Efficient Inference". In: Advances in Neural Information Processing Systems (NeurIPS) 32 (2019).

[106] Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. "HellaSwag: Can a Machine Really Finish Your Sentence?" In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics. 2019.

[107] Shuangfei Zhai, Walter Talbott, Nitish Srivastava, Chen Huang, Hanlin Goh, Ruixiang Zhang, and Josh Susskind. "An Attention Free Transformer". In: arXiv preprint arXiv:2105.14103 (2021).

[108] Michael Zhang, Khaled K Saab, Michael Poli, Tri Dao, Karan Goel, and Christopher Ré. "Effectively Modeling Time Series with Simple Discrete State Spaces". In: The International Conference on Learning Representations (ICLR). 2023.

[109] Lin Zheng, Chong Wang, and Lingpeng Kong. "Linear complexity randomized self-attention mechanism". In: International Conference on Machine Learning. PMLR. 2022, pp. 27011–27041.

[110] Simiao Zuo, Xiaodong Liu, Jian Jiao, Denis Charles, Eren Manavoglu, Tuo Zhao, and Jianfeng Gao. "Efficient Long Sequence Modeling via State Space Augmented Transformer". In: arXiv preprint arXiv:2212.08136 (2022).
# A Discussion: Selection Mechanism

Our selection mechanism is inspired by and related to concepts such as gating, hypernetworks, and data-dependence. It can also be viewed as related to "fast weights" (J. Ba et al. 2016), which connects classical RNNs with the mechanism of linear attention (Schlag, Irie, and Schmidhuber 2021). However, we believe that it is a distinct concept that is worth clarifying.

Gating. Gating originally referred to the gating mechanisms of RNNs such as the LSTM (Hochreiter and Schmidhuber 1997) and GRU (J. Chung et al. 2014), or the gated equation (5) in Theorem 1. This was interpreted as a particular mechanism for controlling whether to let an input into the hidden state of an RNN. In particular, this affects the propagation of signal through time and causes inputs to interact along the sequence length dimension.

However, the concept of gating has since been relaxed in popular usage to simply mean any multiplicative interaction (often with an activation function). For example, elementwise multiplicative components of neural network architectures (that do not interact along sequence length) are now commonly referred to as gated architectures (Hua et al. 2022; Mehta et al. 2023), despite a very different meaning than the original RNN sense. Thus we believe the original concept of RNN gating versus the popular usage of multiplicative gating actually have a very different semantic meaning.

Hypernetworks. Hypernetworks refer to neural networks whose parameters are themselves generated by smaller neural networks. The original idea (Ha, Dai, and Quoc V. Le 2017) used it in a narrow sense to define a large RNN whose recurrent parameters are generated by a smaller RNN.

Data-dependence. Similar to hypernetworks, data-dependence can refer to any notion where some parameters of the model depend on the data (Poli et al. 2023).

Example: GLU Activation. To illustrate the issues with these concepts, consider a simple diagonal linear layer y = Dx, where D is a diagonal weight parameter. Now suppose that D is itself generated from a linear transformation of x, with an optional nonlinearity: D = σ(Wx). Since it is diagonal, the multiplication becomes an elementwise product: y = σ(Wx) ∘ x.

This is a rather trivial transformation, yet it technically satisfies the common meanings of gating (since it has a multiplicative "branch"), hypernetworks (since the parameter D is generated by another layer), and data-dependence (since D depends on the data x). However, this in fact simply defines a GLU function, which is so simple that it is often considered just an activation function (Dauphin et al. 2017; Shazeer 2020) instead of a meaningful layer.

Selection. Thus, while selection mechanisms could be considered a special case of ideas such as architectural gating, hypernetworks, or data-dependence, so can an enormous range of other constructions (essentially anything with a multiplication, including standard attention mechanisms (Bahdanau, Cho, and Bengio 2015; Vaswani et al. 2017) as well), and we find it uninformative to think of them as such.
Instead, we view it as most closely related to the gating mechanism of traditional RNNs, which is a special case (Theorem 1) and also has a deeper history of connections to SSMs through variable (input-dependent) discretization of ∆ (Funahashi and Nakamura 1993; Gu, Dao, et al. 2020; Tallec and Ollivier 2018). We also eschew the term "gating" in favor of selection to clarify the overloaded use of the former. More narrowly, we use selection to refer to the mechanistic action of a model to select or ignore inputs and facilitate data interaction along the sequence length (Section 3.1). Beyond selective SSMs and gated RNNs, other examples may include input-dependent convolutions (Kosma, Nikolentzos, and Vazirgiannis 2023; Lioutas and Guo 2020; Lutati, Zimerman, and Wolf 2023; Yang et al. 2019) and even attention.

# B Related Work

We overview several prior works related to our methods. We mention that some of the most closely related models include recurrent layers such as S4, S5, and quasi-RNNs; as well as end-to-end architectures such as H3, RetNet, and RWKV.

# B.1 S4 Variants and Derivatives

We describe a brief overview of some structured SSMs from past work, particularly those that have a relation to our method.

• S4 (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) introduced the first structured SSM, describing diagonal structure and diagonal plus low-rank (DPLR). It focused on efficient convolutional algorithms for DPLR SSMs due to a connection to continuous-time online memorization (HIPPO) (Gu, Dao, et al. 2020).

• DSS (Gupta, Gu, and Berant 2022) first discovered the empirical effectiveness of diagonal structured SSMs by approximating the HIPPO initialization. This was expanded on theoretically in S4D (Gu, Gupta, et al. 2022).

• S5 (Smith, Warrington, and Linderman 2023) independently discovered the diagonal SSM approximation, and is the first S4 model to be computed recurrently with the parallel scan. However, this required lowering the effective state dimension, which they accomplished by switching the SSM dimensions from a SISO (single-input single-output) to MIMO (multi-input multi-output) formulation. Our proposed S6 shares the scan, but differs by (i) keeping the SISO dimensions, which provides a larger effective recurrent state, (ii) using a hardware-aware algorithm to overcome the computation issue, (iii) adding the selection mechanism.

Lu et al. (2023) applied S5 to meta-RL in order to handle resetting the SSM state between episode trajectories. Their mechanism can be viewed as a particular hard-coded instance of a selection mechanism, where A is manually set to 0, instead of our learnable mechanism that depends on the input. It would be interesting to apply selective SSMs generically to this setting and probe if the model has learned to automatically reset its state on episode boundaries.

• Mega (Ma et al. 2023) introduced a simplification of S4 to be real- instead of complex-valued, giving it an interpretation of being an exponential moving average (EMA). They additionally make an interesting connection of the discretization step of SSMs to an EMA damping term.
Contrary to ï¬\\x81ndings in the original S4 papers, this was the ï¬\\x81rst model to show that real-valued SSMs are empirically eï¬\\x80ective in certain settings or when combined with diï¬\\x80erent architectural components.\\nâ\\x80¢ Liquid S4 (Hasani et al. 2023) is also motivated by augmenting S4 with an input-dependent state transition. From this perspective it shares similarity to selection mechanisms, although in a limited form which is still computed convolutionally and close to LTI.\\nâ\\x80¢ SGConv (Y. Li et al. 2023), Hyena (Poli et al. 2023), LongConv (Fu et al. 2023), MultiresConv (J. Shi, K. A. Wang, and Fox 2023), and Toeplitz Neural Network (Qin, Han, W. Sun, He, et al. 2023) all focus on the convolutional representation of S4 and create global or long convolution kernels with diï¬\\x80erent parameterizations. However, these methods cannot do fast autoregressive inference directly.\\nNotably, all of these methods, and all other structured SSMs that we are aware of, have been non-selective and usually strictly LTI (linear time invariant).\\n# B.2 SSM Architectures\\nWe use SSM architectures or state space neural networks (SSNN) to refer to deep neural network architectures incorporating one of the previous SSMs as a black box layer.\\nâ\\x80¢ GSS (Mehta et al. 2023) was the ï¬\\x81rst gated neural network architecture incorporating SSMs. It is motivated by the gated attention unit (GAU) of Hua et al. (2022) and looks quite similar to our block, except with additional projections. Most importantly, its projection contracts the model dimension to reduce the state size of the SSM, while ours expands the model dimension in order to increase the state size, based on the motivation in Section 3.1.\\n25\\nâ\\x80¢ Mega (Ma et al. 2023) combined the EMA simpliï¬\\x81cation of S4 described above into a hybrid architecture using an eï¬\\x83cient attention approximation.\\nâ\\x80¢ H3 (Dao, Fu, Saab, et al. 2023) is motivated by combining S4 with linear attention (Katharopoulos et al. 2020). It is the ï¬\\x81rst to generalize this formulation of linear attention to more general recurrences, which is also the basis of later architectures.\\nâ\\x80¢ Selective S4 (J. Wang et al. 2023) incorporates S4 as a black box to generate a binary mask which is multiplied on the input. While sharing the â\\x80\\x9cselectionâ\\x80\\x9d name, we consider this an architectural modiï¬\\x81cation that is closer to architectural gating than a selection mechanism (Appendix A). For example, we hypothesize that it would not solve the Selective Copying task because simply masking out the irrelevant inputs does not aï¬\\x80ect the spacing between the relevant ones (indeed, the Selective Copying task can even be viewed as coming pre-masked if the noise tokens are embedded to 0).\\nâ\\x80¢ RetNet (Y. Sun et al. 2023) is also based on Linear Attention and very similar to H3, but reduces the inner S4 layer to a special case where the state dimension is ð\\x9d\\x91\\x81 = 1. Although not framed as such, its recurrence can be viewed as a special case of a linear SSM.\\nIts primary source of improvement is using a linear attention with large head dimension, which can be viewed as another method to perform input-dependent state expansion. Using a larger head dimension in the context of linear attention variants was ï¬\\x81rst done by H3, but not extensively used since this requires a proportional amount of extra computation. 
RetNet avoids this with an alternate way to parallelize the computation with a variant of standard multi-head attention instead of convolutions, made feasible by their particular special case of SSMs which acts as a simple EMA.\\nâ\\x80¢ RWKV (B. Peng et al. 2023) is another recent RNN designed for language modeling. It is based on AFT (attention-free Transformer (S. Zhai et al. 2021)), another variant of linear attention. Its main â\\x80\\x9cWKVâ\\x80\\x9d mechanism involves LTI recurrences and can be seen as the ratio of two SSMs.\\nWe also highlight the gated attention unit (GAU) from Hua et al. (2022), which was motivated by combining the Transformerâ\\x80\\x99s MHA and MLP blocks together and was an inspiration for our architecture (Section 3.4) combining the H3 and MLP blocks.\\n# B.3 Relationship to RNNs\\nRNNs and SSMs are broadly related, as they both involve the concepts of recurrence on a latent state.\\nSeveral older RNNs such as the strongly typed RNN (Balduzzi and Ghifary 2016), quasi-RNN (QRNN) (Bradbury et al. 2016), and simple recurrent unit (SRU) (Lei 2021; Lei et al. 2017) involve forms of gated RNNs without time-wise nonlinearities. Because of the connections of gating mechanisms and selection mechanisms, these can be viewed as cases of selective SSMs, and are thus more powerful in a sense than the family of LTI structured SSMs above. The main diï¬\\x80erences are:\\nâ\\x80¢ They do not use state expansion (ð\\x9d\\x91\\x81 = 1) or selective B, C parameters, both of which are important for performance (Section 4.6).\\nâ\\x80¢ They use a heuristic gating mechanism, which we generalize as a consequence of the selection mechanism + discretization (Theorem 1). The connections to principled SSM theory provides better parameterizations and initializations (Section 3.6).\\nAdditionally, older RNNs famously suï¬\\x80ered from eï¬\\x83ciency issues and the vanishing gradients problem (Pascanu, Mikolov, and Bengio 2013), both caused by their sequential nature. The latter could be solved for some of the above RNNs by leveraging the parallel scan (Martin and Cundy 2018), but the former was diï¬\\x83cult without theory later developed for SSMs. For example, modern structured SSMs diï¬\\x80er in more careful parameterization of the recurrent dynamics inspired by classical SSM theory (e.g. through discretization (Gu, Johnson, Goel, et al. 2021; Gu, Johnson, Timalsina, et al. 2023)), or direct analysis (Orvieto et al. 2023)).\\nWe also note that there is a long line of work on orthogonal RNNs (Arjovsky, Shah, and Bengio 2016; Henaï¬\\x80, Szlam, and LeCun 2016; Lezcano-Casado and MartÃ\\xadnez-Rubio 2019; Mhammedi et al. 2017; Vorontsov et al. 2017)\\n26\\nwhich are motivated by constraining the A transition matrix to be orthogonal or unitary, in order to control its eigenvalues and prevent the vanishing gradient problem. However, these had other limitations; we believe that these stem from the fact that orthogonal/unitary RNNs are also LTI. For example, they are almost always evaluated on the Copying task which they can solve perfectly, but observed to struggle on the Selective Copying task (Jing et al. 2019).\\n# B.4 Linear Attention\\nThe Linear Attention (LA) (Katharopoulos et al. 2020) framework is an important result popularizing kernel attention and showing how it relates to recurrent autoregressive models. Many variants have proposed alternative kernels and other modiï¬\\x81cations. Random Feature Attention (RFA) (H. Peng et al. 
2021) chooses the kernel feature map to approximate softmax attention (i.e. the exp feature map) using the random Fourier feature approximation of Gaussian kernels (Rahimi and Recht 2007). Performer (Choromanski et al. 2021) ï¬\\x81nds an approximation to the exponential kernel involving only positive features, which also allows the softmax normalization term. TransNormer (Qin, Han, W. Sun, D. Li, et al. 2022) showed that the LA denominator term can be unstable and proposed replacing it with a LayerNorm. cosFormer (Qin, W. Sun, et al. 2022) augments RFA with a cosine reweighting mechanism that incorporates positional information to emphasize locality. Linear Randomized Attention (Zheng, C. Wang, and L. Kong 2022) generalize RFA from the perspective of importance sampling, and generalize it to provide better estimates of the full softmax kernel (rather than just the exp-transformed numerator).\\nAside from kernel attention, many other variants of eï¬\\x83cient attention exist; the survey Tay, Dehghani, Bahri, et al. (2022) oï¬\\x80ers an extensive categorization of many of these.\\n# B.5 Long Context Models\\nLong context has become a popular subject, and several recent models have claimed to scale to longer and longer sequences. However, these are often from a computational standpoint and have not been extensively validated. These include:\\nâ\\x80¢ Recurrent Memory Transformer (Bulatov, Kuratov, and Burtsev 2023), a lightweight wrapper around a Transformer backbone. It showed ability to generalize up to 1M sequences but only on synthetic memorization tasks; their main result is similar to our Induction Heads extrapolation experiment (Table 2).\\nâ\\x80¢ LongNet (Ding et al. 2023), which claimed to scale to 1B length but only evaluated on length < 100ð\\x9d\\x90¾ for actual tasks.\\nâ\\x80¢ Hyena and HyenaDNA (Nguyen, Poli, et al. 2023; Poli et al. 2023), which claimed to leverage up to 1M context. However, their experiments trained on proportionally more data at longer contexts, making it hard to conclude if quality improvements at 1M context are due to context length or due to more data and computation.\\nâ\\x80¢ Sparse Transformer (Child et al. 2019) showed a proof-of-concept of using a strided sparse attention Transformer to model audio waveforms of length 220 = 1048576, although did not discuss performance tradeoï¬\\x80s when controlling for computation and model size.\\nIn contrast, we believe this work presents one of the ï¬\\x81rst approaches to meaningfully demonstrate increasing performance with longer context.\\n# C Mechanics of Selective SSMs\\nProof of Theorem 1. Consider a selective SSM (Algorithm 2) with ð\\x9d\\x91\\x81 = 1, A = â\\x88\\x921, B = 1, ð\\x9d\\x91\\xa0â\\x88\\x86 = ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8b(ð\\x9d\\x91¥), ð\\x9d\\x9c\\x8fâ\\x88\\x86 = ð\\x9d\\x97\\x8cð\\x9d\\x97\\x88ð\\x9d\\x96¿ð\\x9d\\x97\\x8dð\\x9d\\x97\\x89ð\\x9d\\x97\\x85ð\\x9d\\x97\\x8eð\\x9d\\x97\\x8c. 
The corresponding continuous-time SSM (1) is

$$h'(t) = -h(t) + x(t),$$

which is also called a leaky integrator.

The discretization step size is

$$\Delta_t = \tau_\Delta(\mathsf{Parameter} + s_\Delta(x_t)) = \mathsf{softplus}(\mathsf{Parameter} + \mathsf{Linear}(x_t)) = \mathsf{softplus}(\mathsf{Linear}(x_t)),$$

where we observe that the parameter can be viewed as a learnable bias and folded into the linear projection.

Now applying the zero-order hold (ZOH) discretization formulas:

$$\overline{A}_t = \exp(\Delta A) = \frac{1}{1 + \exp(\mathsf{Linear}(x_t))} = \sigma(-\mathsf{Linear}(x_t)) = 1 - \sigma(\mathsf{Linear}(x_t))$$

$$\overline{B}_t = (\Delta A)^{-1}(\exp(\Delta A) - I) \cdot \Delta B = -(\exp(\Delta A) - I) = 1 - \overline{A}_t = \sigma(\mathsf{Linear}(x_t)).$$

Thus the final discrete recurrence (2a) is

$$g_t = \sigma(\mathsf{Linear}(x_t))$$

$$h_t = (1 - g_t)h_{t-1} + g_t x_t,$$

as desired.
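The key identity exp(ΔA) = 1 − σ(z) for Δ = softplus(z) and A = −1 can be checked numerically. Below is a small sketch (ours, not from the paper) that verifies it and runs the resulting gated recurrence:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
z = torch.randn(1000)                          # stands in for Linear(x_t)

# ZOH with A = -1 and Delta = softplus(z): exp(Delta * A) == 1 - sigmoid(z)
A_bar = torch.exp(-F.softplus(z))
assert torch.allclose(A_bar, 1 - torch.sigmoid(z), atol=1e-5)

# Resulting gated recurrence: h_t = (1 - g_t) h_{t-1} + g_t x_t
x = torch.randn(1000)
g = torch.sigmoid(z)
h = torch.zeros(())
for g_t, x_t in zip(g, x):
    h = (1 - g_t) * h + g_t * x_t              # the classic RNN gate, recovered from discretization
```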
# D Hardware-aware Algorithm For Selective SSMs

Without input-dependent selectivity, SSMs can be efficiently implemented as a convolution (Dao, Fu, Saab, et al. 2023; Gu, Goel, and Ré 2022), which leverages the fast Fourier transform (FFT) as a primitive. With selectivity, SSMs are no longer equivalent to convolution, but we leverage the parallel associative scan. While SSM scans are theoretically efficient (O(BLDN) FLOPs, scaling linearly in L), training foundation models with selective SSMs requires them to be efficient on modern hardware (GPUs) as well. We describe how we use kernel fusion and recomputation to make the SSM scan fast and memory-efficient. We evaluate the speed of our scan implementation compared to convolution and attention in Section 4.5, showing that it is up to 7× faster than attention at sequence length 32K, and is as memory-efficient as the best attention implementation (FlashAttention).

Speed. On modern hardware accelerators (GPUs), most operations (except matrix multiplication) are bounded by memory bandwidth (Dao, Fu, Ermon, et al. 2022; Ivanov et al. 2021; Williams, Waterman, and Patterson 2009). This is the case with our scan operation, and we use kernel fusion to reduce the amount of memory IOs, leading to a significant speedup compared to a standard implementation.

The standard way to implement the scan algorithm in Section 3.2 is to prepare the scan input A, B of size (B, L, D, N) in GPU HBM (high-bandwidth memory, commonly referred to as GPU memory), call a parallel associative scan implementation to write the scan output of size (B, L, D, N) to GPU HBM, and then multiply that scan output with C to produce an output of size (B, L, D). However, this requires a number of memory reads/writes on the order of O(BLDN). We can instead fuse the discretization step, the scan, and the multiplication with C into one kernel:

1. We read in O(BLD + DN) bytes of memory (Δ, A, B, C) from slow HBM to fast SRAM.
2. We discretize to produce A, B of size (B, L, D, N) in SRAM.
3. We perform a parallel associative scan, yielding intermediate states of size (B, L, D, N) in SRAM.
4. We multiply and sum with C, producing outputs of size (B, L, D), and write them to HBM.

This way, we reduce IOs by a factor of O(N) (the state dimension), which in practice speeds up the operation by 20-40 times (Section 4.5).

Table 11: (Induction heads.) Models are trained on sequence length 2^8 = 256, and tested on various sequence lengths from 2^6 = 64 up to 2^20 = 1048576. ✓ denotes perfect generalization accuracy, while ✗ denotes out of memory.

| Model | Params | 2^6 | 2^7 | 2^8 | 2^9 | 2^10 | 2^11 | 2^12 | 2^13 | 2^14 | 2^15 | 2^16 | 2^17 | 2^18 | 2^19 | 2^20 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MHA-Abs | 137K | ✓ | 99.6 | 100.0 | 58.6 | 26.6 | 18.8 | 9.8 | 10.9 | 7.8 | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ |
| MHA-RoPE | 137K | ✓ | ✓ | 100.0 | 83.6 | 31.3 | 18.4 | 8.6 | 9.0 | 5.5 | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ |
| MHA-xPos | 137K | ✓ | ✓ | 100.0 | 99.6 | 67.6 | 25.4 | 7.0 | 9.0 | 7.8 | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ |
| H3 | 153K | ✓ | ✓ | 100.0 | 80.9 | 39.5 | 23.8 | 14.8 | 8.2 | 5.9 | 6.6 | 8.2 | 4.7 | 8.2 | 6.3 | 7.4 |
| Hyena | 69M* | 97.7 | ✓ | 100.0 | ✓ | 44.1 | 12.5 | 6.6 | 5.1 | 7.0 | 5.9 | 6.6 | 6.6 | 5.9 | 6.3 | 9.8 |
| Mamba | 74K | ✓ | ✓ | 100.0 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |

* Most of the parameters are in learnable positional encodings.

For sequence lengths L too long to fit in SRAM (which is much smaller than HBM), we split the sequences into chunks and perform the fused scan on each chunk. As long as we have the intermediate scan states, we can continue the scan with the next chunk.
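For reference, a naive (unfused) selective scan might look like the following simplified PyTorch sketch, which materializes the discretized parameters and all intermediate states of size (B, L, D, N); this is exactly the O(BLDN) memory traffic that the fused kernel avoids. It is a sequential illustration under our own simplifications, not the paper's CUDA implementation.

```python
import torch

def selective_scan_reference(delta, A, B, C, x):
    """Naive selective SSM scan.
    delta, x: (batch, length, dim); A: (dim, state); B, C: (batch, length, state)."""
    # Discretize (ZOH for A, the simplified Euler-style rule for B)
    dA = torch.exp(delta.unsqueeze(-1) * A)                        # (batch, length, dim, state)
    dBx = delta.unsqueeze(-1) * B.unsqueeze(2) * x.unsqueeze(-1)   # (batch, length, dim, state)

    h = torch.zeros_like(dA[:, 0])                                 # (batch, dim, state)
    ys = []
    for t in range(x.shape[1]):                                    # sequential scan over length
        h = dA[:, t] * h + dBx[:, t]
        ys.append((h * C[:, t].unsqueeze(1)).sum(-1))              # contract the state dim with C
    return torch.stack(ys, dim=1)                                  # (batch, length, dim)

b, L, D, N = 2, 64, 8, 16
y = selective_scan_reference(torch.rand(b, L, D), -torch.rand(D, N),
                             torch.randn(b, L, N), torch.randn(b, L, N),
                             torch.randn(b, L, D))
```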
Memory. We describe how we use the classical technique of recomputation to reduce the total amount of memory required to train selective SSM layers.

From the way we fuse the forward pass, we do not save the intermediate states of size (B, L, D, N) in order to avoid memory blowup. However, these intermediate states are necessary for the backward pass to compute gradients. We instead recompute those intermediate states in the backward pass. Since the inputs Δ, A, B, C and the output gradient read from HBM to SRAM are of size O(BLN + DN), and the input gradients are also of size O(BLN + DN), recomputation avoids the cost of reading O(BLND) elements from HBM. This means that recomputation of the SSM states in the backward pass speeds up the computation compared to storing them and reading them from HBM.

Beyond optimizing for the memory requirement of just the scan operation, we also use recomputation to optimize the memory requirement of the entire selective SSM block (input projection, convolution, activation, scan, output projection). In particular, we do not save intermediate activations that take a lot of memory but are fast to recompute (e.g. the output of an activation function or a short convolution). As a result, the selective SSM layer has the same memory requirement as an optimized Transformer implementation with FlashAttention. In particular, each attention layer (FlashAttention) stores around 12 bytes of activations per token, and each MLP layer stores around 20 bytes of activations per token, for a total of 32 bytes (assuming mixed-precision training in FP16 or BF16). Each selective SSM stores around 16 bytes of activations per token. Hence two layers of selective SSMs have around the same activation memory as an attention layer and an MLP layer.

# E Experimental Details and Additional Results

# E.1 Synthetic Tasks

Selective Copying. Our setting is on sequences of length 4096, with a vocab size of 16 possible tokens (including the white "noise" token from Figure 2), requiring models to memorize 16 "data" tokens. We use 2-layer models with a model dimension of D = 64.

Models are trained for 400K steps at a constant learning rate of 0.0001 with a batch size of 64.

Induction Heads. Training consists of randomly generating data every step, with a batch size of 8. We choose an "epoch" size of 8192 steps, and track the accuracy on fixed validation sets (also randomly generated) of each target sequence length. For the MHA-Abs and Mamba models, results are reported after the 25th epoch (8192 × 25 = 204800 steps). For the MHA-RoPE and MHA-xPos models, results are reported after the 50th epoch (8192 × 50 = 409600 steps). For the LTI H3 and Hyena models, results are reported after the 10th epoch (81920 steps) because they had converged by then and failed to improve further.

We use the Adam optimizer with no weight decay. All models are trained at constant learning rates 2e-4 and 1e-3, and the better result is reported for each model (2e-4 for all models except Mamba). The attention and Hyena models did not learn at LR 1e-3. H3 learned at both LRs, but interestingly generalized better to shorter sequences at the smaller LR of 2e-4. Mamba learned at both LRs, but extrapolated better at the larger LR of 1e-3.
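As an aside, a minimal data-generation sketch for the Selective Copying setup described in E.1 above is shown below; the exact token layout (noise token id, where the targets are placed) is our own assumption and is not prescribed by this appendix.

```python
import torch

def selective_copying_batch(batch=64, length=4096, vocab=16, n_data=16):
    """Noise token = 0; data tokens drawn from {1, ..., vocab-1} and scattered
    at random positions. Target = the data tokens in their original order."""
    x = torch.zeros(batch, length, dtype=torch.long)             # all noise
    data = torch.randint(1, vocab, (batch, n_data))
    for b in range(batch):
        pos = torch.randperm(length)[:n_data].sort().values      # random, ordered positions
        x[b, pos] = data[b]
    return x, data                                               # inputs, targets

x, y = selective_copying_batch()
```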
Table 12: (Scaling Law Model Sizes.) Our model sizes and hyperparameters for scaling experiments. (Model dimension and number of heads applies only to Transformer models.)

| Params | n_layers | d_model | n_heads / d_head | Training steps | Learning rate | Batch size | Tokens |
|---|---|---|---|---|---|---|---|
| 125M | 12 | 768 | 12 / 64 | 4800 | 6e-4 | 0.5M tokens | 2.5B |
| 350M | 24 | 1024 | 16 / 64 | 13500 | 3e-4 | 0.5M tokens | 7B |
| 760M | 24 | 1536 | 16 / 96 | 29000 | 2.5e-4 | 0.5M tokens | 15B |
| 1.3B | 24 | 2048 | 32 / 64 | 50000 | 2e-4 | 0.5M tokens | 26B |

# E.2 Language Modeling

# E.2.1 Scaling Law Details

All models were trained on the Pile.

Model Sizes. Table 12 specifies the model sizes we use for scaling laws. This is taken directly from the GPT3 specifications (Brown et al. 2020), with very minor modifications. First, we changed the batch size of the 1.3B model from 1M tokens to 0.5M tokens, since we did not use enough parallelization to require the larger batch size. Second, we changed the number of training steps and total tokens to roughly match Chinchilla scaling laws (Hoffmann et al. 2022), which specify that training tokens should increase proportionally to model size.

Training Recipes. All models used the AdamW optimizer with

• gradient clip value 1.0
• weight decay 0.1
• no dropout
• linear learning rate warmup with cosine decay

By default, the peak learning rate is the GPT3 specification.

We give several models an "improved recipe", inspired by changes adopted by popular large language models such as PaLM (Chowdhery et al. 2023) and LLaMa (Touvron et al. 2023). These include:

• linear learning rate warmup with cosine decay to 1e-5, with a peak value of 5× the GPT3 value
• no linear bias terms
• RMSNorm instead of LayerNorm
• AdamW hyperparameter β = (.9, .95) (the GPT3 value) instead of the PyTorch default of β = (.9, .999)
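A minimal sketch of the optimizer and schedule portion of this improved recipe in PyTorch; the model, warmup length, and step count are placeholders (the peak LR follows the 5× rule applied to the 125M GPT3 value), so this is an illustration under our assumptions rather than the exact training code.

```python
import math
import torch

model = torch.nn.Linear(10, 10)            # placeholder model
peak_lr, min_lr = 5 * 6e-4, 1e-5           # 5x the GPT3 value for the 125M size, decay to 1e-5
warmup, total = 100, 4800                  # warmup steps are a placeholder

opt = torch.optim.AdamW(model.parameters(), lr=peak_lr,
                        betas=(0.9, 0.95), weight_decay=0.1)

def lr_at(step):
    # linear warmup, then cosine decay down to min_lr
    if step < warmup:
        return peak_lr * step / warmup
    t = (step - warmup) / max(1, total - warmup)
    return min_lr + 0.5 * (peak_lr - min_lr) * (1 + math.cos(math.pi * t))

for step in range(total):
    for group in opt.param_groups:
        group["lr"] = lr_at(step)
    # ... forward / backward pass goes here ...
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # gradient clip value 1.0
    opt.step()
    opt.zero_grad()
```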
Architecture and Training Details. Our models are:

• Transformer: The standard Transformer based on GPT3 (Table 12).

• Transformer++: A Transformer with an improved architecture, namely rotary positional encodings (Su et al. 2021) and SwiGLU MLP (Shazeer 2020), and the improved training recipe above.

• Hyena: Interleaving a Hyena block (the H3 block with S4 replaced by a global convolution parameterized by an MLP) with standard MLP blocks. The MLP blocks have an expansion factor of 2 instead of 4, and the number of layers is correspondingly increased by 1.5× to preserve the parameter count.

• H3++: The H3 architecture with a few modifications, including (i) using the same "thin" Hyena dimensions above, (ii) the improved training recipe above, and (iii) a linear attention head dimension of 8.

• RWKV: The default RWKV model from B. Peng et al. (2023), including its modified MLP block. We also used as much of its specified training recipe as possible, such as increasing the learning rates by 2× or 3× on certain parameters.

• RetNet: The default RetNet model from Y. Sun et al. (2023). We also gave it the improved training recipe above.

• Mamba: The standard Mamba architecture, with the improved training recipe.

# E.2.2 Additional Scaling Law Ablations

We perform additional ablations on the architecture using the same protocol as the 2k context length scaling laws in Figure 4 (Left).

Mamba Architecture: Interleaving Blocks. We test the effect of different architectural blocks combined with the Mamba block. We focus on the viewpoint that the Mamba block is simply the standard SwiGLU block with an extra conv → SSM path added (see the sketch below). This leads to two natural ablations:

• What if the Mamba block is interleaved with a standard MLP block, instead of stacked homogenously? This can also be interpreted as taking Mamba and removing half of the SSMs.

• What if the Mamba block is interleaved with MHA (multi-head attention) blocks? This can also be interpreted as taking a Transformer with SwiGLU MLPs (i.e. what we call Transformer++) and simply adding SSMs to the MLP blocks.

Figure 9 (Right) shows these variants compared to the original (homogenous) Mamba architecture. Interestingly, neither change matters too much. The Mamba-MLP architecture is only slightly worse, and still better than all models except Transformer++. The Mamba-MHA architecture is only slightly better, which is somewhat surprising in light of the fact that many recent works have found that combining (LTI) SSMs with attention can lead to substantial improvements (Dao, Fu, Saab, et al. 2023; Fathi et al. 2023; Fathullah et al. 2023; Saon, Gupta, and Cui 2023; Zuo et al. 2022).
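The schematic sketch below (our own simplification; `ssm` is a placeholder for the selective SSM layer, and the kernel size and expansion factor are illustrative) shows this viewpoint: a SwiGLU-style gated block whose main branch additionally passes through a short convolution and an SSM.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLUBlock(nn.Module):
    def __init__(self, d, e=2):
        super().__init__()
        self.w_in, self.w_gate = nn.Linear(d, e * d), nn.Linear(d, e * d)
        self.w_out = nn.Linear(e * d, d)

    def forward(self, x):
        return self.w_out(F.silu(self.w_gate(x)) * self.w_in(x))

class MambaLikeBlock(SwiGLUBlock):
    """Same gated structure, but the main branch goes through conv -> SSM."""
    def __init__(self, d, e=2):
        super().__init__(d, e)
        self.conv = nn.Conv1d(e * d, e * d, kernel_size=4, padding=3, groups=e * d)
        self.ssm = nn.Identity()   # placeholder for the selective SSM (S6) layer

    def forward(self, x):
        u = self.w_in(x)                                                    # (batch, length, e*d)
        u = self.conv(u.transpose(1, 2))[..., : x.shape[1]].transpose(1, 2) # causal short conv
        u = self.ssm(u)
        return self.w_out(F.silu(self.w_gate(x)) * u)

y = MambaLikeBlock(64)(torch.randn(2, 32, 64))
```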
H3 Architecture: Training Recipes. Next we ablate differences between the Hyena and H3++ models, our weakest and strongest models outside of Transformer++ and Mamba, particularly to isolate the effect of training recipes.

• Hyena: The Hyena block with its original architecture and GPT3 training recipe (same as Figure 4).

• Hyena+: The same architecture but with the improved training recipe described above.

• H3+: The same architecture as Hyena+, but with the Hyena convolution kernel swapped out for the S4D convolution kernel.

• H3++: The same as H3+, but with a linear attention head dimension of 8. This increases computation inside the SSM recurrence but does not increase parameters.

Our general convention is that "Model+" represents the base model with the improved training recipe, and "Model++" also allows for architectural changes.

Figure 9 (Right) shows that:

• A large improvement is achieved by the improved training recipe, which was used for many of the models in the main Figure 4 (RetNet, H3++, Transformer++, Mamba).
• The choice of the inner LTI SSM does not matter (e.g. Hyena vs. S4), consistent with findings throughout this paper.
• The head dimension expansion improves performance, consistent with one of our main themes that expanded state dimension improves performance for SSMs (Section 3).

Figure 9: (Scaling laws: extra ablations.) Both panels show scaling laws on the Pile (sequence length 2048), plotting loss against FLOPs (log scale), for the Mamba block-interleaving variants and the Hyena/H3 training-recipe variants described above.

# E.2.3 Downstream Evaluation Details

This pretraining procedure is the same as the scaling law protocol, but extended to 300B tokens. For the 1.3B model, we use a batch size of 1M tokens to be consistent with the GPT3 specifications. We report the perplexity on the Pile validation set, and for this metric only compare to models trained on the same dataset and with the same tokenizer, in particular Pythia and RWKV.

For downstream evaluation, we use the LM evaluation harness from EleutherAI (L. Gao, Tow, et al. 2021), as done by most work in this area. We evaluate on the following tasks/datasets that measure common sense reasoning:

• LAMBADA (Paperno et al. 2016).
• HellaSwag (Zellers et al. 2019).
• PIQA (Bisk et al. 2020).
• ARC-challenge (P. Clark et al. 2018).
• ARC-easy: an easy subset of ARC-challenge.
• WinoGrande (Sakaguchi et al. 2021).

We report accuracy for LAMBADA, WinoGrande, PIQA, and ARC-easy, and accuracy normalized by sequence length for HellaSwag and ARC-challenge (since normalized accuracy is higher for almost all models on these tasks).

# E.3 DNA Modeling

# E.3.1 Pretraining Details

We describe the dataset and training procedure of the HG38 pretraining task in more detail.

The dataset follows the splits from the prior Enformer work on genomics (Avsec et al. 2021); the training split contains a total of S = 34021 segments of length 2^17 = 131072 that cover the genome, for a total of approximately 4.5 billion tokens (DNA base pairs). These segments are tuples of (chromosome number, starting index, ending index), and can be extended if necessary (e.g. to get longer segments).

We deviate from HyenaDNA when the training sequence length is not 2^17. HyenaDNA always takes a fixed sub-segment (e.g. the beginning or middle of the prescribed segment), and thus for any training sequence length each epoch is fixed to 34021 samples and doesn't necessarily go through the whole genome. On the other hand, we use the entire training data:

• When the context length L is less than (or equal to) 2^17, we divide up each segment into non-overlapping sub-segments of length L, so that there are S × 2^17 / L total samples and S × 2^17 ≈ 4.5B tokens per epoch.

• When the context length L is greater than 2^17, we turn each segment into two samples, one that begins with the prescribed segment and one that ends with the prescribed segment. Thus each epoch has 2S samples and 2SL tokens. For example, at sequence length 2^18 = 262144 there are 4× as many tokens as the default, and at sequence length 2^20 there are 16× as many tokens.
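A sketch of this segmenting logic, under the assumption that each prescribed segment is addressed by coordinates into the full chromosome token array (the function name and boundary handling are ours):

```python
def dna_samples(chrom, start, end, L):
    """chrom: full chromosome token array; (start, end) is a prescribed segment of
    length 2**17. Returns the training samples this segment contributes at context
    length L (boundary clipping at chromosome ends omitted for brevity)."""
    seg_len = end - start                    # 2**17 in the setup above
    if L <= seg_len:
        # non-overlapping sub-segments of length L
        return [chrom[i : i + L] for i in range(start, end - L + 1, L)]
    # L > seg_len: one sample that begins with the segment, one that ends with it
    return [chrom[start : start + L], chrom[end - L : end]]

samples = dna_samples(list(range(10 ** 6)), start=2 ** 17, end=2 ** 18, L=2 ** 15)
```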
Other training details generally follow the same protocol as our language modeling experiments (Appendix E.2). For example, we use AdamW with (β1, β2) = (0.9, 0.95), no dropout, and weight decay 0.1. We use a cosine learning rate scheduler with linear warmup for 10% of total steps.

# E.3.2 Scaling: Model Size Details

Models. The models we consider are:

• Transformer++: a Transformer with improved architecture, notably the usage of RoPE positional encodings (Su et al. 2021). Informally, we found these to be noticeably better than the vanilla positional encodings from (Vaswani et al. 2017).

• HyenaDNA: the Hyena model from Nguyen, Poli, et al. (2023) and Poli et al. (2023), which is roughly a Transformer with the MHA block replaced by an H3 block using a global convolution parameterized by an MLP.

• Mamba: the standard Mamba architecture.

Model Sizes. We use the following model sizes.

| Blocks | Model Dimension | Params (Approx.) |
|---|---|---|
| 4 | 64 | 250K |
| 5 | 96 | 700K |
| 6 | 128 | 1.4M |
| 7 | 192 | 3.5M |
| 8 | 256 | 7.0M |
| 10 | 384 | 19.3M |
| 12 | 512 | 40.7M |

Note that the number of blocks for Mamba is doubled, because one Transformer "layer" includes both the MHA and MLP blocks (and similarly for Hyena), which requires two Mamba blocks to match parameters (Section 3.4).

Training. For each model (Transformer++, HyenaDNA, Mamba), we swept the learning rate across {1e-3, 2e-3, 4e-3, 8e-3}. The optimal Transformer and HyenaDNA learning rates were 2e-3 across all sizes. The optimal Mamba learning rate was 8e-3; note that Mamba performed better than the baselines with matched learning rates (2e-3), but was more stable and improved even more at higher learning rates. (Furthermore, as this LR is at the upper end of the sweep, it is possible that our results are still suboptimal.)

Note that, in contrast to standard LM scaling laws (Table 12), our LR is held constant across model sizes for simplicity. The optimal LR should go down for larger models, but we didn't find a noticeable effect at the small model sizes (at most a few million parameters) we considered.

# E.3.3 Scaling: Context Length Details

We use a total batch size of 2^24 ≈ 16M tokens per training step, for every sequence length (e.g. at length 2^20 there are 16 segments per batch and at length 2^10 there are 16384 segments per batch). This is a large batch size relative to the model size by usual LM standards, but note that a batch size of 2^23 is the minimum possible on a machine with 8 GPUs and sequence length of 2^20, and that HyenaDNA used much larger batches of 2^28.
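For concreteness, with a fixed budget of 2^24 tokens per step, the number of segments per batch at each context length is simply the quotient below:

```python
TOKENS_PER_STEP = 2 ** 24                 # ~16M tokens per gradient step

for log_len in range(10, 21):             # context lengths 2^10 ... 2^20
    L = 2 ** log_len
    print(L, TOKENS_PER_STEP // L)        # e.g. 2^10 -> 16384 segments, 2^20 -> 16
```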
The learning rate used was 0.008 for Mamba and 0.001 for HyenaDNA; we initially attempted to use the same learning rate of 0.002 from the previous section for HyenaDNA, but found that it was unstable at the longest context length.

Sequence Length Warmup. Following (Nguyen, Poli, et al. 2023), we use sequence length warmup (SLW) during pretraining. We choose a simple schedule of 2 epochs at each power-of-two sequence length, starting from 2^10 = 1024. (Note that because of how the data is curated, proportionally more steps and tokens are spent at the longest sequence lengths. In particular, each stage up to length 2^17 processes the same number of tokens, but 4× as many tokens are processed at length 2^18, 8× as many at length 2^19, and 16× as many at length 2^20.)

Unlike HyenaDNA, we always control for the number of tokens per gradient update, so the batch size is successively halved as the sequence lengths are doubled in each stage.

Table 13: (Great Apes DNA Classification.) Accuracy after fine-tuning on sequences of length 2^10 = 1024 up to 2^20 = 1048576 using pretrained models of the same context length. Random guessing is 20%.

Remark E.1. We also note that the schedule was not tuned, and we never experimented with turning off sequence length warmup for these pretraining experiments. We later found that SLW did not help noticeably for audio pretraining at similar lengths (Section 4.4), and it is possible that it is not necessary for DNA pretraining either.

# E.3.4 Species (Great Apes) Classification

Models are causal and therefore only the last element (across the sequence length) of the model's output is used for the classification head. Note that we control for the total number of elements in the loss function per gradient step. The pretraining objective includes all positions across the sequence length, so that batch_size × sequence_length is held constant; in other words, the batch size decreases as the sequence length increases. However, for a classification task, since only the last position enters the loss, the batch size itself is held constant. Note that this also means that fine-tuning models with longer sequence lengths is more computationally expensive.
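A minimal sketch of the last-position classification head on top of a causal backbone (the GRU here is only a stand-in for the actual sequence model, and the input featurization is illustrative):

```python
import torch
import torch.nn as nn

d_model, n_classes = 256, 5                      # 5 great ape species (random guessing = 20%)
backbone = nn.GRU(4, d_model, batch_first=True)  # stand-in for a causal Mamba backbone
head = nn.Linear(d_model, n_classes)

tokens = torch.randn(8, 1024, 4)                 # (batch, sequence_length, input features)
hidden, _ = backbone(tokens)                     # (batch, sequence_length, d_model)
logits = head(hidden[:, -1])                     # only the last position feeds the head
loss = nn.functional.cross_entropy(logits, torch.randint(0, n_classes, (8,)))
```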
Training consists of 10 epochs, each of which has 1024 gradient steps. Each gradient step uses batch size 64, with each example independently drawn by uniformly picking a species, uniformly picking a chromosome, and then uniformly picking a contiguous segment of DNA. Following (Nguyen, Poli, et al. 2023), models with a maximum context length greater than 2^14 = 16384 use sequence length warmup with 1 epoch at length 2^14 = 16384, 1 epoch at length 2^15 = 32768, 1 epoch at length 2^16 = 65536, and so on up to the maximum sequence length. For example, the model with 2^20 = 1048576 context undergoes 6 epochs of sequence length warmup before 4 more epochs at its maximum sequence length.

The learning rate for all Hyena models is 4e-5, while the learning rate for all Mamba models is 1e-4. These were found by performing learning rate sweeps for each model among {1e-5, 2e-5, 4e-5, 1e-4, 2e-4} for the smaller sequence lengths (2^10, 2^12, 2^14, 2^16), and these values were consistently found to be the best for each model. An abridged learning rate sweep was done at length 2^18, which agreed with these values, and a single run at length 2^20 was performed (as described above, the computational cost of these experiments is proportional to the sequence length). The learning rate followed a cosine decay schedule, with 5 epochs of linear warmup to the maximum learning rate and 5 epochs of cosine decay down to 1e-6. The unusually long learning rate warmup schedule was chosen because the sequence length warmup was also long (e.g. comprising 6 out of 10 epochs for the model with context length 2^20); we did not experiment with this choice.

Results for the Species classification task are in Table 13.

# E.4 Audio Details

# E.4.1 YouTubeMix Audio Pretraining

Model. We use a model with 3 blocks per stage (3 × 5 = 15 total Mamba blocks), pooling factor p = 16, and outer dimension D = 64, for about 3.5M parameters.

Dataset. The data is mu-law encoded at 8 bits, so the model is modeling discrete tokens with a vocab size of 256. The dataset consists of clips of up to 1 minute long, or length 960000, which are subsampled and divided into segments of any desired sequence length. Since the architecture involves two stages of pooling by a factor of 16, and we want the resulting sequence length to be a multiple of 8 for hardware efficiency, the longest possible sequence is 468 × 2048 = 958464. The rest of our sequence lengths are defined by successively halving this and rounding up to the nearest multiple of 2048, as in the sketch below.

Table 14: YouTubeMix length scaling sequence lengths and batch sizes.

| Sequence length | Batch size | Tokens / batch |
|---|---|---|
| 468 × 2048 = 958464 | 1 | 958464 |
| 234 × 2048 = 479232 | 2 | 958464 |
| 117 × 2048 = 239616 | 4 | 958464 |
| 59 × 2048 = 120832 | 8 | 966656 |
| 30 × 2048 = 61440 | 16 | 983040 |
| 15 × 2048 = 30720 | 32 | 983040 |
| 8 × 2048 = 16384 | 64 | 1048576 |
| 4 × 2048 = 8192 | 128 | 1048576 |
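A small sketch (ours) that reproduces the sequence lengths in Table 14 from the halving-and-rounding rule just described:

```python
def youtubemix_lengths(longest=468 * 2048, multiple=2048, n=8):
    lengths = [longest]
    while len(lengths) < n:
        halved = lengths[-1] // 2
        lengths.append(-(-halved // multiple) * multiple)   # round up to a multiple of 2048
    return lengths

print(youtubemix_lengths())
# [958464, 479232, 239616, 120832, 61440, 30720, 16384, 8192]
```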
Table 14 lists the specifications used in Figure 7. Beyond the varying batch sizes, the number of valid segments in the training set varied between different sequence lengths (e.g. the number of training steps per epoch was not constant for different points in the graph), which may have contributed to kinks in the scaling curves.

Training. Models were trained for 200K training steps with a maximum learning rate of 0.002, 20K (10%) warmup steps, and weight decay 0.1 (similar to our general pretraining recipe across domains).

Additional Ablations: SSM Parameterizations. We investigate SSM parameterizations on long-form audio waveform pretraining in the setting of Figure 7. The setting is modified slightly to use larger models (8 layers and D = 64 for 6M params, the SaShiMi default), shorter sequences (2^11 = 2048 to 2^18 = 262144 instead of 2^13 to 2^20), lower LR (0.001 instead of 0.002), and shorter training cycles (100K instead of 200K steps).

Figure 10: (Audio Pretraining (YouTubeMix) Ablations.) As a uniformly-sampled "continuous" signal modality, audio waveforms actually benefit from LTI models, which have a matching inductive bias. (Left) Homogenous models (all blocks have the same parameterization). (Right) Only the center U-Net blocks are ablated; the outer blocks are Mamba-S4. The purple line is the same as in the left figure.

Figure 10 shows that the change from S4 → S6 (i.e. the selection mechanism) is not always beneficial. On long-form audio waveforms, it in fact significantly hampers performance, which may be intuitive from the point of view that audio is uniformly sampled and very smooth, and therefore benefits from continuous linear time-invariant (LTI) methods. After ablating away the selection mechanism, note that the resulting model is the S4 layer inside the Mamba block. To disambiguate, we call this Mamba-S4, as opposed to the default Mamba architecture Mamba-S6.

However, on the right side, we keep the outer layers of the U-Net Mamba-S4 and ablate only the inner layers. The performance differences shrink dramatically; this reinforces the hypothesis that layers closer to the raw audio signal should be LTI, but once they are "tokenized" and compressed by the outer layers, the inner layers no longer need to be LTI. In this setting, however, the real-valued SSM still underperforms the complex-valued one.

# E.4.2 SC09 Speech Generation

Autoregressive training largely followed the autoregressive language modeling protocol, such as:

• Weight decay 0.1
• Learning rate warmup for 10% of total steps
• AdamW optimizer with β = (0.9, 0.95)
• Gradient clip value 0.1

We used a learning rate of 0.002 and 200000 training steps at a batch size of 16.

The large Mamba model in Table 4 has 15 layers per stage with an outer dimension of D = 96 and pooling factor 4. We note that this dataset is small (training went through 100 epochs) and for this large model there was significant overfitting of the BPB or NLL. However, automated metrics of generated samples continually improved throughout training.

The models in the architecture ablations in Table 5 all have 8 layers per stage with an outer dimension of D = 64 and pooling factor 4. The S4+MLP block has roughly 2D² + 4D² parameters (expansion factor 2 in the MLP). The Transformer block has 4D² + 2D² parameters (expansion factor 1 in the MLP). The Mamba block has the usual ≈ 6D² parameters. All models have roughly 6M total parameters.
# E.5 Efficiency Benchmark

Scan Operation. We compare the core operation of selective SSMs, which is the parallel scan (Section 3.3), against convolution and attention, measured on an A100 80GB PCIe GPU. Note that these do not include the cost of other operations outside of this core operation, such as computing the convolutional kernel in global-convolution models, or computing the QKV projections in attention.

As a baseline, we implement a standard parallel scan in PyTorch with no kernel fusion. This requires materializing the parameters A, B, C in HBM.

Our scan implementation fuses the discretization step and the parallel scan, avoiding the cost of materializing all the large parameters in HBM.

For convolution, we use the standard implementation in PyTorch, which separately performs FFTs on the inputs and the filters, multiplies them in the frequency domain, then performs an inverse FFT to obtain the result. The theoretical complexity is O(L log L) for sequence length L.

For attention, we compare against the fastest implementation that we are aware of (FlashAttention-2 (Dao 2023)), with causal mask. Note that FlashAttention-2 with causal mask is about 1.7× faster than without causal mask, since approximately only half of the attention entries are computed. We use a batch size of 1 and increase the sequence length from 2^9 = 512, 2^10 ≈ 1K, 2^11 ≈ 2K, up to 2^19 ≈ 500K (some of the baselines run out of memory before reaching 500K). We use a model dimension of D = 1024 and state dimension N = 16. We measure with BF16 inputs, which is the data type most commonly used for large scale training.

End-to-end Inference. We measure the inference throughput of a Mamba 1.4B model and an untrained Mamba 6.9B model, against a standard Transformer (GPT3 architecture) at 1.3B and 6.7B size. We use the standard Transformer implementation in the Huggingface transformers library.

We set the prompt length to be 2048 and the generation length to be 128. We vary the batch size over 1, 2, 4, 8, 16, 32, 64, and 128, and measure the time taken to generate 128 tokens. We then calculate the throughput (tokens/s) as batch size × 128 / time taken. We repeat the measurements 3 times and take the average. Measurements are done on an A100 80GB PCIe GPU.
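A sketch of this throughput calculation (the `generate` callable stands in for whichever model is being benchmarked):

```python
import time
import torch

def benchmark_throughput(generate, batch_size, gen_len=128, repeats=3):
    """Average tokens/second over `repeats` runs of generate(batch_size, gen_len)."""
    times = []
    for _ in range(repeats):
        if torch.cuda.is_available():
            torch.cuda.synchronize()            # make GPU timing accurate
        start = time.time()
        generate(batch_size, gen_len)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        times.append(time.time() - start)
    return batch_size * gen_len / (sum(times) / len(times))

# dummy stand-in for an actual model.generate call
print(benchmark_throughput(lambda b, n: time.sleep(0.01), batch_size=8))
```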
Memory Benchmark. The memory usage simply scales proportionally to the size of the activation tensors, as with most deep sequence models. We report measurements of the training memory requirements of 125M models on one A100 80GB GPU. Each batch consists of sequences of length 2048. We compare to the most memory-efficient Transformer implementation we are aware of (with kernel fusion from torch.compile and with FlashAttention-2).

Table 15: (Memory benchmark.) Mamba's memory footprint is comparable to the most optimized Transformer. Results for 125M models.

| Batch size | Transformer (w/ FlashAttention-2) | Mamba |
|---|---|---|
| 1 | 4.6GB | 4.8GB |
| 2 | 5.2GB | 5.8GB |
| 4 | 6.9GB | 7.3GB |
| 8 | 11.5GB | 12.3GB |
| 16 | 20.7GB | 23.1GB |
| 32 | 34.5GB | 38.2GB |

Table 15 shows that Mamba's memory requirement is comparable to a similar-sized Transformer with an extremely optimized implementation, and we expect further improvement in Mamba's memory footprint in the future.', source='', source_type=, num_chunks=97, metadata={}, chunks=[
ResponseChunk(id='chunk_4c9533d0-7753-4ba7-9ee3-a1b72cb14db9', content='# Mamba: Linear-Time Sequence Modeling with Selective State Spaces # Albert Gu*1 and Tri Dao*2 1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me # Abstract Foundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformers' computational inefficiency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token. Second, even though this change prevents the use of efficient convolutions, we design a hardware-aware parallel algorithm in recurrent mode. We integrate these selective SSMs into a simplified end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5× higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation. # 1 Introduction Foundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an effective paradigm in modern machine learning.', chunk_index=1, num_tokens=401, metadata={}),
ResponseChunk(id='chunk_51bd8032-83a3-464d-b893-7f8011d3ae7e', content='The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014). While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015). The efficacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data. However, this property brings fundamental drawbacks: an inability to model anything outside of a finite window, and quadratic scaling with respect to the window length.
An enormous body of research has appeared on more efficient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that make it effective. As of yet, none of these variants have been shown to be empirically effective at scale across domains. Recently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling.', chunk_index=2, num_tokens=366, metadata={}),
ResponseChunk(id='chunk_4f61da3c-e726-4df6-b8e2-889d4a2b8004', content='These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960). This class of models can be computed very efficiently as either a recurrence or convolution, with linear or near-linear scaling in sequence length. Additionally, they have principled mechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021). Many flavors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 2022; Saon, Gupta, and Cui 2023). However, they have been less effective at modeling discrete and information-dense data such as text. We propose a new class of selective state space models that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length. Selection Mechanism. First, we identify a key limitation of prior models: the ability to efficiently select data in an input-dependent manner (i.e. focus on or ignore particular inputs).', chunk_index=3, num_tokens=368, metadata={}),
ResponseChunk(id='chunk_55104334-daf7-4217-8379-96a2c8716f10', content='Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input. This allows the model to filter out irrelevant information and remember relevant information indefinitely. Hardware-aware Algorithm. This simple change poses a technical challenge for the computation of the model; in fact, all prior SSM models must be time- and input-invariant in order to be computationally efficient. We overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between different levels of the GPU memory hierarchy. The resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3× faster on A100 GPUs). Architecture. We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scale linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and efficiency together yield performance improvements on real data up to sequence length 1M.', chunk_index=4, num_tokens=395, metadata={}),
ResponseChunk(id='chunk_5ad3c109-4eea-43f4-9781-cf615e2a8b35', content='We empirically validate Mamba's potential as a general sequence FM backbone, in both pretraining quality and domain-specific task performance, on several types of modalities and settings: • Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indefinitely long (>1M tokens). • Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transformers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences. • Language Modeling. Mamba is the first linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). Our Mamba language model has 5× generation throughput compared to Transformers of similar size, and Mamba-3B's quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B). Model code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba. # Selective State Space Model with Hardware-aware State Expansion. Figure 1: (Overview.) Structured SSMs independently map each channel (e.g.', chunk_index=5, num_tokens=398, metadata={}),
ResponseChunk(id='chunk_585e9287-715b-494e-8498-1b384401f40e', content='D = 5) of an input x to output y through a higher dimensional latent state h (e.g. N = 4). Prior SSMs avoid materializing this large effective state (DN, times batch size B and sequence length L) through clever alternate computation paths requiring time-invariance: the (Δ, A, B, C) parameters are constant across time. Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy. # 2 State Space Models Structured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, CNNs, and classical state space models. They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence x(t) ∈ ℝ ↦ y(t) ∈ ℝ through an implicit latent state h(t) ∈ ℝ^N. Concretely, S4 models are defined with four parameters (Δ, A, B, C), which define a sequence-to-sequence transformation in two stages. (1a) h′(t) = Ah(t) + Bx(t), (1b) y(t) = Ch(t); (2a) h_t = Āh_{t−1} + B̄x_t, (2b) y_t = Ch_t; (3a) K̄ = (CB̄, CĀB̄, …, CĀ^k B̄, …), (3b) y = x ∗ K̄. Discretization. The first stage transforms the', chunk_index=6, num_tokens=421, metadata={}),
ResponseChunk(id='chunk_6f546627-41de-48e5-bf1e-b29fa55c3ee1', content='"continuous parameters" (Δ, A, B) to "discrete parameters" (Ā, B̄) through fixed formulas Ā = f_A(Δ, A) and B̄ = f_B(Δ, A, B), where the pair (f_A, f_B) is called a discretization rule. Various rules can be used, such as the zero-order hold (ZOH) defined in equation (4): Ā = exp(ΔA), B̄ = (ΔA)^{−1}(exp(ΔA) − I) · ΔB (4). Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023). It also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5. However, from a mechanical point of view discretization can simply be viewed as the first step of the computation graph in the forward pass of an SSM. Alternate flavors of SSMs can bypass the discretization step and parameterize (Ā, B̄) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation. After the parameters have been transformed from (Δ, A, B, C) ↦ (Ā, B̄, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3). Commonly, the model uses the convolutional mode (3) for', chunk_index=7, num_tokens=405, metadata={}),
ResponseChunk(id='chunk_09236fec-800e-418a-a6ef-f33529145350', content='efficient parallelizable training (where the whole input sequence is seen ahead of time), and is switched into recurrent mode (2) for efficient autoregressive inference (where the inputs are seen one timestep at a time). Linear Time Invariance (LTI). An important property of equations (1) to (3) is that the model's dynamics are constant through time. In other words (Δ, A, B, C), and consequently (Ā, B̄) as well, are fixed for all time-steps. This property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models. Thus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental efficiency constraints, discussed in Section 3.3. However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the efficiency bottlenecks. Structure and Dimensions. Finally, we note that structured SSMs are so named because computing them efficiently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use. In this case, the A ∈ ℝ^(N×N), B ∈ ℝ^(N×1), C ∈ ℝ^(1×N) matrices can all be represented by N numbers. To operate over an input sequence x of batch size B and length', chunk_index=8, num_tokens=403, metadata={}),
ResponseChunk(id='chunk_8ad20c8d-767e-4ad7-9114-2704d5d528e6', content='L with D channels, the SSM is applied independently to each channel. Note that in this case, the total hidden state has dimension DN per input, and computing it over the sequence length requires O(BLDN) time and memory; this is the root of the fundamental efficiency bottleneck addressed in Section 3.3. General State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state. It has been used to refer to many disparate concepts in different disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman filters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning). Throughout this entire paper we use the term "SSM" to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary. SSM Architectures.', chunk_index=9, num_tokens=400, metadata={}),
ResponseChunk(id='chunk_5ceac24a-ff0d-48cc-be39-dee33847e027', content='SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines. • Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM. • H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer. • Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021). • RetNet (Y. Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions. • RWKV (B. Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)). Its main "WKV" mechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B).', chunk_index=10, num_tokens=378, metadata={}),
ResponseChunk(id='chunk_8d8a9f8e-af47-4100-8e98-dbbe233e77fd', content='We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them efficiently. We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5). # 3.1 Motivation: Selection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state. In fact, we can view the tradeoffs of popular sequence models from this point of view. For example, attention is both effective and inefficient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are efficient because they have a finite state, implying constant-time inference and linear-time training. However, their effectiveness is limited by how well this state has compressed the context. To understand this principle, we focus on two running examples of synthetic tasks (Figure 2). • The Selective Copying task', chunk_index=11, num_tokens=399, metadata={}),
ResponseChunk(id='chunk_a4ba2fcc-d548-4ec7-b16e-10975f770023', content='modifies the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and filter out the irrelevant ones (white). • The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black). These tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (Ā, B̄) transitions in (2)) cannot let them select the correct information from their context, or affect the hidden state passed along the sequence in an input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have difficulty with the Selective Copying task because of lack of content-awareness (Figure 2). More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels. In summary, the efficiency vs.
effectiveness tradeoff of sequence models is characterized by how well they compress their state: efficient models must have a small state, while effective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: the context-aware ability to focus on or filter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).', chunk_index=12, num_tokens=402, metadata={}), ResponseChunk(id='chunk_5c91c282-cf3b-4ec9-9496-a87aedf950b8', content='# Improving SSMs with Selection

One method of incorporating a selection mechanism into models is by letting their parameters that affect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the convolution kernel of a CNN) be input-dependent.

[Figure 2 diagram: the Copying, Selective Copying, and Induction Heads tasks; the standard Copying task is perfectly solved by LTI (e.g. convolutional) models that do not need to look at the actual inputs.]

Figure 2: (Left) The standard version of the Copying task involves constant spacing between input and output elements and is easily solved by time-invariant models such as linear recurrences and global convolutions. (Right Top) The Selective Copying task has random spacing in between inputs and requires time-varying models that can selectively remember or ignore inputs depending on their content. (Right Bottom) The Induction Heads task is an example of associative recall that requires retrieving an answer based on context, a key ability for LLMs.

Algorithm 1 SSM (S4)
Input: x : (B, L, D)
Output: y : (B, L, D)
1: A : (D, N) ← Parameter    ▷ Represents structured N × N matrix
2: B : (D, N) ← Parameter
3: C : (D, N) ← Parameter', chunk_index=13, num_tokens=443, metadata={}), ResponseChunk(id='chunk_fbab53d1-de01-4cb4-8777-bbf4af2eac2c', content='4: Δ : (D) ← τ_Δ(Parameter)
5: Ā, B̄ : (D, N) ← discretize(Δ, A, B)
6: y ← SSM(Ā, B̄, C)(x)    ▷ Time-invariant: recurrence or convolution
7: return y

Algorithm 2 SSM + Selection (S6)
Input: x : (B, L, D)
Output: y : (B, L, D)
1: A : (D, N) ← Parameter    ▷ Represents structured N × N matrix
2: B : (B, L, N) ← s_B(x)
3: C : (B, L, N) ← s_C(x)
4: Δ : (B, L, D) ← τ_Δ(Parameter + s_Δ(x))
5: Ā, B̄ : (B, L, D, N) ← discretize(Δ, A, B)
6: y ← SSM(Ā, B̄, C)(x)    ▷ Time-varying: recurrence (scan) only
7: return y

Algorithms 1 and 2 illustrate the main selection mechanism that we use. The main difference is simply making several parameters Δ, B, C functions of the input, along with the associated changes to tensor shapes throughout. In particular, we highlight that these parameters now have a length dimension L, meaning that the model has changed from time-invariant to time-varying. (Note that shape annotations were described in Section 2).
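As a concrete companion to Algorithm 2, the following is a hedged, purely sequential NumPy reference sketch of a selective SSM (real-valued, unbatched). The simplified Euler-style discretization of B and all of the variable names are illustrative assumptions, not the paper's fused implementation.

```python
import numpy as np

def softplus(z):
    return np.log1p(np.exp(z))

def selective_ssm_scan(x, A, W_B, W_C, w_dt, dt_bias):
    """x: (L, D). A: (D, N) diagonal (negative entries). W_B, W_C: (D, N). w_dt: (D, 1). dt_bias: (D,).
    B, C and the step size dt are computed from the input, so the recurrence is time-varying."""
    L, D = x.shape
    B = x @ W_B                                   # (L, N): input-dependent B_t
    C = x @ W_C                                   # (L, N): input-dependent C_t
    dt = softplus(x @ w_dt + dt_bias)             # (L, D): input-dependent step size Delta_t
    h = np.zeros((D, A.shape[1]))
    y = np.zeros((L, D))
    for t in range(L):
        A_bar = np.exp(dt[t][:, None] * A)        # ZOH discretization of A
        B_bar = dt[t][:, None] * B[t][None, :]    # simplified (Euler-style) discretization of B
        h = A_bar * h + B_bar * x[t][:, None]     # time-varying recurrence (scan)
        y[t] = h @ C[t]                           # project the state through C_t
    return y

D, N, L = 8, 4, 64
y = selective_ssm_scan(np.random.randn(L, D), -np.random.rand(D, N),
                       np.random.randn(D, N) / N, np.random.randn(D, N) / N,
                       np.random.randn(D, 1), np.zeros(D))
```

Because A_bar and B_bar now differ at every timestep, the convolution kernel of (3a) no longer exists, which is exactly the efficiency question taken up next.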
This loses the equivalence to convolutions (3) with implications for its efficiency, discussed next.', chunk_index=14, num_tokens=459, metadata={}), ResponseChunk(id='chunk_0de1bc5b-9c67-469f-b179-5b7da980d2ba', content='We specifically choose s_B(x) = Linear_N(x), s_C(x) = Linear_N(x), s_Δ(x) = Broadcast_D(Linear_1(x)), and τ_Δ = softplus, where Linear_d is a parameterized projection to dimension d. The choice of s_Δ and τ_Δ is due to a connection to RNN gating mechanisms explained in Section 3.5.

# 3.3 Efficient Implementation of Selective SSMs

Hardware-friendly architectures such as convolutions (Krizhevsky, Sutskever, and Hinton 2012) and Transformers (Vaswani et al. 2017) enjoy widespread application. Here we aim to make selective SSMs efficient on modern hardware (GPU) as well. The selection mechanism is quite natural, and earlier works attempted to incorporate special cases of selection, such as letting Δ vary over time in recurrent SSMs (Gu, Dao, et al. 2020). However, as previously mentioned, a core limitation in the usage of SSMs is their computational efficiency, which was why S4 and all derivatives used LTI (non-selective) models, most commonly in the form of global convolutions.

# 3.3.1 Motivation of Prior Models

We first revisit this motivation and overview our approach to overcome limitations of prior methods.

• At a high level, recurrent models such as SSMs always balance a tradeoff', chunk_index=15, num_tokens=425, metadata={}), ResponseChunk(id='chunk_4699a654-c5d7-47b1-ad2f-67a40d4339ad', content='between expressivity and speed: as discussed in Section 3.1, models with larger hidden state dimension should be more effective but slower. Thus we want to maximize hidden state dimension without paying speed and memory costs.

• Note that the recurrent mode is more flexible than the convolution mode, since the latter (3) is derived from expanding the former (2) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021). However, this would require computing and materializing the latent state h with shape (B, L, D, N), much larger (by a factor of N, the SSM state dimension) than the input x and output y of shape (B, L, D). Thus the more efficient convolution mode was introduced, which could bypass the state computation and materializes a convolution kernel (3a) of only (B, L, D).

• Prior LTI SSMs leverage the dual recurrent-convolutional forms to increase the effective state dimension by a factor of N (≈ 10-100), much larger than traditional RNNs, without efficiency penalties.

# 3.3.2 Overview of Selective Scan: Hardware-Aware State Expansion

The selection mechanism is designed to overcome the limitations of LTI models; at the same time, we therefore need to revisit the computation problem of SSMs. We address this with three classical techniques: kernel fusion, parallel scan, and recomputation. We make two main observations:

• The naive recurrent computation uses O(BLDN) FLOPs while the convolutional computation uses O(BLD log(L))', chunk_index=16, num_tokens=399, metadata={}), ResponseChunk(id='chunk_1728c094-dcef-4e68-adfe-767c5646f9a8', content='FLOPs, and the former has a lower constant factor. Thus for long sequences and not-too-large state dimension N, the recurrent mode can actually use fewer FLOPs.

• The two challenges are the sequential nature of recurrence, and the large memory usage.
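A rough back-of-the-envelope comparison of the two operation counts just mentioned; constant factors and the memory/IO costs addressed next are deliberately ignored, and the specific sizes are illustrative.

```python
import math

def recurrent_flops(B, L, D, N):
    return B * L * D * N                    # naive recurrent mode: O(BLDN)

def convolutional_flops(B, L, D):
    return B * L * D * math.log2(L)         # FFT-based convolutional mode: O(BLD log L)

for L in (2**10, 2**14, 2**20):
    r = recurrent_flops(1, L, 1024, 16)
    c = convolutional_flops(1, L, 1024)
    print(f"L={L:>8}: recurrent/convolutional FLOP ratio = {r / c:.2f}")

# The ratio N / log2(L) shrinks as L grows (1.6 -> 1.14 -> 0.8 here), so for long
# sequences and a small state dimension N the recurrent mode needs comparable or
# fewer FLOPs, even before accounting for its lower constant factor.
```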
To address the latter, just like the convolutional mode, we can attempt to not actually materialize the full state h. The main idea is to leverage properties of modern accelerators (GPUs) to materialize the state h only in more efficient levels of the memory hierarchy. In particular, most operations (except matrix multiplication) are bounded by memory bandwidth (Dao, Fu, Ermon, et al. 2022; Ivanov et al. 2021; Williams, Waterman, and Patterson 2009). This includes our scan operation, and we use kernel fusion to reduce the amount of memory IOs, leading to a significant speedup compared to a standard implementation.

Concretely, instead of preparing the scan input (Ā, B̄) of size (B, L, D, N) in GPU HBM (high-bandwidth memory), we load the SSM parameters (Δ, A, B, C) directly from slow HBM to fast SRAM, perform the discretization and recurrence in SRAM, and then write the final outputs of size (B, L, D) back to HBM.

To avoid the sequential recurrence, we observe that despite not being linear it can still be parallelized with a work-efficient parallel scan algorithm (Blelloch 1990; Martin and Cundy 2018; Smith, Warrington, and Linderman 2023).

Finally, we must also avoid saving the intermediate states, which are necessary for backpropagation.', chunk_index=17, num_tokens=389, metadata={}), ResponseChunk(id='chunk_f93dfb71-4ffe-4f5e-a8a9-9acd05b94816', content='We carefully apply the classic technique of recomputation to reduce the memory requirements: the intermediate states are not stored but recomputed in the backward pass when the inputs are loaded from HBM to SRAM. As a result, the fused selective scan layer has the same memory requirements as an optimized transformer implementation with FlashAttention. Details of the fused kernel and recomputation are in Appendix D. The full Selective SSM layer and algorithm is illustrated in Figure 1.

# 3.4 A Simplified SSM Architecture

As with structured SSMs, selective SSMs are standalone sequence transformations that can be flexibly incorporated into neural networks. The H3 architecture is the basis for the most well-known SSM architectures (Section 2), which are generally comprised of a block inspired by linear attention interleaved with an MLP (multi-layer perceptron) block. We simplify this architecture by combining these two components into one, which is stacked homogenously (Figure 3). This is inspired by the gated attention unit (GAU) (Hua et al. 2022), which did something similar for attention.

This architecture involves expanding the model dimension D by a controllable expansion factor E. For each block, most of the parameters (3ED^2) are in the linear projections (2ED^2 for input projections, ED^2 for output projection) while the inner SSM contributes less. The number of SSM parameters (projections for Δ, B, C, and the matrix A) are much smaller in comparison.

[Figure 3 diagram: H3 block, Gated MLP block, and the combined Mamba block, built from linear projections, a sequence transformation (SSM), and nonlinearities (activation or multiplication).]

Figure 3: (Architecture.) Our simplified block design combines the H3 block, which is the basis of most SSM architectures, with the ubiquitous MLP block of modern neural networks. Instead of interleaving these two blocks, we simply repeat the Mamba block homogenously.', chunk_index=18, num_tokens=400, metadata={}), ResponseChunk(id='chunk_893ba0c3-3068-4d12-a37b-73220f752664', content='Compared to the H3 block, Mamba replaces the first multiplicative gate with an activation function. Compared to the MLP block, Mamba adds an SSM to the main branch. For σ we use the SiLU / Swish activation (Hendrycks and Gimpel 2016; Ramachandran, Zoph, and Quoc V Le 2017).
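A hedged PyTorch sketch of a block in this simplified style: one input projection produces a main branch and a gate branch, the main branch goes through an activation and a stand-in for the selective SSM, and the gated result is projected back down. The layer names, the norm/residual placement, and the omission of the local convolution are illustrative choices, not the reference implementation.

```python
import torch
from torch import nn

class GatedSSMBlock(nn.Module):
    """Illustrative block in the spirit of Figure 3: an H3-style SSM branch merged
    with a gated-MLP-style gate, repeated homogeneously with norm and residual."""
    def __init__(self, d_model: int, expand: int = 2, ssm: nn.Module = None):
        super().__init__()
        d_inner = expand * d_model
        self.norm = nn.LayerNorm(d_model)
        self.in_proj = nn.Linear(d_model, 2 * d_inner, bias=False)   # ~2*E*D^2 parameters
        self.act = nn.SiLU()
        self.ssm = ssm if ssm is not None else nn.Identity()         # stand-in for the selective SSM (S6)
        self.out_proj = nn.Linear(d_inner, d_model, bias=False)      # ~E*D^2 parameters

    def forward(self, x):                          # x: (batch, length, d_model)
        u, z = self.in_proj(self.norm(x)).chunk(2, dim=-1)
        u = self.ssm(self.act(u))                  # main branch: activation, then sequence mixing
        return x + self.out_proj(u * self.act(z))  # gate with SiLU(z), project back, add residual

block = GatedSSMBlock(d_model=64)
out = block(torch.randn(2, 128, 64))               # shape (2, 128, 64)
```

With expand = 2 the projections of one such block contribute roughly 3ED^2 = 6D^2 parameters, so two blocks are in the same ballpark as the 12D^2 of a Transformer's MHA + MLP pair discussed below.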
We repeat this block, interleaved with standard normalization and residual connections, to form the Mamba architecture. We always fix to E = 2 in our experiments and use two stacks of the block to match the 12D^2 parameters of a Transformer's interleaved MHA (multi-head attention) and MLP blocks. We use the SiLU / Swish activation function (Hendrycks and Gimpel 2016; Ramachandran, Zoph, and Quoc V Le 2017), motivated so that the Gated MLP becomes the popular "SwiGLU" variant (Chowdhery et al. 2023; Shazeer 2020; Touvron et al. 2023). Finally, we additionally use an optional normalization layer (we choose LayerNorm (J. L. Ba, Kiros, and Hinton 2016)), motivated by RetNet's usage of a normalization layer in a similar location (Y. Sun et al. 2023).

# 3.5 Properties of Selection Mechanisms

The selection mechanism is a broader concept that can be applied in different ways, such as to more traditional RNNs or CNNs, to different parameters (e.g. A in Algorithm 2), or using different transformations s(x).

# 3.5.1 Connection to Gating Mechanisms', chunk_index=19, num_tokens=389, metadata={}), ResponseChunk(id='chunk_4b405e17-b11d-4f93-afc8-0e551497fa21', content='We highlight the most important connection: the classical gating mechanism of RNNs is an instance of our selection mechanism for SSMs. We note that the connection between RNN gating and the discretization of continuous-time systems is well established (Funahashi and Nakamura 1993; Tallec and Ollivier 2018). In fact, Theorem 1 is an improvement of Gu, Johnson, Goel, et al. (2021, Lemma 3.1) generalizing to the ZOH discretization and input-dependent gates (proof in Appendix C). More broadly, Δ in SSMs can be seen to play a generalized role of the RNN gating mechanism. In line with prior work, we adopt the view that discretization of SSMs is the principled foundation of heuristic gating mechanisms.

Theorem 1. When N = 1, A = −1, B = 1, s_Δ = Linear(x), and τ_Δ = softplus, then the selective SSM recurrence (Algorithm 2) takes the form

g_t = σ(Linear(x_t))
h_t = (1 − g_t) h_{t−1} + g_t x_t.    (5)

As mentioned in Section 3.2, our specific choices of s_Δ, τ_Δ are from this connection. In particular, note that if a given input x_t should be completely ignored (as necessary in the synthetic tasks), all D channels should ignore it, and so we project the input down to 1 dimension before repeating/broadcasting with Δ.

# Interpretation of Selection Mechanisms

We elaborate on two particular mechanistic effects of selection.

Variable Spacing.', chunk_index=20, num_tokens=424, metadata={}), ResponseChunk(id='chunk_89bf1063-367d-4584-b97e-1996da35de6c', content='Selectivity allows filtering out irrelevant noise tokens that may occur between inputs of interest. This is exemplified by the Selective Copying task, but occurs ubiquitously in common data modalities, particularly for discrete data; for example, the presence of language fillers such as "um". This property arises because the model can mechanistically filter out any particular input x_t, for example in the gated RNN case (Theorem 1) when g_t → 0.

Filtering Context. It has been empirically observed that many sequence models do not improve with longer context (F. Shi et al. 2023), despite the principle that more context should lead to strictly better performance.
An explanation is that many sequence models cannot eï¬ ectively ignore irrelevant context when necessary; an intuitive example are global convolutions (and general LTI models). On the other hand, selective models can simply reset their state at any time to remove extraneous history, and thus their performance in principle improves monotonicly with context length (e.g. Section 4.3.2). In settings where multiple independent sequences are stitched together, Transformers Boundary Resetting. can keep them separate by instantiating a particular attention mask, while LTI models will bleed information between the sequences. Selective SSMs can also reset their state at boundaries (e.g. â ð ¡ â â or Theorem 1 when ð ð ¡ â 1). These settings may occur artiï¬ cially (e.g. packing documents together to improve hardware utilization) or naturally (e.g. episode boundaries in reinforcement learning (Lu et al. 2023)). Additionally, we elaborate on eï¬ ects of each selective parameter. In general, â controls the balance between how much to focus or ignore the current input Interpretation of â . ð ¥ð ¡.', chunk_index=21, num_tokens=398, metadata={}), ResponseChunk(id='chunk_f686b76f-0841-4371-91c5-d183c52a10c8', content='It generalizes RNN gates (e.g. ð ð ¡ in Theorem 1), mechanically, a large â resets the state â and focuses on the current input ð ¥, while a small â persists the state and ignores the current input. SSMs (1)-(2) can be interpreted as a continuous system discretized by a timestep â , and in this context the intuition is that large â â â represents the system focusing on the current input for longer (thus â selectingâ it and forgetting its current state) while a small â â 0 represents a transient input that is ignored. Interpretation of A. We remark that while the A parameter could also be selective, it ultimately aï¬ ects the model only through its interaction with â via A = exp(â A) (the discretization (4)). Thus selectivity in â is enough to ensure selectivity in (A, B), and is the main source of improvement. We hypothesize that making A selective in addition to (or instead of) â would have similar performance, and leave it out for simplicity. Interpretation of B and C. As discussed in Section 3.1, the most important property of selectivity is ï¬ ltering out irrelevant information so that a sequence modelâ s context can be compressed into an eï¬ cient state. In an SSM, modifying B and C to be selective allows ï¬ ner-grained control over whether to let an input ð ¥ð ¡ into the state â ð ¡ or the state into the output ð ¦ð ¡. These can be interpreted as allowing the model to modulate the recurrent dynamics based on content (input) and context (hidden states) respectively. 3.6 Additional Model Details Real vs. Complex. Most prior SSMs use complex numbers in their state â', chunk_index=22, num_tokens=378, metadata={}), ResponseChunk(id='chunk_38cb6eb1-5a0d-426c-8929-d4d47e251a20', content=', which is necessary for strong performance on many tasks (Gu, Goel, and Ré 2022). However, it has been empirically observed that completely real-valued SSMs seem to work ï¬ ne, and possibly even better, in some settings (Ma et al. 2023). We use real values as the default, which work well for all but one of our tasks; we hypothesize that the complex-real tradeoï¬ is related to the continuous-discrete spectrum in data modalities, where complex numbers are helpful for continuous modalities (e.g. audio, video) but not discrete (e.g. text, DNA). 9 Initialization. 
Most prior SSMs also suggest special initializations, particularly in the complex-valued case, which can help in several settings such as low-data regimes. Our default initialization for the complex case is S4D-Lin and for the real case is S4D-Real (Gu, Gupta, et al. 2022), which is based on the HIPPO theory (Gu, Dao, et al. 2020). These deï¬ ne the ð -th element of A as â 1â 2 + ð ð and â (ð + 1) respectively. However, we expect many initializations to work ï¬ ne, particularly in the large-data and real-valued SSM regimes; some ablations are considered in Section 4.6. Parameterization of â . We deï¬ ned the selective adjustment to â as ð â (ð ¥) = ð ¡ð ð ð ºð ½ð ¼ð ºð ð ð ·(ð «ð ð ð ¾ð ºð 1(ð ¥)), which was motivated by the mechanics of â (Section 3.5). We observe that it can be generalized from dimension 1 to a larger dimension ð . We set this to be a small fraction of ð', chunk_index=23, num_tokens=414, metadata={}), ResponseChunk(id='chunk_8e544af7-4819-4b08-ba38-abc441903d30', content='³, which uses a negligible number of parameters compared to the main Linear projections in the block. We additionally note that the broadcasting operation can instead be viewed as another Linear projection, initialized to a speciï¬ c pattern of 1â s and 0â s; if this projection is trainable, this leads to the alternative ð â (ð ¥) = ð «ð ð ð ¾ð ºð ð ·(ð «ð ð ð ¾ð ºð ð (ð ¥)), which can be viewed as a low-rank projection. In our experiments, the â parameter (which can be viewed as a bias term) is initialized to ð â 1 â following prior work on SSMs (Gu, Johnson, Timalsina, et al. 2023). Remark 3.1. For brevity in our experimental results, we sometimes abbreviate selective SSMs as S6 models, because they are S4 models with a selection mechanism and computed with a scan. # 4 Empirical Evaluation In Section 4.1 we test Mambaâ s ability to solve the two synthetic tasks motivated in Section 3.1. We then evaluate on three domains, each evaluated on autoregressive pretraining as well as downstream tasks. Section 4.2: language model pretraining (scaling laws), and zero-shot downstream evaluation. Section 4.3: DNA sequence pretraining, and ï¬ ne-tuning on a long-sequence classiï¬ cation task. Section 4.4: audio waveform pretraining, and the quality of autoregressively generated speech clips. Finally, Section 4.5 shows Mambaâ s computational eï¬ ciency at both training and inference time, and Section 4.6 ablates various components of the architecture and selective SSMs. # 4.1 Synthetic Tasks', chunk_index=24, num_tokens=394, metadata={}), ResponseChunk(id='chunk_33ab6c24-e5dc-45e7-ac96-5da05263df3b', content='Full experiment details for these tasks including task details and training protocol are in Appendix E.1. # 4.1.1 Selective Copying The Copying task is one of the most well-studied synthetic tasks for sequence modeling, originally designed to test the memorization abilities of recurrent models. As discussed in Section 3.1, LTI SSMs (linear recurrences and global convolutions) can easily solve this task by only keeping track of time instead of reasoning about the data; for example, by constructing a convolution kernel of exactly the right length (Figure 2). This was explicitly validated in earlier work on global convolutions (Romero et al. 2021). The Selective Copying task prevents this shortcut by randomizing the spacing between tokens. Note that this task has been introduced before as the Denoising task (Jing et al. 2019). 
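To make the task setup concrete, here is a toy, hedged data-generation sketch for a Selective Copying-style batch; the token-id convention, spacing, and output format are illustrative assumptions rather than the paper's exact protocol.

```python
import numpy as np

def selective_copying_batch(batch, seq_len=256, n_memorize=16, vocab=16, seed=0):
    """Toy Selective Copying data: n_memorize content tokens are scattered at random
    positions among noise/pad tokens, and the model must output them in order.
    Token ids: 0 = noise/pad, 1..vocab = content."""
    rng = np.random.default_rng(seed)
    inputs = np.zeros((batch, seq_len), dtype=np.int64)
    targets = np.zeros((batch, n_memorize), dtype=np.int64)
    for b in range(batch):
        positions = np.sort(rng.choice(seq_len, size=n_memorize, replace=False))
        tokens = rng.integers(1, vocab + 1, size=n_memorize)
        inputs[b, positions] = tokens   # random spacing: content-aware filtering is required
        targets[b] = tokens             # expected output: the content tokens, in order
    return inputs, targets

x, y = selective_copying_batch(batch=4)
```

Because the positions are resampled per example, a fixed-length convolution kernel cannot line the inputs up with the outputs, which is the shortcut the vanilla Copying task allows.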
Note that many previous works argue that adding architecture gating (multiplicative interactions) can endow models with "data-dependence" and solve related tasks (Dao, Fu, Saab, et al. 2023; Poli et al. 2023). However, we find this explanation insufficient intuitively because such gating does not interact along the sequence axis, and cannot affect the spacing between tokens. In particular architecture gating is not an instance of a selection mechanism (Appendix A). Table 1 confirms that gated architectures such as H3 and Mamba only partially improve performance, while the selection mechanism (modifying S4 to S6) easily solves this task, particularly when combined with these more powerful architectures.

Model  Arch.  Layer  Acc.', chunk_index=25, num_tokens=347, metadata={}), ResponseChunk(id='chunk_de2dd66c-1dbf-46c5-ab76-4cfa53d4f653', content="S4     No gate  S4     18.3
-      No gate  S6     97.0
H3     H3       S4     57.0
Hyena  H3       Hyena  30.1
-      H3       S6     99.7
-      Mamba    S4     56.4
-      Mamba    Hyena  28.4
Mamba  Mamba    S6     99.8

[Table 2 plot: Induction Heads extrapolation accuracy vs. test sequence length; legend includes MHA-Absolute, MHA-RoPE, MHA-xPos, H3, Hyena, and a random baseline.]

Table 1: (Selective Copying.) Accuracy for combinations of architectures and inner sequence layers.

Table 2: (Induction Heads.) Models are trained on sequence length 2^8 = 256, and tested on increasing sequence lengths of 2^6 = 64 up to 2^20 = 1048576. Full numbers in Table 11.

# 4.1.2 Induction Heads

Induction heads (Olsson et al. 2022) is a simple task from the mechanistic interpretability lens (Elhage et al. 2021) that is surprisingly predictive of the in-context learning ability of LLMs. It requires models to perform associative recall and copy: for example, if the model has seen a bigram such as "Harry Potter" in the sequence, then the next time "Harry" appears in the same sequence, the model should be able to predict "Potter" by copying from history.

Dataset.", chunk_index=26, num_tokens=356, metadata={}), ResponseChunk(id='chunk_ddc57799-8644-4065-99b0-e91fc34e5f61', content='We train a 2-layer model on the induction heads task at sequence length 256, with a vocab size of 16, which is comparable to prior work on this task (Dao, Fu, Saab, et al. 2023) but with longer sequences. We additionally investigate generalization and extrapolation abilities by evaluating on a range of sequence lengths from 2^6 = 64 up to 2^20 = 1048576 at test time.

Models. Following established work on induction heads, we use 2-layer models, which allows attention to mechanistically solve the induction heads task (Olsson et al. 2022). We test both multi-head attention (8 heads, with various positional encodings) and SSM variants. We use a model dimension D of 64 for Mamba and 128 for the other models.

Results. Table 2 shows that Mamba (or more precisely, its selective SSM layer) has the ability to solve the task perfectly because of its ability to selectively remember the relevant token while ignoring everything else in between. It generalizes perfectly to million-length sequences, or 4000× longer than it saw during training, while no other method goes beyond 2×. Out of positional encoding variants for attention models, xPos (which was designed for length extrapolation) is slightly better than the others; also note that all attention models were only tested up to sequence length 2^14 = 16384 due to memory limitations. Out of other SSMs, H3 and Hyena are similar, contrary to the findings in Poli et al. (2023).
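To illustrate the associative-recall structure of the task, here is a toy, hedged example generator; the reserved query token, placement rule, and vocabulary layout are invented for illustration (see Olsson et al. 2022 for the real setup).

```python
import numpy as np

def induction_heads_example(seq_len=256, vocab=16, seed=0):
    """Toy induction-heads example: a special query token is followed once by a value
    token earlier in the sequence; when the query reappears at the end, the correct
    next token is that value."""
    rng = np.random.default_rng(seed)
    seq = rng.integers(2, vocab, size=seq_len)    # background tokens (ids 2..vocab-1)
    query, value = 0, rng.integers(2, vocab)      # id 0 reserved as the query token
    pos = rng.integers(1, seq_len - 2)
    seq[pos], seq[pos + 1] = query, value         # plant the (query, value) bigram
    seq[-1] = query                               # repeat the query at the end...
    target = value                                # ...so the model should recall the value
    return seq, target

seq, target = induction_heads_example()
```

Extrapolation is tested simply by increasing seq_len at evaluation time while keeping the same trained model.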
# 4.2 Language Modeling We evaluate the Mamba architecture on standard autoregressive language modeling against other architectures, on both pretraining metrics (perplexity) and zero-shot evaluations. We set the model sizes (depth and width) to mirror GPT3 speciï¬ cations.', chunk_index=27, num_tokens=391, metadata={}), ResponseChunk(id='chunk_646c9011-91ba-4e39-a420-527b9e1de11f', content='We use the Pile dataset (L. Gao, Biderman, et al. 2020), and follow the training recipe described in Brown et al. (2020). All training details are in Appendix E.2. # 4.2.1 Scaling Laws For baselines, we compare against the standard Transformer architecture (GPT3 architecture), as well as the strongest Transformer recipe we know of (here referred to as Transformer++), based on the PaLM and LLaMa 11 Scaling Laws on The Pile (Sequence Length 2048) Scaling Laws on The Pile (Sequence Length 8192) 2x10\" 2x10 Hyena Hyena RWKV s RWKV â â Transformer Fy â â Transformer fd RetNet 2 â â RetNet 3+ 2 â HH wd â = Transformers |, | â â Transformert+ â â Mamba zg â â Mamba 2 2 S a 6x 10° 1 7 6x 10° 1 7 10\"? 102 10 107° FLOPs (log scale) FLOPs (log scale) s 8 fd 2 2 > 3 2 2 S a Figure 4: (Scaling Laws.) Models of size â 125ð to â 1.3ð µ parameters, trained on the Pile. Mamba scales better than all other attention-free models and is the first to match the performance of a very strong â Transformer++â recipe that has now become standard, particularly as the sequence length grows. architectures (e.g. rotary embedding, SwiGLU MLP, RMSNorm instead of LayerNorm, no linear bias, and higher learning rates). We also compare against other recent subquadratic architectures (Figure 4). All model details are in Appendix E.2.', chunk_index=28, num_tokens=388, metadata={}), ResponseChunk(id='chunk_2860ef78-1ff6-4861-ba2d-a559f5741111', content='Figure 4 shows scaling laws under the standard Chinchilla (Hoï¬ mann et al. 2022) protocol, on models from â 125ð to â 1.3ð µ parameters. Mamba is the ï¬ rst attention-free model to match the performance of a very strong Transformer recipe (Transformer++) that has now become standard, particularly as the sequence length grows. We note that full results on context length 8k are missing for the RWKV and RetNet baselines, prior strong recurrent models that can also be interpreted as SSMs, due to a lack of eï¬ cient implementation leading to out-of-memory or unrealistic computation requirements. # 4.2.2 Downstream Evaluations Table 3 shows the performance of Mamba on a range of popular downstream zero-shot evaluation tasks. We compare against the most well-known open source models at these sizes, most importantly Pythia (Biderman et al. 2023) and RWKV (B. Peng et al. 2023) which were trained with the same tokenizer, dataset, and training length (300B tokens) as our models. (Note that Mamba and Pythia are trained with context length 2048, while RWKV was trained with context length 1024.) # 4.3 DNA Modeling Motivated by the success of large language models, there has been recent exploration into using the foundation model paradigm for genomics. DNA has been likened to language in that it consists of sequences of discrete tokens with a ï¬ nite vocab. It is also known for requiring long-range dependencies to model (Avsec et al. 2021). We investigate Mamba as a FM backbone for pretraining and ï¬ ne-tuning in the same setting as recent works on long-sequence models for DNA (Nguyen, Poli, et al. 
2023).', chunk_index=29, num_tokens=388, metadata={}), ResponseChunk(id='chunk_39e13362-f0ec-45ba-8c73-2cbcd1513ea4', content='In particular, we focus on two explorations of scaling laws across model size and sequence length (Figure 5), and a diï¬ cult downstream synthetic classiï¬ cation task requiring long context (Figure 6). For pretraining, we largely follow a standard causal language modeling (next token prediction) setup for the training and model details (see also Appendix E.2). For the dataset, we largely follow the setup of HyenaDNA (Nguyen, Poli, et al. 2023), which uses the HG38 dataset for pretraining consisting of a single human genome with about 4.5 billion tokens (DNA base pairs) in the training split. # 4.3.1 Scaling: Model Size In this experiment, we investigate the scaling properties of genomics foundation models with various model backbones (Figure 5 Left). Training. To advantage the baselines, we train on a short sequence length of 1024; as shown in Section 4.3.2, we expect results to favor Mamba even more at longer sequence lengths. We ï¬ x a global batch size of 1024, for a 12 Table 3: (Zero-shot Evaluations.) Best results for each size in bold. We compare against open source LMs with various tokenizers, trained for up to 300B tokens. Pile refers to the validation split, comparing only against models trained on the same dataset and tokenizer (GPT-NeoX-20B). For each model size, Mamba is best-in-class on every single evaluation result, and generally matches baselines at twice the model size. Model Token. Pile ppl â LAMBADA LAMBADA HellaSwag ppl â acc â acc â acc â acc â acc â acc â Hybrid H3-130M GPT2 â', chunk_index=30, num_tokens=374, metadata={}), ResponseChunk(id='chunk_45e9ec25-28a7-4dce-87f6-3769f115338c', content='Pythia-160M Mamba-130M NeoX NeoX 29.64 10.56 89.48 38.10 16.07 25.77 33.0 44.3 31.7 30.2 35.3 64.2 61.4 64.5 44.4 43.2 48.0 24.2 24.1 24.3 50.6 51.9 51.9 40.1 40.6 44.7 Hybrid H3-360M GPT2 â Pythia-410M Mamba-370M NeoX NeoX 9.95 8.28 12.58 10.84 8.14 48.0 51.4 55.6 41.5 40.6 46.5 68.1 66.9 69.5 51.4 52.1 55.1 24.7 24.6 28.0 54.1 53.8 55.3 48.0 48.2 50.0 Pythia-1B Mamba-790M NeoX NeoX 7.82 7.33 7.92 6.02 56.1 62.7 47.2 55.1 70.7 72.1 57.0 61.2 27.1 29.5 53.5 56.1 51.9 57.1 GPT-Neo 1.3B Hybrid H3-1.3B OPT-1.3B Pythia-1.4B RWKV-1.5B Mamba-1.4B GPT2 â GPT2 â â', chunk_index=31, num_tokens=389, metadata={}), ResponseChunk(id='chunk_7d8d9494-9638-4410-8fa1-9cdd0fff14bc', content='OPT 7.51 NeoX 7.70 NeoX NeoX 6.80 7.50 11.25 6.64 6.08 7.04 5.04 57.2 49.6 58.0 61.7 56.4 64.9 48.9 52.6 53.7 52.1 52.5 59.1 71.1 71.3 72.4 71.0 72.4 74.2 56.2 59.2 56.7 60.5 60.5 65.5 25.9 28.1 29.6 28.5 29.4 32.8 54.9 56.9 59.5 57.2 54.6 61.5 52.4 53.0 55.0 55.2 54.3 59.7 GPT-Neo 2.7B Hybrid H3-2.7B OPT-2.7B Pythia-2.8B RWKV-3B Mamba-2.8B GPT2 â GPT2 â â', chunk_index=32, num_tokens=263, metadata={}), ResponseChunk(id='chunk_8904f03c-6eca-47c8-8d12-2e1f638b0f70', content='OPT 6.73 NeoX 7.00 NeoX NeoX 6.22 5.63 7.92 5.12 5.04 5.24 4.23 62.2 55.7 63.6 64.7 63.9 69.2 55.8 59.7 60.6 59.3 59.6 66.1 72.1 73.3 74.8 74.0 73.7 75.2 61.1 65.6 60.8 64.1 67.8 69.7 30.2 32.3 31.3 32.9 33.1 36.3 57.6 61.4 61.0 59.7 59.6 63.5 56.5 58.0 58.7 59.1 59.6 63.3 GPT-J-6B OPT-6.7B Pythia-6.9B RWKV-7.4B GPT2 OPT NeoX NeoX â â 6.51 6.31 4.10 4.25 4.45 4.38 68.3 67.7 67.1 67.2 66.3 67.2 64.0 65.5 75.4 76.3 75.2 76.1 67.0 65.6 67.3 67.8 36.6 34.9 35.5 37.5 64.1 65.5 61.3 61.0 63.0 62.9 61.7 62.5 total of 220 â 1ð tokens per batch.', chunk_index=33, num_tokens=396, metadata={}), 
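A quick sanity check of the token budget just described (global batch size 1024 at sequence length 1024, together with the 10K gradient steps mentioned next):

```python
# Assumed values taken from the surrounding text.
batch_size, seq_len, steps = 1024, 1024, 10_000
tokens_per_batch = batch_size * seq_len        # 2**20 = 1,048,576, i.e. ~1M tokens per batch
total_tokens = tokens_per_batch * steps        # ~1.05e10, i.e. ~10B tokens in total
print(f"{tokens_per_batch:,} tokens/batch, {total_tokens:,} total tokens")
```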
ResponseChunk(id='chunk_e5a2cc22-efa5-40d2-b232-5f4bc7606d41', content='Models were trained for 10K gradient steps for a total of 10B tokens.

Results. Figure 5 (Left) shows that Mamba's pretraining perplexity improves smoothly with model size, and that Mamba scales better than both HyenaDNA and Transformer++. For example, at the largest model size of ≈ 40M parameters, the curve shows that Mamba can match the Transformer++ and HyenaDNA models with roughly 3× to 4× fewer parameters.

# 4.3.2 Scaling: Context Length

In the next DNA experiment, we investigate the scaling properties of models with respect to sequence length. We only compare the HyenaDNA and Mamba models, as quadratic attention becomes prohibitively expensive at longer sequence lengths. We pretrain models on sequence lengths 2^10 = 1024, 2^12 = 4096, 2^14 = 16384, 2^16 = 65536, 2^18 = 262144, 2^20 = 1048576. We fix a model size of 6 layers by width 128 (about 1.3M-1.4M parameters). Models were trained for 20K gradient steps for a total of ≈ 330B tokens. The longer sequence lengths used sequence length warmup similar to (Nguyen, Poli, et al. 2023).

Results. Figure 5 (Right) shows that Mamba is able to make use of longer context even up to extremely long sequences of length 1M, and its pretraining perplexity improves as the context increases. On the other hand, the HyenaDNA model gets worse with sequence length. This is intuitive from the discussion in Section 3.5 on properties of the selection mechanism. In particular, LTI models cannot selectively ignore information; from a convolutional perspective, a very long convolution kernel is aggregating all information across a long sequence, which may be very noisy.', chunk_index=34, num_tokens=395, metadata={}), ResponseChunk(id='chunk_8468bc23-f635-46fc-a000-e4c76818ba86', content='[Figure 5 plots: scaling laws on the human genome (HG38); perplexity vs. parameters (log scale) for HyenaDNA, Mamba, and Transformer++, and perplexity vs. sequence length for HyenaDNA 1.4M, Mamba 1.4M, and Mamba 7M.]

Figure 5: (DNA Scaling Laws.) Pretraining on the HG38 (human genome) dataset. (Left) Fixing short context length 2^10 = 1024 and increasing size from ≈ 200K to ≈ 40M parameters, Mamba scales better than baselines. (Right) Fixing model size and increasing sequence lengths while keeping tokens/batch and total training tokens fixed. Unlike baselines, the selection mechanism of Mamba facilitates better performance with increasing context length.

[Figure 6 plot: fine-tuning accuracy vs. sequence length on the species DNA classification task for HyenaDNA 1.4M, Mamba 1.4M, Mamba 7M, and a random baseline. Figure 7 plot: bits per byte vs. sequence length on YouTubeMix for S4+FFN (SaShiMi) and Mamba.]', chunk_index=35, num_tokens=385, metadata={}), ResponseChunk(id='chunk_e348d63c-874b-40c9-918d-65fc27fdfdf1', content='Figure 6: (Great Apes DNA Classification.) Accuracy after fine-tuning on sequences of length 2^10 = 1024 up to 2^20 = 1048576 using pretrained models of the same context length. Numerical results in Table 13. Figure 7: (Audio Pretraining.) Mamba improves performance over prior state-of-the-art (SaShiMi) in autoregressive audio modeling, while improving up to minute-long context or million-length sequences (controlling for computation).
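Since the exact warmup recipe is deferred to HyenaDNA, the following is only an illustrative sketch of what a sequence-length warmup with a fixed tokens-per-batch budget could look like; the starting length, doubling interval, and schedule shape are invented for the example.

```python
def seq_len_warmup(step, target_len, start_len=1024, double_every=1000, tokens_per_batch=2**20):
    """Hedged sketch: grow the sequence length toward target_len over training while
    shrinking the batch size so that tokens per batch stays constant."""
    seq_len = min(target_len, start_len * 2 ** (step // double_every))
    batch_size = tokens_per_batch // seq_len
    return seq_len, batch_size

for step in (0, 1000, 3000, 5000):
    print(step, seq_len_warmup(step, target_len=2**20))
```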
Note that while HyenaDNA claims to improve with longer context, their results do not control for computation time. # 4.3.3 Synthetic Species Classification We evaluate models on a downstream task of classifying between 5 diï¬ erent species by randomly sampling a contigu- ous segment of their DNA. This task is adapted from HyenaDNA, which used the species {human, lemur, mouse, pig, hippo}. We modify the task to be signiï¬ cantly more challenging by classifying between the ï¬ ve great apes species {human, chimpanzee, gorilla, orangutan, bonobo}, which are known to share 99% of their DNA. # 4.4 Audio Modeling and Generation For the audio waveform modality, we compare primarily to the SaShiMi architecture and training protocols (Goel et al. 2022). This model comprises 1. a U-Net backbone with two stages of pooling by a factor ð that doubles the model dimension ð · per stage, 2. alternating S4 and MLP blocks in each stage. We consider replacing the S4+MLP blocks with Mamba blocks. Experiment details are in Appendix E.4. # 4.4.1 Long-Context Autoregressive Pretraining', chunk_index=36, num_tokens=380, metadata={}), ResponseChunk(id='chunk_de2ea758-5b48-4495-bdee-fe9627465927', content='We evaluate pretraining quality (autoregressive next-sample prediction) on YouTubeMix (DeepSound 2017), a standard piano music dataset used by prior work consisting of 4 hours of solo piano music, sampled at a rate of 14 16000 Hz Pretraining details largely follow the standard language modeling setup (Section 4.2). Figure 7 evaluates the eï¬ ect of increasing training sequence lengths from 213 = 8192 to 220 â 106, while keeping computation ï¬ xed. (There are some slight edge cases to the way the data is curated, which may lead to kinks in the scaling curves. For example, only minute-long clips were available so the maximum sequence length is actually bounded by 60ð â 16000ð »ð § = 960000.) Both Mamba and the SaShiMi (S4+MLP) baseline improve consistently with longer context lengths; Mamba is better throughout, and the gap widens at longer lengths. The main metric is bits per byte (BPB), which is a constant factor log(2) of the standard negative log-likelihood (NLL) loss for pretraining other modalities. We note one important detail: this is the only experiment in this paper in which we switched from the real parameterization to complex (Section 3.6). We show additional ablations in Appendix E.4. # 4.4.2 Autoregressive Speech Generation SC09 is a benchmark speech generation dataset (Donahue, McAuley, and Puckette 2019; Warden 2018), consisting of 1-second clips sampled at 16000 Hz of the digits â zeroâ through â nineâ with highly variable characteristics. We largely follow the autoregressive training setup and generation protocol of Goel et al. (2022).', chunk_index=37, num_tokens=380, metadata={}), ResponseChunk(id='chunk_494c3a0e-af52-4bed-965c-1b3c6a405c32', content='Table 4 shows automated metrics of the Mamba-UNet model compared to a variety of baselines from Goel et al. (2022): WaveNet (Oord et al. 2016), SampleRNN (Mehri et al. 2017), WaveGAN (Donahue, McAuley, and Puckette 2019), Diï¬ Wave (Z. Kong et al. 2021), and SaShiMi. A small Mamba model outperforms the state-of-the-art (and much larger) GAN- and diï¬ usion- based models. A larger model parameter-matched to the baselines further improves on ï¬ delity metrics dramatically. Table 5 takes the small Mamba model and investigates combinations of diï¬ erent architectures for the outer stages and center stage. 
It shows that Mamba is consistently better than S4+MLP in the outer blocks, and Mamba > S4+MLP > MHA+MLP in the center blocks. Table 4: (SC09) Automated metrics for unconditional generation on a challenging dataset of fixed-length speech clips. (Top to Bottom) Autoregressive baselines, non-autoregressive baselines, Mamba, and dataset metrics. Table 5: (SC09 Model Ablations) Models with 6M parameters. In SaShiMiâ s U-Net backbone, there are 8 center blocks operat- ing on sequence length 1000, sandwiched on each side by 8 outer blocks on sequence length 4000, sandwiched by 8 outer blocks on sequence length 16000 (40 blocks total). The architecture of the 8 center blocks are ablated independently of the rest. Note that Transformers (MHA+MLP) were not tested in the more im- portant outer blocks because of efficiency constraints. Model Params NLL â FID â IS â', chunk_index=38, num_tokens=399, metadata={}), ResponseChunk(id='chunk_c91b9c84-3c6c-45ee-8b9d-a3392930f9f7', content='mIS â AM â SampleRNN WaveNet SaShiMi 35.0M 4.2M 5.8M 2.042 1.925 1.873 8.96 5.08 1.99 1.71 2.27 5.13 3.02 5.80 42.57 1.76 1.47 0.74 WaveGAN DiffWave + SaShiMi Mamba Mamba Train Test 19.1M 24.1M 23.0M 6.1M 24.3M - - - - - 1.852 1.860 - - 2.03 1.92 1.42 0.94 0.67 0.00 0.02 4.90 5.26 5.94 6.26 7.33 8.56 8.33 36.10 51.21 69.17 88.54 144.9 292.5 257.6 0.80 0.68 0.59 0.52 0.36 0.16 0.19 Outer Center S4+MLP MHA+MLP S4+MLP S4+MLP Mamba Mamba Mamba Mamba S4+MLP MHA+MLP S4+MLP Mamba NLL â 1.859 1.867 1.859 1.850 1.853 1.852 FID â 1.45 1.43 1.42 1.37 1.07 0.94 IS â 5.06 5.42 5.71 5.63 6.05 6.26 mIS â', chunk_index=39, num_tokens=386, metadata={}), ResponseChunk(id='chunk_b534418e-73e2-4bcd-9d8d-fa800f5ae570', content='47.03 53.54 56.51 58.23 73.34 88.54 AM â 0.70 0.65 0.64 0.62 0.55 0.52 4.5 Speed and Memory Benchmarks We benchmark the speed of the SSM scan operation (state expansion ð = 16), as well as the end-to-end inference throughput of Mamba, in Figure 8. Our eï¬ cient SSM scan is faster than the best attention implementation that we know of (FlashAttention-2 (Dao 2023)) beyond sequence length 2K, and up to 20-40à faster than a standard scan implementation in PyTorch. Mamba achieves 4-5à higher inference throughput than a Transformer of similar size, since without the KV cache it can use much higher batch sizes. For example, a Mamba-6.9B (untrained) would have higher inference throughput than a 5à smaller Transformer-1.3B. Details in Appendix E.5, which additionally includes a benchmark of memory consumption. 15 Scan vs Convolution vs Attention time (A100 80GB PCle) Inference throughput on A100 80GB (prompt length 2048) â Flashattention-2 ame ee ES 1000-1 â convolution @ 1500] mm Mamba 6.98 wwe â â Scan (PyTorch) Py mmm Transformer 6.78 100 4 â â Scan (ours) Ei % 00M 2 a tod S 1000 B us Ff = 2 500 â = pad oid r S12 1k 2k «= 4k BKK 32K GK 128k 256K 512k 1 2 Hi A 16 32 oa 128 Sequence length Batch size @ = ~ £', chunk_index=40, num_tokens=400, metadata={}), ResponseChunk(id='chunk_bfb17f5e-41c4-4a43-9a37-751af8dd56fb', content='Figure 8: (Efficiency Benchmarks.) (Left) Training: our efficient scan is 40à faster than a standard implementation. (Right) Inference: as a recurrent model, Mamba can achieve 5à higher throughput than Transformers. # 4.6 Model Ablations We perform a series of detailed ablations on components of our model, focusing on the setting of language modeling with size â 350M models at Chinchilla token counts (same setting as Figure 4). 
# 4.6.1 Architecture Table 6 investigates the eï¬ ects of the architecture (block) and its inner SSM layer (Figure 3). We ï¬ nd that â ¢ Among previous non-selective (LTI) SSMs, which are equivalent to global convolutions, performance is very similar. â ¢ Replacing the complex-valued S4 variant from previous work with a real-valued one does not aï¬ ect performance much, suggesting that (at least for LM) real-valued SSMs may be a better choice when accounting for hardware eï¬ ciency. â ¢ Replacing any of these with a selective SSM (S6) signiï¬ cantly improves performance, validating the motivation of Section 3. â ¢ The Mamba architecture performs similarly to the H3 architecture (and seems slightly better when using a selective layer). We also investigate interleaving the Mamba block with other blocks such as MLP (a traditional architecture) MHA (a hybrid attention architecture) in Appendix E.2.2. # 4.6.2 Selective SSM Table 7 ablates the selective SSM layer by considering diï¬ erent combinations of selective â , B, and C param- eters (Algorithm 2), showing that â is the most important parameter due to its connection to RNN gating (Theorem 1). Table 8 considers diï¬', chunk_index=41, num_tokens=401, metadata={}), ResponseChunk(id='chunk_1b8e2d80-089b-4102-94f4-2be87741442e', content='erent initializations of the SSM, which have been shown to make a large diï¬ erence in some data modalities and settings (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022). On language modeling, we ï¬ nd that simpler real-valued diagonal initializations (S4D-Real, row 3) instead of more standard complex-valued parameterizations (S4D-Lin, row 1) perform better. Random initializations also work well, consistent with ï¬ ndings from prior work (Mehta et al. 2023). Table 9 and Table 10 consider varying the dimension of the â and (B, C) projections respectively. Changing them from static to selective provides the most beneï¬ t, while increasing the dimensions further generally improves performance modestly with a small increase in parameter count. Of particular note is the dramatic improvement of the selective SSM when the state size ð is increased, with over a 1.0 perplexity improvement for a cost of only 1% additional parameters. This validates our core motivation in Sections 3.1 and 3.3. 16 Table 6: (Ablations: Architecture and SSM layer.) The Mamba block performs similarly to H3 while being simpler. In the inner layer, there is little difference among different parameterizations of LTI models, while selective SSMs (S6) provide a large improvement. More specifically, the S4 (real) variant is S4D-Real and the S4 (complex) variant is S4D-Lin. Model Arch. SSM Layer Perplexity Model Arch.', chunk_index=42, num_tokens=351, metadata={}), ResponseChunk(id='chunk_da0ab88b-5868-4c47-a739-a347abf64ee8', content='SSM Layer Perplexity Hyena H3 H3 H3 H3 - H3 - Hyena S4 (complex) S4 (real) S6 10.24 10.30 10.34 8.95 Mamba Hyena - Mamba - - Mamba Mamba Mamba S4 (complex) S4 (real) S6 10.75 10.54 10.56 8.69 Table 7: (Ablations: Selective parameters.) â is the most im- portant parameter (Theorem 1), but using multiple selective pa- rameters together synergizes. Table 8: (Ablations: Parameterization of A.) The more standard initializations based on S4D-Lin (Gu, Gupta, et al. 2022) perform worse than S4D-Real or a random initializa- tion, when the SSM is selective. 
Selective A Selective B SelectiveC Perplexity \\\\Qx& xX Qk *®QX Qk Q&X 1093 10.15 9.98 9.81 8.71 Að Initialization Að = â 1 Complex Real Að = â 1â 2 Að = â (ð + 1) Real Að â ¼ exp(ð ©(0, 1)) Real Field + ð ð 2 9.16 8.85 8.71 8.71 Table 9: (Ablations: Expressivity of â .) The selection mechanism of â constructs it with a projection of the input. Project- ing it even to dim. 1 provides a large in- crease in performance; increasing it fur- ther provides further improvements at the cost of a modest increase in parameters. State size fixed to ð = 16. Size of â', chunk_index=43, num_tokens=394, metadata={}), ResponseChunk(id='chunk_4fa7d4a8-96be-43c7-b533-323e8114b1d5', content='proj. - 1 2 4 8 16 32 64 Params (M) 358.9 359.1 359.3 359.7 360.5 362.1 365.2 371.5 9.12 8.97 8.97 8.91 8.83 8.84 8.80 8.71 # Perplexity Table 10: (Ablations: SSM state dimension.) (Top) Constant B and C (Bottom) Selective B and C. Increasing the SSM state dimension ð , which can be viewed as an expansion factor on the dimension of the recurrent state, can significantly improve performance for a negligible cost in parameters/FLOPs, but only when B and C are also selective. Size of â projection fixed to 64. State dimension ð Params (M) Perplexity 1 2 4 8 16 1 2 4 8 16 367.1 367.4 368.0 369.1 371.5 367.1 367.4 368.0 369.1 371.5 9.88 9.86 9.82 9.82 9.81 9.73 9.40 9.09 8.84 8.71 # 5 Discussion We discuss related work, limitations, and some future directions. Related Work. Appendix A discusses how the selection mechanism relates to similar concepts. Appendix B has an extended related work of SSMs and other related models. No Free Lunch: Continuous-Discrete Spectrum. Structured SSMs were originally deï¬ ned as discretizations of continuous systems (1), and have had a strong inductive bias toward continuous-time data modalities such as perceptual signals (e.g. audio, video).', chunk_index=44, num_tokens=392, metadata={}), ResponseChunk(id='chunk_c9ce1d1c-b880-4bab-8752-67557f89e5a7', content='As discussed in Sections 3.1 and 3.5, the selection mechanism overcomes their weaknesses on discrete modalities such as text and DNA; but this conversely can impede their performance 17 on data that LTI SSMs excel on. Our ablations on audio waveforms examine this tradeoï¬ in more detail. Downstream Affordances. Transformer-based foundation models (particularly LLMs) have a rich ecosystem of properties and modes of interaction with pretrained models, such as ï¬ ne-tuning, adaptation, prompting, in-context learning, instruction tuning, RLHF, quantization, and so on. We are particularly interested in whether Transformer alternatives such as SSMs have similar properties and aï¬ ordances. Scaling. Our empirical evaluation is limited to small model sizes, below the threshold of most strong open source LLMs (e.g. Llama (Touvron et al. 2023)) as well as other recurrent models such as RWKV (B. Peng et al. 2023) and RetNet (Y. Sun et al. 2023), which have been evaluated at the 7B parameter scale and beyond. It remains to assess whether Mamba still compares favorably at these larger sizes. We also note that scaling SSMs may involve further engineering challenges and adjustments to the model that are not discussed in this paper. # 6 Conclusion We introduce a selection mechanism to structured state space models, allowing them to perform context-dependent reasoning while scaling linearly in sequence length. 
When incorporated into a simple attention-free architecture, Mamba achieves state-of-the-art results on a diverse set of domains, where it matches or exceeds the performance of strong Transformer models. We are excited about the broad applications of selective state space models to build foundation models for diï¬ erent domains, especially in emerging modalities requiring long context such as genomics, audio, and video.', chunk_index=45, num_tokens=392, metadata={}), ResponseChunk(id='chunk_d205bc7f-03ad-411e-a737-202e01c63dda', content='Our results suggest that Mamba is a strong candidate to be a general sequence model backbone. # Acknowledgments We thank Karan Goel, Arjun Desai, and Kush Bhatia for helpful feedback on the draft. # References [1] Martin Arjovsky, Amar Shah, and Yoshua Bengio. â Unitary Evolution Recurrent Neural Networksâ . In: The International Conference on Machine Learning (ICML). 2016, pp. 1120â 1128. iga Avsec, Vikram Agarwal, Daniel Visentin, Joseph R Ledsam, Agnieszka Grabska-Barwinska, Kyle R Taylor, Yannis Assael, John Jumper, Pushmeet Kohli, and David R Kelley. â Effective Gene Expression Prediction from Sequence by Integrating Long-range Interactionsâ . In: Nature Methods 18.10 (2021), pp. 1196â 1203. Jimmy Ba, Geoffrey E Hinton, Volodymyr Mnih, Joel Z Leibo, and Catalin Ionescu. â Using Fast Weights to Attend to the Recent Pastâ . In: Advances in Neural Information Processing Systems (NeurIPS) 29 (2016). Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. â Layer Normalizationâ . In: arXiv preprint arXiv:1607.06450 (2016). [2] [3] [4] [5] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. â Neural Machine Translation by Jointly Learning to Align and Translateâ . In: The International Conference on Learning Representations (ICLR). 2015. [6] David Balduzzi and Muhammad Ghifary. â Strongly-typed Recurrent Neural Networksâ . In: International Con- ference on Machine Learning.', chunk_index=46, num_tokens=393, metadata={}), ResponseChunk(id='chunk_0b101439-f241-4e49-a318-8aa2ae189dc8', content='PMLR. 2016, pp. 1292â 1300. [7] Stella Biderman, Hailey Schoelkopf, Quentin Gregory Anthony, Herbie Bradley, Kyle OBrien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, et al. â Pythia: A Suite for Analyzing Large Language Models across Training and Scalingâ . In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 2397â 2430. [8] Yonatan Bisk, Rowan Zellers, Jianfeng Gao, Yejin Choi, et al. â PIQA: Reasoning about Physical Commonsense in Natural Languageâ . In: Proceedings of the AAAI conference on Artificial Intelligence. Vol. 34. 05. 2020, pp. 7432â 7439. [9] Guy E Blelloch. â Prefix Sums and Their Applicationsâ . In: (1990). [10] James Bradbury, Stephen Merity, Caiming Xiong, and Richard Socher. â Quasi-recurrent Neural Networksâ . In: arXiv preprint arXiv:1611.01576 (2016). 18 [11] Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Nee- lakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. â Language Models are Few-shot Learnersâ . In: Advances in Neural Information Processing Systems (NeurIPS) 33 (2020), pp. 1877â 1901. [12] Aydar Bulatov, Yuri Kuratov, and Mikhail S Burtsev. â', chunk_index=47, num_tokens=399, metadata={}), ResponseChunk(id='chunk_17b66203-9d1c-4c9f-9d9e-919f593f3879', content='Scaling Transformer to 1M tokens and Beyond with RMTâ . In: arXiv preprint arXiv:2304.11062 (2023). 
[13] Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. "Generating Long Sequences with Sparse Transformers". In: arXiv preprint arXiv:1904.10509 (2019).
[14] Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, et al. "Rethinking Attention with Performers". In: The International Conference on Learning Representations (ICLR). 2021.
[15] Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. "PaLM: Scaling Language Modeling with Pathways". In: Journal of Machine Learning Research 24.240 (2023), pp. 1–113. url: http://jmlr.org/papers/v24/22-1144.html.
[16] Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. "Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling". In: arXiv preprint arXiv:1412.3555 (2014).
[17] Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. "Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge". In: arXiv preprint arXiv:1803.05457 (2018).
[18] Tri Dao. "FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning". In: (2023).
[19] Tri Dao, Daniel Y Fu, Stefano Ermon, Atri Rudra, and Christopher Ré. "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness". In: Advances in Neural Information Processing Systems (NeurIPS). 2022.
[20] Tri Dao, Daniel Y Fu, Khaled K Saab, Armin W Thomas, Atri Rudra, and Christopher Ré. "Hungry Hungry Hippos: Towards Language Modeling with State Space Models". In: The International Conference on Learning Representations (ICLR). 2023.
[21] Yann N Dauphin, Angela Fan, Michael Auli, and David Grangier. "Language Modeling with Gated Convolutional Networks". In: The International Conference on Machine Learning (ICML). PMLR. 2017, pp. 933–941.
[22] DeepSound. SampleRNN. https://github.com/deepsound-project/samplernn-pytorch. 2017.
[23] Jiayu Ding, Shuming Ma, Li Dong, Xingxing Zhang, Shaohan Huang, Wenhui Wang, and Furu Wei. "LongNet: Scaling Transformers to 1,000,000,000 Tokens". In: arXiv preprint arXiv:2307.02486 (2023).
[24] Chris Donahue, Julian McAuley, and Miller Puckette. "Adversarial Audio Synthesis". In: The International Conference on Learning Representations (ICLR). 2019.
[25] Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". In: The International Conference on Learning Representations (ICLR). 2020.
[26] Nelson Elhage, Neel Nanda, Catherine Olsson, Tom Henighan, Nicholas Joseph, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Nova DasSarma, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, and Chris Olah. "A Mathematical Framework for Transformer Circuits". In: Transformer Circuits Thread (2021).
https://transformer-circuits.pub/2021/framework/index.html.
[27] Mahan Fathi, Jonathan Pilault, Pierre-Luc Bacon, Christopher Pal, Orhan Firat, and Ross Goroshin. "Block-State Transformer". In: arXiv preprint arXiv:2306.09539 (2023).
[28] Yassir Fathullah, Chunyang Wu, Yuan Shangguan, Junteng Jia, Wenhan Xiong, Jay Mahadeokar, Chunxi Liu, Yangyang Shi, Ozlem Kalinli, Mike Seltzer, et al. "Multi-Head State Space Model for Sequence Modeling". In: INTERSPEECH. 2023.
[29] Karl J Friston, Lee Harrison, and Will Penny. "Dynamic Causal Modelling". In: Neuroimage 19.4 (2003), pp. 1273–1302.
[30] Daniel Y Fu, Elliot L Epstein, Eric Nguyen, Armin W Thomas, Michael Zhang, Tri Dao, Atri Rudra, and Christopher Ré. "Simple Hardware-efficient Long Convolutions for Sequence Modeling". In: The International Conference on Machine Learning (ICML) (2023).
[31] Ken-ichi Funahashi and Yuichi Nakamura. "Approximation of Dynamical Systems by Continuous Time Recurrent Neural Networks". In: Neural Networks 6.6 (1993), pp. 801–806.
[32] Leo Gao, Stella Biderman, Sid Black, Laurence Golding, Travis Hoppe, Charles Foster, Jason Phang, Horace He, Anish Thite, Noa Nabeshima, Shawn Presser, and Connor Leahy. "The Pile: An 800GB Dataset of Diverse Text for Language Modeling". In: arXiv preprint arXiv:2101.00027 (2020).
[33] Leo Gao, Jonathan Tow, Stella Biderman, Sid Black, Anthony DiPofi, Charles Foster, Laurence Golding, Jeffrey Hsu, Kyle McDonell, Niklas Muennighoff, Jason Phang, Laria Reynolds, Eric Tang, Anish Thite, Ben Wang, Kevin Wang, and Andy Zou. A Framework for Few-shot Language Model Evaluation. Version v0.0.1. Sept. 2021. doi: 10.5281/zenodo.5371628. url: https://doi.org/10.5281/zenodo.5371628.
[34] Karan Goel, Albert Gu, Chris Donahue, and Christopher Ré. "It's Raw! Audio Generation with State-Space Models". In: The International Conference on Machine Learning (ICML). 2022.
[35] Albert Gu, Tri Dao, Stefano Ermon, Atri Rudra, and Christopher Ré. "HIPPO: Recurrent Memory with Optimal Polynomial Projections". In: Advances in Neural Information Processing Systems (NeurIPS). 2020.
[36] Albert Gu, Karan Goel, and Christopher Ré. "Efficiently Modeling Long Sequences with Structured State Spaces". In: The International Conference on Learning Representations (ICLR). 2022.
[37] Albert Gu, Caglar Gulcehre, Tom Le Paine, Matt Hoffman, and Razvan Pascanu. "Improving the Gating Mechanism of Recurrent Neural Networks". In: The International Conference on Machine Learning (ICML). 2020.
[38] Albert Gu, Ankit Gupta, Karan Goel, and Christopher Ré. "On the Parameterization and Initialization of Diagonal State Space Models". In: Advances in Neural Information Processing Systems (NeurIPS). 2022.
[39] Albert Gu, Isys Johnson, Karan Goel, Khaled Saab, Tri Dao, Atri Rudra, and Christopher Ré. "Combining Recurrent, Convolutional, and Continuous-time Models with the Linear State Space Layer". In: Advances in Neural Information Processing Systems (NeurIPS). 2021.
[40] Albert Gu, Isys Johnson, Aman Timalsina, Atri Rudra, and Christopher Ré.
"How to Train Your HIPPO: State Space Models with Generalized Basis Projections". In: The International Conference on Learning Representations (ICLR). 2023.
[41] Ankit Gupta, Albert Gu, and Jonathan Berant. "Diagonal State Spaces are as Effective as Structured State Spaces". In: Advances in Neural Information Processing Systems 35 (2022), pp. 22982–22994.
[42] David Ha, Andrew Dai, and Quoc V. Le. "HyperNetworks". In: The International Conference on Learning Representations (ICLR). 2017.
[43] Danijar Hafner, Timothy Lillicrap, Jimmy Ba, and Mohammad Norouzi. "Dream to Control: Learning Behaviors by Latent Imagination". In: The International Conference on Learning Representations (ICLR). 2020.
[44] Ramin Hasani, Mathias Lechner, Tsun-Hsuan Wang, Makram Chahine, Alexander Amini, and Daniela Rus. "Liquid Structural State-Space Models". In: The International Conference on Learning Representations (ICLR). 2023.
[45] Mikael Henaff, Arthur Szlam, and Yann LeCun. "Recurrent Orthogonal Networks and Long-Memory Tasks". In: The International Conference on Machine Learning (ICML). 2016.
[46] Dan Hendrycks and Kevin Gimpel. "Gaussian Error Linear Units (GELUs)". In: arXiv preprint arXiv:1606.08415 (2016).
[47] Sepp Hochreiter and Jürgen Schmidhuber. "Long Short-Term Memory". In: Neural Computation 9.8 (1997), pp. 1735–1780.
[48] Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. "An Empirical Analysis of Compute-Optimal Large Language Model Training". In: Advances in Neural Information Processing Systems (NeurIPS) 35 (2022), pp. 30016–30030.
[49] Weizhe Hua, Zihang Dai, Hanxiao Liu, and Quoc Le. "Transformer Quality in Linear Time". In: The International Conference on Machine Learning (ICML). PMLR. 2022, pp. 9099–9117.
[50] Hassan Ismail Fawaz, Germain Forestier, Jonathan Weber, Lhassane Idoumghar, and Pierre-Alain Muller. "Deep Learning for Time Series Classification: A Review". In: Data Mining and Knowledge Discovery 33.4 (2019), pp. 917–963.
[51] Andrei Ivanov, Nikoli Dryden, Tal Ben-Nun, Shigang Li, and Torsten Hoefler. "Data Movement is All You Need: A Case Study on Optimizing Transformers". In: Proceedings of Machine Learning and Systems 3 (2021), pp. 711–732.
[52] Li Jing, Caglar Gulcehre, John Peurifoy, Yichen Shen, Max Tegmark, Marin Soljacic, and Yoshua Bengio. "Gated Orthogonal Recurrent Units: On Learning to Forget". In: Neural Computation 31.4 (2019), pp. 765–783.
[53] Rudolph Emil Kalman. "A New Approach to Linear Filtering and Prediction Problems". In: (1960).
[54] Angelos Katharopoulos, Apoorv Vyas, Nikolaos Pappas, and François Fleuret. "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention". In: International Conference on Machine Learning. PMLR. 2020, pp. 5156–5165.
[55] Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. "DiffWave: A Versatile Diffusion Model for Audio Synthesis". In: International Conference on Learning Representations. 2021.
[56] Chrysoula Kosma, Giannis Nikolentzos, and Michalis Vazirgiannis.
"Time-Parameterized Convolutional Neural Networks for Irregularly Sampled Time Series". In: arXiv preprint arXiv:2308.03210 (2023).
[57] Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. "ImageNet Classification with Deep Convolutional Neural Networks". In: Advances in Neural Information Processing Systems (NeurIPS) 25 (2012).
[58] Tao Lei. "When Attention Meets Fast Recurrence: Training Language Models with Reduced Compute". In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing. 2021, pp. 7633–7648.
[59] Tao Lei, Yu Zhang, Sida I Wang, Hui Dai, and Yoav Artzi. "Simple Recurrent Units for Highly Parallelizable Recurrence". In: arXiv preprint arXiv:1709.02755 (2017).
[60] Mario Lezcano-Casado and David Martínez-Rubio. "Cheap Orthogonal Constraints in Neural Networks: A Simple Parametrization of the Orthogonal and Unitary Group". In: The International Conference on Machine Learning (ICML). 2019.
[61] Yuhong Li, Tianle Cai, Yi Zhang, Deming Chen, and Debadeepta Dey. "What Makes Convolutional Models Great on Long Sequence Modeling?" In: The International Conference on Learning Representations (ICLR). 2023.
[62] Vasileios Lioutas and Yuhong Guo. "Time-aware Large Kernel Convolutions". In: The International Conference on Machine Learning (ICML). PMLR. 2020, pp. 6172–6183.
[63] Chris Lu, Yannick Schroecker, Albert Gu, Emilio Parisotto, Jakob Foerster, Satinder Singh, and Feryal Behbahani. "Structured State Space Models for In-Context Reinforcement Learning". In: Advances in Neural Information Processing Systems (NeurIPS). 2023.
[64] Shahar Lutati, Itamar Zimerman, and Lior Wolf. "Focus Your Attention (with Adaptive IIR Filters)". In: arXiv preprint arXiv:2305.14952 (2023).
[65] Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. "Mega: Moving Average Equipped Gated Attention". In: The International Conference on Learning Representations (ICLR). 2023.
[66] Eric Martin and Chris Cundy. "Parallelizing Linear Recurrent Neural Nets Over Sequence Length". In: The International Conference on Learning Representations (ICLR). 2018.
[67] Soroush Mehri, Kundan Kumar, Ishaan Gulrajani, Rithesh Kumar, Shubham Jain, Jose Sotelo, Aaron Courville, and Yoshua Bengio. "SampleRNN: An Unconditional End-to-End Neural Audio Generation Model". In: The International Conference on Learning Representations (ICLR). 2017.
[68] Harsh Mehta, Ankit Gupta, Ashok Cutkosky, and Behnam Neyshabur. "Long Range Language Modeling via Gated State Spaces". In: The International Conference on Learning Representations (ICLR). 2023.
[69] Zakaria Mhammedi, Andrew Hellicar, Ashfaqur Rahman, and James Bailey. "Efficient Orthogonal Parametrisation of Recurrent Neural Networks using Householder Reflections". In: International Conference on Machine Learning. PMLR. 2017, pp. 2401–2409.
[70] Eric Nguyen, Karan Goel, Albert Gu, Gordon Downs, Preey Shah, Tri Dao, Stephen Baccus, and Christopher Ré. "S4ND: Modeling Images and Videos as Multidimensional Signals with State Spaces". In: Advances in Neural Information Processing Systems (NeurIPS). 2022.
[71] Eric Nguyen, Michael Poli, Marjan Faizi, Armin Thomas, Callum Birch-Sykes, Michael Wornow, Aman Patel, Clayton Rabideau, Stefano Massaroli, Yoshua Bengio, et al. "HyenaDNA: Long-range Genomic Sequence Modeling at Single Nucleotide Resolution". In: Advances in Neural Information Processing Systems (NeurIPS). 2023.
[72] Catherine Olsson, Nelson Elhage, Neel Nanda, Nicholas Joseph, Nova DasSarma, Tom Henighan, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Scott Johnston, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, and Chris Olah. "In-context Learning and Induction Heads". In: Transformer Circuits Thread (2022). https://transformer-circuits.pub/2022/in-context-learning-and-induction-heads/index.html.
[73] Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. "WaveNet: A Generative Model for Raw Audio". In: arXiv preprint arXiv:1609.03499 (2016).
[74] Antonio Orvieto, Samuel L Smith, Albert Gu, Anushan Fernando, Caglar Gulcehre, Razvan Pascanu, and Soham De. "Resurrecting Recurrent Neural Networks for Long Sequences". In: The International Conference on Machine Learning (ICML). 2023.
[75] Denis Paperno, Germán Kruszewski, Angeliki Lazaridou, Ngoc-Quan Pham, Raffaella Bernardi, Sandro Pezzelle, Marco Baroni, Gemma Boleda, and Raquel Fernández. "The LAMBADA Dataset: Word Prediction Requiring a Broad Discourse Context". In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics. 2016, pp. 1525–1534.
[76] Razvan Pascanu, Tomas Mikolov, and Yoshua Bengio. "On the Difficulty of Training Recurrent Neural Networks". In: International Conference on Machine Learning. 2013, pp. 1310–1318.
[77] Bo Peng, Eric Alcaide, Quentin Anthony, Alon Albalak, Samuel Arcadinho, Huanqi Cao, Xin Cheng, Michael Chung, Matteo Grella, Kranthi Kiran GV, et al. "RWKV: Reinventing RNNs for the Transformer Era". In: arXiv preprint arXiv:2305.13048 (2023).
[78] Hao Peng, Nikolaos Pappas, Dani Yogatama, Roy Schwartz, Noah A Smith, and Lingpeng Kong. "Random Feature Attention". In: The International Conference on Learning Representations (ICLR). 2021.
[79] Michael Poli, Stefano Massaroli, Eric Nguyen, Daniel Y Fu, Tri Dao, Stephen Baccus, Yoshua Bengio, Stefano Ermon, and Christopher Ré. "Hyena Hierarchy: Towards Larger Convolutional Language Models". In: The International Conference on Machine Learning (ICML). 2023.
[80] Zhen Qin, Xiaodong Han, Weixuan Sun, Bowen He, Dong Li, Dongxu Li, Yuchao Dai, Lingpeng Kong, and Yiran Zhong. "Toeplitz Neural Network for Sequence Modeling". In: The International Conference on Learning Representations (ICLR). 2023.
[81] Zhen Qin, Xiaodong Han, Weixuan Sun, Dongxu Li, Lingpeng Kong, Nick Barnes, and Yiran Zhong. "The Devil in Linear Transformer". In: arXiv preprint arXiv:2210.10340 (2022).
[82] Zhen Qin, Weixuan Sun, Hui Deng, Dongxu Li, Yunshen Wei, Baohong Lv, Junjie Yan, Lingpeng Kong, and Yiran Zhong.
"CosFormer: Rethinking Softmax in Attention". In: The International Conference on Learning Representations (ICLR). 2022.
[83] Ali Rahimi and Benjamin Recht. "Random Features for Large-scale Kernel Machines". In: Advances in Neural Information Processing Systems 20 (2007).
[84] Prajit Ramachandran, Barret Zoph, and Quoc V Le. "Swish: A Self-gated Activation Function". In: arXiv preprint arXiv:1710.05941 7.1 (2017), p. 5.
[85] David W Romero, Anna Kuzina, Erik J Bekkers, Jakub M Tomczak, and Mark Hoogendoorn. "CKConv: Continuous Kernel Convolution For Sequential Data". In: arXiv preprint arXiv:2102.02611 (2021).
[86] Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. "Winogrande: An Adversarial Winograd Schema Challenge at Scale". In: Communications of the ACM 64.9 (2021), pp. 99–106.
[87] George Saon, Ankit Gupta, and Xiaodong Cui. "Diagonal State Space Augmented Transformers for Speech Recognition". In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE. 2023, pp. 1–5.
[88] Imanol Schlag, Kazuki Irie, and Jürgen Schmidhuber. "Linear Transformers are Secretly Fast Weight Programmers". In: The International Conference on Machine Learning (ICML). PMLR. 2021, pp. 9355–9366.
[89] Noam Shazeer. "GLU Variants Improve Transformer". In: arXiv preprint arXiv:2002.05202 (2020).
[90] Freda Shi, Xinyun Chen, Kanishka Misra, Nathan Scales, David Dohan, Ed H Chi, Nathanael Schärli, and Denny Zhou. "Large Language Models can be Easily Distracted by Irrelevant Context". In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 31210–31227.
[91] Jiaxin Shi, Ke Alexander Wang, and Emily Fox. "Sequence Modeling with Multiresolution Convolutional Memory". In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 31312–31327.
[92] Jimmy TH Smith, Andrew Warrington, and Scott W Linderman. "Simplified State Space Layers for Sequence Modeling". In: The International Conference on Learning Representations (ICLR). 2023.
[93] Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu. "RoFormer: Enhanced Transformer with Rotary Position Embedding". In: arXiv preprint arXiv:2104.09864 (2021).
[94] Yutao Sun, Li Dong, Shaohan Huang, Shuming Ma, Yuqing Xia, Jilong Xue, Jianyong Wang, and Furu Wei. "Retentive Network: A Successor to Transformer for Large Language Models". In: arXiv preprint arXiv:2307.08621 (2023).
[95] Ilya Sutskever, Oriol Vinyals, and Quoc V Le. "Sequence to Sequence Learning with Neural Networks". In: Advances in Neural Information Processing Systems (NeurIPS) 27 (2014).
[96] Corentin Tallec and Yann Ollivier. "Can Recurrent Neural Networks Warp Time?" In: The International Conference on Learning Representations (ICLR). 2018.
[97] Yi Tay, Mostafa Dehghani, Samira Abnar, Yikang Shen, Dara Bahri, Philip Pham, Jinfeng Rao, Liu Yang, Sebastian Ruder, and Donald Metzler. "Long Range Arena: A Benchmark for Efficient Transformers". In: International Conference on Learning Representations (ICLR). 2021.
[98] Yi Tay, Mostafa Dehghani, Dara Bahri, and Donald Metzler. "Efficient Transformers: A Survey". In: ACM Computing Surveys 55.6 (2022), pp.
1–28.
[99] Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, et al. "Llama: Open and Efficient Foundation Language Models". In: arXiv preprint arXiv:2302.13971 (2023).
[100] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. "Attention Is All You Need". In: Advances in Neural Information Processing Systems (NeurIPS). 2017.
[101] Eugene Vorontsov, Chiheb Trabelsi, Samuel Kadoury, and Chris Pal. "On Orthogonality and Learning Recurrent Networks with Long Term Dependencies". In: International Conference on Machine Learning. PMLR. 2017, pp. 3570–3578.
[102] Jue Wang, Wentao Zhu, Pichao Wang, Xiang Yu, Linda Liu, Mohamed Omar, and Raffay Hamid. "Selective Structured State-Spaces for Long-form Video Understanding". In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2023, pp. 6387–6397.
[103] Pete Warden. "Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition". In: ArXiv abs/1804.03209 (2018).
[104] Samuel Williams, Andrew Waterman, and David Patterson. "Roofline: An Insightful Visual Performance Model for Multicore Architectures". In: Communications of the ACM 52.4 (2009), pp. 65–76.
[105] Brandon Yang, Gabriel Bender, Quoc V Le, and Jiquan Ngiam. "CondConv: Conditionally Parameterized Convolutions for Efficient Inference". In: Advances in Neural Information Processing Systems (NeurIPS) 32 (2019).
[106] Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. "HellaSwag: Can a Machine Really Finish Your Sentence?" In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics. 2019.
[107] Shuangfei Zhai, Walter Talbott, Nitish Srivastava, Chen Huang, Hanlin Goh, Ruixiang Zhang, and Josh Susskind. "An Attention Free Transformer". In: arXiv preprint arXiv:2105.14103 (2021).
[108] Michael Zhang, Khaled K Saab, Michael Poli, Tri Dao, Karan Goel, and Christopher Ré. "Effectively Modeling Time Series with Simple Discrete State Spaces". In: The International Conference on Learning Representations (ICLR). 2023.
[109] Lin Zheng, Chong Wang, and Lingpeng Kong. "Linear Complexity Randomized Self-attention Mechanism". In: International Conference on Machine Learning. PMLR. 2022, pp. 27011–27041.
[110] Simiao Zuo, Xiaodong Liu, Jian Jiao, Denis Charles, Eren Manavoglu, Tuo Zhao, and Jianfeng Gao. "Efficient Long Sequence Modeling via State Space Augmented Transformer". In: arXiv preprint arXiv:2212.08136 (2022).
# A Discussion: Selection Mechanism
Our selection mechanism is inspired by and related to concepts such as gating, hypernetworks, and data-dependence. It can also be viewed as related to "fast weights" (J. Ba et al. 2016), which connects classical RNNs with the mechanism of linear attention (Schlag, Irie, and Schmidhuber 2021). However, we believe that it is a distinct concept that is worth clarifying.
Gating. Gating originally referred to the gating mechanisms of RNNs such as the LSTM (Hochreiter and Schmidhuber 1997) and GRU (J.
Chung et al. 2014), or the gated equation (5) in Theorem 1. This was interpreted as a particular mechanism for controlling whether to let an input into the hidden state of an RNN. In particular, this affects the propagation of signal through time and causes inputs to interact along the sequence length dimension.
However, the concept of gating has since been relaxed in popular usage to simply mean any multiplicative interaction (often with an activation function). For example, elementwise multiplicative components of neural network architectures (that do not interact along sequence length) are now commonly referred to as gated architectures (Hua et al. 2022; Mehta et al. 2023), despite a very different meaning than the original RNN sense. Thus we believe the original concept of RNN gating and the popular usage of multiplicative gating actually have very different semantic meanings.
Hypernetworks. Hypernetworks refer to neural networks whose parameters are themselves generated by smaller neural networks. The original idea (Ha, Dai, and Quoc V. Le 2017) used it in a narrow sense to define a large RNN whose recurrent parameters are generated by a smaller RNN.
Data-dependence. Similar to hypernetworks, data-dependence can refer to any notion where some parameters of the model depend on the data (Poli et al. 2023).
Example: GLU Activation. To illustrate the issues with these concepts, consider a simple diagonal linear layer y = Dx, where D is a diagonal weight parameter. Now suppose that D is itself generated from a linear transformation of x, with an optional nonlinearity: D = σ(Wx). Since it is diagonal, the multiplication becomes an elementwise product: y = σ(Wx) ∘ x.
This is a rather trivial transformation, yet it technically satisfies the common meanings of gating (since it has a multiplicative "branch"), hypernetworks (since the parameter D is generated by another layer), and data-dependence (since D depends on the data x). However, this in fact simply defines a GLU function, which is so simple that it is often considered just an activation function (Dauphin et al. 2017; Shazeer 2020) instead of a meaningful layer.
Selection. Thus, while selection mechanisms could be considered a special case of ideas such as architectural gating, hypernetworks, or data-dependence, so can an enormous range of other constructions – essentially anything with a multiplication, including standard attention mechanisms (Bahdanau, Cho, and Bengio 2015; Vaswani et al. 2017) as well – and we find it uninformative to think of them as such.
Instead, we view it as most closely related to the gating mechanism of traditional RNNs, which is a special case (Theorem 1) and also has a deeper history of connections to SSMs through variable (input-dependent) discretization of Δ (Funahashi and Nakamura 1993; Gu, Dao, et al. 2020; Tallec and Ollivier 2018). We also eschew the term "gating" in favor of selection to clarify the overloaded use of the former. More narrowly, we use selection to refer to the mechanistic action of a model to select or ignore inputs and facilitate data interaction along the sequence length (Section 3.1).
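To make the GLU example above concrete, here is a minimal NumPy sketch (hypothetical names and shapes) of the data-dependent diagonal layer y = σ(Wx) ∘ x; it is exactly the gated-linear-unit pattern rather than a selection mechanism, since the multiplication acts elementwise and never along the sequence dimension.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def glu_style_layer(x, W):
    """Diagonal layer y = D x with a data-dependent diagonal D = sigmoid(W x).

    Because D is diagonal, applying it reduces to an elementwise product,
    which is the GLU pattern discussed above.
    """
    D = sigmoid(W @ x)   # "hypernetwork": parameters generated from the input itself
    return D * x         # multiplicative "gate", acting per feature, not across time

# Toy usage with assumed dimensions
rng = np.random.default_rng(0)
x = rng.standard_normal(8)
W = rng.standard_normal((8, 8))
y = glu_style_layer(x, W)
print(y.shape)  # (8,)
```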
Beyond selective SSMs and gated RNNs, other examples may include input-dependent convolutions (Kosma, Nikolentzos, and Vazirgiannis 2023; Lioutas and Guo 2020; Lutati, Zimerman, and Wolf 2023; Yang et al. 2019) and even attention.
# B Related Work
We overview several prior works related to our methods. We mention that some of the most closely related models include recurrent layers such as S4, S5, and quasi-RNNs; as well as end-to-end architectures such as H3, RetNet, and RWKV.
# B.1 S4 Variants and Derivatives
We give a brief overview of some structured SSMs from past work, particularly those that have a relation to our method.
• S4 (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) introduced the first structured SSM, describing diagonal structure and diagonal plus low-rank (DPLR). It focused on efficient convolutional algorithms for DPLR SSMs due to a connection to continuous-time online memorization (HIPPO) (Gu, Dao, et al. 2020).
• DSS (Gupta, Gu, and Berant 2022) first discovered the empirical effectiveness of diagonal structured SSMs by approximating the HIPPO initialization. This was expanded on theoretically in S4D (Gu, Gupta, et al. 2022).
• S5 (Smith, Warrington, and Linderman 2023) independently discovered the diagonal SSM approximation, and is the first S4 model to be computed recurrently with the parallel scan. However, this required lowering the effective state dimension, which they accomplished by switching the SSM dimensions from a SISO (single-input single-output) to a MIMO (multi-input multi-output) formulation. Our proposed S6 shares the scan, but differs by (i) keeping the SISO dimensions, which provides a larger effective recurrent state, (ii) using a hardware-aware algorithm to overcome the computation issue, and (iii) adding the selection mechanism.
Lu et al. (2023) applied S5 to meta-RL in order to handle resetting the SSM state between episode trajectories. Their mechanism can be viewed as a particular hard-coded instance of a selection mechanism, where A is manually set to 0, instead of our learnable mechanism that depends on the input. It would be interesting to apply selective SSMs generically to this setting and probe whether the model has learned to automatically reset its state on episode boundaries.
• Mega (Ma et al. 2023) introduced a simplification of S4 to be real- instead of complex-valued, giving it an interpretation of being an exponential moving average (EMA). They additionally make an interesting connection of the discretization step of SSMs to an EMA damping term. Contrary to findings in the original S4 papers, this was the first model to show that real-valued SSMs are empirically effective in certain settings or when combined with different architectural components.
• Liquid S4 (Hasani et al. 2023) is also motivated by augmenting S4 with an input-dependent state transition. From this perspective it shares similarity to selection mechanisms, although in a limited form which is still computed convolutionally and close to LTI.
• SGConv (Y. Li et al. 2023), Hyena (Poli et al. 2023), LongConv (Fu et al. 2023), MultiresConv (J. Shi, K. A. Wang, and Fox 2023), and Toeplitz Neural Network (Qin, Han, W. Sun, He, et al.
2023) all focus on the convolutional representation of S4 and create global or long convolution kernels with different parameterizations. However, these methods cannot do fast autoregressive inference directly.
Notably, all of these methods, and all other structured SSMs that we are aware of, have been non-selective and usually strictly LTI (linear time invariant).
# B.2 SSM Architectures
We use SSM architectures or state space neural networks (SSNN) to refer to deep neural network architectures incorporating one of the previous SSMs as a black box layer.
• GSS (Mehta et al. 2023) was the first gated neural network architecture incorporating SSMs. It is motivated by the gated attention unit (GAU) of Hua et al. (2022) and looks quite similar to our block, except with additional projections. Most importantly, its projection contracts the model dimension to reduce the state size of the SSM, while ours expands the model dimension in order to increase the state size, based on the motivation in Section 3.1.
• Mega (Ma et al. 2023) combined the EMA simplification of S4 described above into a hybrid architecture using an efficient attention approximation.
• H3 (Dao, Fu, Saab, et al. 2023) is motivated by combining S4 with linear attention (Katharopoulos et al. 2020). It is the first to generalize this formulation of linear attention to more general recurrences, which is also the basis of later architectures.
• Selective S4 (J. Wang et al. 2023) incorporates S4 as a black box to generate a binary mask which is multiplied on the input. While sharing the "selection" name, we consider this an architectural modification that is closer to architectural gating than a selection mechanism (Appendix A). For example, we hypothesize that it would not solve the Selective Copying task because simply masking out the irrelevant inputs does not affect the spacing between the relevant ones (indeed, the Selective Copying task can even be viewed as coming pre-masked if the noise tokens are embedded to 0).
• RetNet (Y. Sun et al. 2023) is also based on Linear Attention and very similar to H3, but reduces the inner S4 layer to a special case where the state dimension is N = 1. Although not framed as such, its recurrence can be viewed as a special case of a linear SSM. Its primary source of improvement is using a linear attention with large head dimension, which can be viewed as another method to perform input-dependent state expansion. Using a larger head dimension in the context of linear attention variants was first done by H3, but not extensively used since this requires a proportional amount of extra computation. RetNet avoids this with an alternate way to parallelize the computation with a variant of standard multi-head attention instead of convolutions, made feasible by their particular special case of SSMs which acts as a simple EMA.
• RWKV (B. Peng et al. 2023) is another recent RNN designed for language modeling. It is based on AFT (attention-free Transformer (S. Zhai et al. 2021)), another variant of linear attention. Its main "WKV" mechanism involves LTI recurrences and can be seen as the ratio of two SSMs.
We also highlight the gated attention unit (GAU) from Hua et al.
(2022), which was motivated by combining the Transformer's MHA and MLP blocks together and was an inspiration for our architecture (Section 3.4) combining the H3 and MLP blocks.
# B.3 Relationship to RNNs
RNNs and SSMs are broadly related, as they both involve the concepts of recurrence on a latent state.
Several older RNNs such as the strongly typed RNN (Balduzzi and Ghifary 2016), quasi-RNN (QRNN) (Bradbury et al. 2016), and simple recurrent unit (SRU) (Lei 2021; Lei et al. 2017) involve forms of gated RNNs without time-wise nonlinearities. Because of the connections between gating mechanisms and selection mechanisms, these can be viewed as cases of selective SSMs, and are thus more powerful in a sense than the family of LTI structured SSMs above. The main differences are:
• They do not use state expansion (N = 1) or selective B, C parameters, both of which are important for performance (Section 4.6).
• They use a heuristic gating mechanism, which we generalize as a consequence of the selection mechanism + discretization (Theorem 1). The connections to principled SSM theory provide better parameterizations and initializations (Section 3.6).
Additionally, older RNNs famously suffered from efficiency issues and the vanishing gradients problem (Pascanu, Mikolov, and Bengio 2013), both caused by their sequential nature. The latter could be solved for some of the above RNNs by leveraging the parallel scan (Martin and Cundy 2018), but the former was difficult without theory later developed for SSMs. For example, modern structured SSMs differ in more careful parameterization of the recurrent dynamics inspired by classical SSM theory (e.g. through discretization (Gu, Johnson, Goel, et al. 2021; Gu, Johnson, Timalsina, et al. 2023)), or direct analysis (Orvieto et al. 2023).
We also note that there is a long line of work on orthogonal RNNs (Arjovsky, Shah, and Bengio 2016; Henaff, Szlam, and LeCun 2016; Lezcano-Casado and Martínez-Rubio 2019; Mhammedi et al. 2017; Vorontsov et al. 2017), which are motivated by constraining the A transition matrix to be orthogonal or unitary, in order to control its eigenvalues and prevent the vanishing gradient problem. However, these had other limitations; we believe that these stem from the fact that orthogonal/unitary RNNs are also LTI. For example, they are almost always evaluated on the Copying task, which they can solve perfectly, but observed to struggle on the Selective Copying task (Jing et al. 2019).
# B.4 Linear Attention
The Linear Attention (LA) (Katharopoulos et al. 2020) framework is an important result popularizing kernel attention and showing how it relates to recurrent autoregressive models. Many variants have proposed alternative kernels and other modifications. Random Feature Attention (RFA) (H. Peng et al. 2021) chooses the kernel feature map to approximate softmax attention (i.e. the exp feature map) using the random Fourier feature approximation of Gaussian kernels (Rahimi and Recht 2007). Performer (Choromanski et al. 2021) finds an approximation to the exponential kernel involving only positive features, which also allows the softmax normalization term. TransNormer (Qin, Han, W. Sun, D. Li, et al.
2022) showed that the LA denominator term can be unstable and proposed replacing it with a LayerNorm. cosFormer (Qin, W. Sun, et al. 2022) augments RFA with a cosine reweighting mechanism that incorporates positional information to emphasize locality. Linear Randomized Attention (Zheng, C. Wang, and L. Kong 2022) generalizes RFA from the perspective of importance sampling, extending it to provide better estimates of the full softmax kernel (rather than just the exp-transformed numerator).
Aside from kernel attention, many other variants of efficient attention exist; the survey of Tay, Dehghani, Bahri, et al. (2022) offers an extensive categorization of many of these.
# B.5 Long Context Models
Long context has become a popular subject, and several recent models have claimed to scale to longer and longer sequences. However, these claims are often made from a computational standpoint and have not been extensively validated. These include:
• Recurrent Memory Transformer (Bulatov, Kuratov, and Burtsev 2023), a lightweight wrapper around a Transformer backbone. It showed the ability to generalize up to 1M sequences, but only on synthetic memorization tasks; their main result is similar to our Induction Heads extrapolation experiment (Table 2).
• LongNet (Ding et al. 2023), which claimed to scale to 1B length but only evaluated on lengths < 100K for actual tasks.
• Hyena and HyenaDNA (Nguyen, Poli, et al. 2023; Poli et al. 2023), which claimed to leverage up to 1M context. However, their experiments trained on proportionally more data at longer contexts, making it hard to conclude whether quality improvements at 1M context are due to context length or due to more data and computation.
• Sparse Transformer (Child et al. 2019) showed a proof-of-concept of using a strided sparse attention Transformer to model audio waveforms of length 2^20 = 1048576, although it did not discuss performance tradeoffs when controlling for computation and model size.
In contrast, we believe this work presents one of the first approaches to meaningfully demonstrate increasing performance with longer context.
# C Mechanics of Selective SSMs
Proof of Theorem 1. Consider a selective SSM (Algorithm 2) with N = 1, A = −1, B = 1, s_Δ = Linear(x), τ_Δ = softplus. The corresponding continuous-time SSM (1) is
h'(t) = −h(t) + x(t),
which is also called a leaky integrator.
The discretization step size is
Δ_t = τ_Δ(Parameter + s_Δ(x_t)) = softplus(Parameter + Linear(x_t)) = softplus(Linear(x_t)),
where we observe that the parameter can be viewed as a learnable bias and folded into the linear projection.
Now applying the zero-order hold (ZOH) discretization formulas:
Ā_t = exp(ΔA) = 1 / (1 + exp(Linear(x_t))) = σ(−Linear(x_t)) = 1 − σ(Linear(x_t)),
B̄_t = (ΔA)^(−1) (exp(ΔA) − I) · ΔB = −(exp(ΔA) − I) = 1 − Ā_t = σ(Linear(x_t)).
Thus the final discrete recurrence (2a) is
g_t = σ(Linear(x_t)),
h_t = (1 − g_t) h_{t−1} + g_t x_t,
as desired.
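The identities in the proof above can be checked numerically. The following is a small NumPy sketch (names are illustrative): with A = −1 and Δ = softplus(z), ZOH gives Ā = 1 − σ(z) and B̄ = σ(z), so one step of the scalar SSM equals the gated update h_t = (1 − g_t) h_{t−1} + g_t x_t.

```python
import numpy as np

def softplus(z):
    return np.log1p(np.exp(z))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
z = rng.standard_normal(5)      # stands in for Linear(x_t)
x = rng.standard_normal(5)

delta = softplus(z)             # Δ_t = softplus(Linear(x_t))
A_bar = np.exp(-delta)          # ZOH with A = −1
B_bar = 1.0 - A_bar             # (ΔA)^(−1)(exp(ΔA) − I)·ΔB with B = 1

# Identities used in the proof
assert np.allclose(A_bar, 1.0 - sigmoid(z))
assert np.allclose(B_bar, sigmoid(z))

# One step of the recurrence equals the gated RNN update
h_prev = rng.standard_normal(5)
g = sigmoid(z)
assert np.allclose(A_bar * h_prev + B_bar * x, (1.0 - g) * h_prev + g * x)
```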
# D Hardware-aware Algorithm For Selective SSMs
Without input-dependent selectivity, SSMs can be efficiently implemented as a convolution (Dao, Fu, Saab, et al. 2023; Gu, Goel, and Ré 2022), which leverages the fast Fourier transform (FFT) as a primitive. With selectivity, SSMs are no longer equivalent to convolution, but we leverage the parallel associative scan. While SSM scans are theoretically efficient (O(BLDN) FLOPs, scaling linearly in L), training foundation models with selective SSMs requires them to be efficient on modern hardware (GPUs) as well. We describe how we use kernel fusion and recomputation to make the SSM scan fast and memory-efficient. We evaluate the speed of our scan implementation compared to convolution and attention in Section 4.5, showing that it is up to 7× faster than attention at sequence length 32K, and is as memory-efficient as the best attention implementation (FlashAttention).
Speed. On modern hardware accelerators (GPUs) most operations (except matrix multiply) are bounded by memory bandwidth (Dao, Fu, Ermon, et al. 2022; Ivanov et al. 2021; Williams, Waterman, and Patterson 2009). This is the case with our scan operation, and we use kernel fusion to reduce the amount of memory IOs, leading to a significant speedup compared to a standard implementation.
The standard way to implement the scan algorithm in Section 3.2 is to prepare the scan input Ā, B̄ of size (B, L, D, N) in GPU HBM (high-bandwidth memory, commonly referred to as GPU memory), call a parallel associative scan implementation to write the scan output of size (B, L, D, N) to GPU HBM, then multiply that scan output with C to produce an output of size (B, L, D). However, this requires a number of memory reads/writes on the order of O(BLDN). We can instead fuse the discretization step, the scan, and the multiplication with C into one kernel:
1. We read O(BLD + DN) bytes of memory (Δ, A, B, C) from slow HBM to fast SRAM.
2. We discretize to produce Ā, B̄ of size (B, L, D, N) in SRAM.
3. We perform a parallel associative scan, yielding intermediate states of size (B, L, D, N) in SRAM.
4. We multiply and sum with C, producing outputs of size (B, L, D), and write them to HBM.
This way, we reduce IOs by a factor of O(N) (the state dimension), which in practice speeds up the operation by 20-40 times (Section 4.5). A sketch of the associative combine step that the scan parallelizes is given after Table 11.
Table 11: (Induction heads.) Models are trained on sequence length 2^8 = 256, and tested on various sequence lengths from 2^6 = 64 up to 2^20 = 1048576. ✓ denotes perfect generalization accuracy, while ✗ denotes out of memory.
Model      Params  2^6   2^7   2^8    2^9   2^10  2^11  2^12  2^13  2^14  2^15  2^16  2^17  2^18  2^19  2^20
MHA-Abs    137K    ✓     99.6  100.0  58.6  26.6  18.8  9.8   10.9  7.8   ✗     ✗     ✗     ✗     ✗     ✗
MHA-RoPE   137K    ✓     ✓     100.0  83.6  31.3  18.4  8.6   9.0   5.5   ✗     ✗     ✗     ✗     ✗     ✗
MHA-xPos   137K    ✓     ✓     100.0  99.6  67.6  25.4  7.0   9.0   7.8   ✗     ✗     ✗     ✗     ✗     ✗
H3         153K    ✓     ✓     100.0  80.9  39.5  23.8  14.8  8.2   5.9   6.6   8.2   4.7   8.2   6.3   7.4
Hyena      69M*    97.7  ✓     100.0  ✓     44.1  12.5  6.6   5.1   7.0   5.9   6.6   6.6   5.9   6.3   9.8
Mamba      74K     ✓     ✓     100.0  ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓
* Most of the parameters are in learnable positional encodings.
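To illustrate the parallel associative scan from Appendix D, here is a minimal NumPy sketch (an illustrative sequential reference, not the fused CUDA kernel): each scan element is a pair (Ā_t, B̄_t·x_t), and the combine operator composes two affine updates, which is associative and therefore amenable to a parallel prefix scan (e.g. Blelloch 1990).

```python
import numpy as np

def combine(left, right):
    """Compose two affine updates h -> a*h + b; associativity is what lets a
    prefix scan over (a, b) pairs be evaluated in parallel."""
    a1, b1 = left
    a2, b2 = right
    return a2 * a1, a2 * b1 + b2

def selective_scan_reference(A_bar, B_bar, x, h0=0.0):
    """Sequential reference: h_t = A_bar[t] * h_{t-1} + B_bar[t] * x[t]."""
    h, out = h0, []
    for a, bx in zip(A_bar, B_bar * x):
        h = a * h + bx
        out.append(h)
    return np.array(out)

# Check that left-to-right composition with `combine` matches the recurrence.
rng = np.random.default_rng(0)
L = 8
A_bar = rng.uniform(0.1, 0.9, L)
B_bar = 1.0 - A_bar
x = rng.standard_normal(L)

acc, prefix = (1.0, 0.0), []          # identity element: h -> 1*h + 0
for t in range(L):
    acc = combine(acc, (A_bar[t], B_bar[t] * x[t]))
    prefix.append(acc[1])             # h_t for h0 = 0
assert np.allclose(prefix, selective_scan_reference(A_bar, B_bar, x))
```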
For sequence lengths L too long to fit in SRAM (which is much smaller than HBM), we split the sequences into chunks and perform the fused scan on each chunk. As long as we have the intermediate scan states, we can continue the scan with the next chunk.
Memory. We describe how we use the classical technique of recomputation to reduce the total amount of memory required to train selective SSM layers.
From the way we fuse the forward pass, we do not save the intermediate states of size (B, L, D, N) to avoid memory blowup. However, these intermediate states are necessary for the backward pass to compute gradients. We instead recompute those intermediate states in the backward pass. Since the inputs Δ, A, B, C and the output gradient read from HBM to SRAM are of size O(BLN + DN), and the input gradients are also of size O(BLN + DN), recomputation avoids the cost of reading O(BLND) elements from HBM. This means that recomputation of the SSM states in the backward pass speeds up the computation compared to storing them and reading them from HBM.
Beyond optimizing for the memory requirement of just the scan operation, we also use recomputation to optimize the memory requirement of the entire selective SSM block (input projection, convolution, activation, scan, output projection). In particular, we do not save intermediate activations that take a lot of memory but are fast to recompute (e.g. the output of the activation function or the short convolution). As a result, the selective SSM layer has the same memory requirement as an optimized Transformer implementation with FlashAttention. In particular, each attention layer (FlashAttention) stores around 12 bytes of activations per token, and each MLP layer stores around 20 bytes of activations per token, for a total of 32 bytes (assuming mixed-precision training in FP16 or BF16). Each selective SSM stores around 16 bytes of activations per token. Hence two layers of selective SSMs have around the same activation memory as an attention layer and an MLP layer.
# E Experimental Details and Additional Results
# E.1 Synthetic Tasks
Selective Copying. Our setting is on sequences of length 4096, with a vocab size of 16 possible tokens (including the white "noise" token from Figure 2) and requiring models to memorize 16 "data" tokens. We use 2-layer models with a model dimension of D = 64. Models are trained for 400K steps at a constant learning rate of 0.0001 with a batch size of 64.
Induction Heads. Training consists of randomly generating data every step, with a batch size of 8. We choose an "epoch" size of 8192 steps, and track the accuracy on fixed validation sets (also randomly generated) of each target sequence length. For the MHA-Abs and Mamba models, results are reported after the 25th epoch (8192 × 25 = 204800 steps). For the MHA-RoPE and MHA-xPos models, results are reported after the 50th epoch (8192 × 50 = 409600 steps). For the LTI H3 and Hyena models, results are reported after the 10th epoch (81920 steps) because they had converged by then and failed to improve further.
Table 12: (Scaling Law Model Sizes.) Our model sizes and hyperparameters for scaling experiments. (Model dimension and number of heads applies only to Transformer models.)
Params   n_layers   d_model   n_heads / d_head   Training steps   Learning rate   Batch size    Tokens
125M     12         768       12 / 64            4800             6e-4            0.5M tokens   2.5B
350M     24         1024      16 / 64            13500            3e-4            0.5M tokens   7B
760M     24         1536      16 / 96            29000            2.5e-4          0.5M tokens   15B
1.3B     24         2048      32 / 64            50000            2e-4            0.5M tokens   26B
We use the Adam optimizer with no weight decay. All models are trained at constant learning rates 2e-4 and 1e-3, and the better results are reported for each model (2e-4 for all models except Mamba). The attention and Hyena models did not learn at LR 1e-3. H3 learned at both LRs, but interestingly generalized better to shorter sequences at the smaller LR of 2e-4. Mamba learned at both LRs, but extrapolated better at the larger LR of 1e-3.
# E.2 Language Modeling
# E.2.1 Scaling Law Details
All models were trained on the Pile.
Model Sizes. Table 12 specifies the model sizes we use for scaling laws. This is taken directly from the GPT3 specifications (Brown et al. 2020), with very minor modifications. First, we changed the batch size of the 1.3B model from 1M tokens to 0.5M tokens, since we did not use enough parallelization to require the larger batch size. Second, we changed the number of training steps and total tokens to roughly match Chinchilla scaling laws (Hoffmann et al. 2022), which specify that training tokens should increase proportionally to model size.
Training Recipes. All models used the AdamW optimizer with
• gradient clip value 1.0
• weight decay 0.1
• no dropout
• linear learning rate warmup with cosine decay
By default, the peak learning rate is the GPT3 specification.
We give several models an "improved recipe", inspired by changes adopted by popular large language models such as PaLM (Chowdhery et al. 2023) and LLaMa (Touvron et al. 2023). These include:
• linear learning rate warmup with cosine decay to 1e-5, with a peak value of 5× the GPT3 value
• no linear bias terms
• RMSNorm instead of LayerNorm
• AdamW hyperparameter β = (.9, .95) (the GPT3 value) instead of the PyTorch default of β = (.9, .999)
Architecture and Training Details. Our models are:
• Transformer: The standard Transformer based on GPT3 (Table 12).
• Transformer++: A Transformer with an improved architecture, namely rotary positional encodings (Su et al. 2021) and SwiGLU MLP (Shazeer 2020), and the improved training recipe above.
• Hyena: Interleaving a Hyena block (the H3 block with S4 replaced by a global convolution parameterized by an MLP) with standard MLP blocks. The MLP blocks have expansion factor 2 instead of 4, and the number of layers is correspondingly increased by 1.5× to preserve parameter count.
• H3++: The H3 architecture with a few modifications, including (i) using the same "thin" Hyena dimensions above, (ii) the improved training recipe above, and (iii) a linear attention head dimension of 8.
• RWKV: The default RWKV model from B. Peng et al. (2023), including its modified MLP block. We also used as much of its specified training recipe as possible, such as increasing the learning rates by 2× or 3× on certain parameters.
• RetNet: The default RetNet model from Y. Sun et al. (2023).
We also gave it the improved training recipe above.
• Mamba: The standard Mamba architecture, with the improved training recipe.
# E.2.2 Additional Scaling Law Ablations
We perform additional ablations on the architecture using the same protocol as the 2k context length scaling laws in Figure 4 (Left).
Mamba Architecture: Interleaving Blocks. We test the effect of different architectural blocks combined with the Mamba block. We focus on the viewpoint that the Mamba block is simply the standard SwiGLU block with an extra conv → SSM path added. This leads to two natural ablations:
• What if the Mamba block is interleaved with a standard MLP block, instead of stacked homogenously? This can also be interpreted as taking Mamba and removing half of the SSMs.
• What if the Mamba block is interleaved with MHA (multi-head attention) blocks? This can also be interpreted as taking a Transformer with SwiGLU MLPs (i.e. what we call Transformer++) and simply adding SSMs to the MLP blocks.
Figure 9 (Right) shows these variants compared to the original (homogenous) Mamba architecture. Interestingly, neither change matters too much. The Mamba-MLP architecture is only slightly worse, and still better than all models except Transformer++. The Mamba-MHA architecture is only slightly better, which is somewhat surprising in light of the fact that many recent works have found that combining (LTI) SSMs with attention can lead to substantial improvements (Dao, Fu, Saab, et al. 2023; Fathi et al. 2023; Fathullah et al. 2023; Saon, Gupta, and Cui 2023; Zuo et al. 2022).
H3 Architecture: Training Recipes. Next we ablate differences between the Hyena and H3++ models, our weakest and strongest models outside of Transformer++ and Mamba, particularly to isolate the effect of training recipes.
• Hyena: The Hyena block with its original architecture and GPT3 training recipe (same as Figure 4).
• Hyena+: The same architecture but with the improved training recipe described above.
• H3+: The same architecture as Hyena+ but with the Hyena convolution kernel swapped out for an S4D convolution kernel.
• H3++: The same as H3+, but with a linear attention head dimension of 8. This increases computation inside the SSM recurrence but does not increase parameters.
Our general convention is that "Model+" represents the base model with the improved training recipe, and "Model++" also allows for architectural changes.
Figure 9 (Right) shows that:
• A large improvement is achieved by the improved training recipe, which was used for many of the models in the main Figure 4 (RetNet, H3++, Transformer++, Mamba).
• The choice of the inner LTI SSM does not matter (e.g. Hyena vs. S4), consistent with findings throughout this paper.
• The head dimension expansion improves performance, consistent with one of our main themes that expanded state dimension improves performance for SSMs (Section 3).
[Figure 9: (Scaling laws: extra ablations.) Two panels titled "Scaling Laws on The Pile (Sequence Length 2048)", plotting perplexity against FLOPs (log scale) for Mamba, Mamba-MLP, Hyena, and related model variants.]
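As a concrete reference for the training recipes described above, here is a hedged PyTorch sketch of the AdamW settings and the linear-warmup/cosine-decay schedule. The model, step counts, and warmup length are placeholders or assumptions, not values taken from the paper; only the betas, weight decay, decay floor, and gradient clip value come from the recipe itself.

```python
import math
import torch

model = torch.nn.Linear(512, 512)        # placeholder for the actual language model

peak_lr, min_lr = 2e-4, 1e-5             # peak per Table 12; cosine decays to 1e-5 in the improved recipe
warmup_steps, total_steps = 500, 13500   # warmup length is an assumed value

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=peak_lr,
    betas=(0.9, 0.95),                   # GPT3-style betas from the improved recipe
    weight_decay=0.1,
)

def lr_lambda(step):
    # Linear warmup, then cosine decay from peak_lr down to min_lr.
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
    return (min_lr + (peak_lr - min_lr) * cosine) / peak_lr

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# The training loop would also clip gradients at 1.0:
# torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
```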
# E.2.3 Downstream Evaluation Details
This pretraining procedure is the same as the scaling law protocol, but extended to 300B tokens. For the 1.3B model, we use a batch size of 1M tokens to be consistent with the GPT3 specifications. We report the perplexity on the Pile validation set, and for this metric only compare to models trained on the same dataset and with the same tokenizer, in particular Pythia and RWKV.
For downstream evaluation, we use the LM evaluation harness from EleutherAI (L. Gao, Tow, et al. 2021), as done by most work in this area. We evaluate on the following tasks/datasets that measure common sense reasoning:
• LAMBADA (Paperno et al. 2016).
• HellaSwag (Zellers et al. 2019).
• PIQA (Bisk et al. 2020).
• ARC-challenge (P. Clark et al. 2018).
• ARC-easy: an easy subset of ARC-challenge.
• WinoGrande (Sakaguchi et al. 2021).
We report accuracy for LAMBADA, WinoGrande, PIQA, and ARC-easy, and accuracy normalized by sequence length for HellaSwag and ARC-challenge (since normalized accuracy is higher for almost all models for these tasks).
# E.3 DNA Modeling
# E.3.1 Pretraining Details
We describe the dataset and training procedure of the HG38 pretraining task in more detail.
The dataset follows the splits from the prior Enformer work on genomics (Avsec et al. 2021); the training split contains a total of n = 34021 segments of length 2^17 = 131072 that cover the genome, for a total of approximately 4.5 billion tokens (DNA base pairs). These segments are pairs of (chromosome number, starting index, ending index), and can be extended if necessary (e.g. to get longer segments).
We deviate from HyenaDNA when the training sequence length is not 2^17. HyenaDNA always takes a fixed sub-segment (e.g. the beginning or middle of the prescribed segment), and thus for any training sequence length each epoch is fixed to 34021 samples and doesn't necessarily go through the whole genome. On the other hand, we use the entire training data:
• When the context length L is less than (or equal to) 2^17, we divide up each segment into non-overlapping sub-segments of length L, so that there are n × 2^17 / L total samples and n × 2^17 ≈ 4.5B tokens per epoch.
• When the context length L is greater than 2^17, we turn each segment into two samples, one that begins with the prescribed segment and one that ends with the prescribed segment. Thus each epoch has 2n samples and 2nL tokens. For example, at sequence length 2^18 = 262144 there are 4× as many tokens as the default, and at sequence length 2^20 there are 16× as many tokens.
Other training details generally follow the same protocol as our language modeling experiments (Appendix E.2). For example, we use AdamW with (β1, β2) = (0.9, 0.95), no dropout, and weight decay 0.1. We use a cosine learning rate scheduler with linear warmup for 10% of total steps.
# E.3.2 Scaling: Model Size Details
Models. The models we consider are:
• Transformer++: a Transformer with improved architecture, notably the usage of RoPE positional encodings (Su et al. 2021). Informally, we found these to be noticeably better than vanilla positional encodings from (Vaswani et al. 2017).
• HyenaDNA: the Hyena model from Nguyen, Poli, et al. (2023) and Poli et al.
(2023), which is roughly a Transformer with the MHA block replaced by an H3 block using a global convolution parameterized by an MLP. â ¢ Mamba: the standard Mamba architecture. Model Sizes. We use the following model sizes. Blocks Model Dimension Params (Approx.) 4 64 250K 700K 1.4M 3.5M 7.0M 19.3M 40.7M 5 96 6 128 7 192 8 256 10 384 12 512', chunk_index=87, num_tokens=397, metadata={}), ResponseChunk(id='chunk_799bb73c-75d9-47cf-b854-480fa8bb2253', content='Note that the number of blocks for Mamba is doubled, because one Transformer â layerâ includes both the MHA and MLP blocks (and similarly for Hyena), which requires two Mamba blocks to match parameters (Section 3.4). Training. For each model (Transformer++, HyenaDNA, Mamba), we swept the learning rate across {1ð â 3, 2ð â 3, 4ð â 3, 8ð â 3}. The optimal Transformer and HyenaDNA learning rates were 2e-3 across all sizes. The optimal Mamba learning rate was 8e-3; note that Mamba performed better than baselines with matched learning rates (2e-3), but was more stable and improved even more at higher learning rates. (Furthermore, as this LR is on the upper range of the sweep, it is possible that our results are still suboptimal.) Note that, in contrast to standard LM scaling laws (Table 12), our LR held constant across model sizes for simplicity. The optimal LR should go down for larger models, but we didnâ t ï¬ nd a noticeable eï¬ ect at the small model sizes (at most a few million parameters) we considered. E.3.3 Scaling: Context Length Details We use a total batch size of 224 â 16ð tokens per training step, for every sequence length (e.g. at length 220 there are 16 segments per batch and at length 210 there are 16384 segments per batch). This is a large batch size relative to the model size by usual LM standards, but note that a batch size of 223 is the minimum possible on a machine with 8 GPUs and sequence length of 220, and that HyenaDNA used much larger batches of 228.', chunk_index=88, num_tokens=372, metadata={}), ResponseChunk(id='chunk_68f636df-556e-4210-b0b8-5f94a5c013b8', content='The learning rate used was 0.008 for Mamba and 0.001 for HyenaDNA; we initially attempted to use the same learning rate of 0.002 from the previous section for HyenaDNA, but found that it was unstable at the longest context length. Sequence Length Warmup. Following (Nguyen, Poli, et al. 2023), we use sequence length warmup (SLW) during pretraining. We choose a simple schedule of 2 epochs at each power-of-two sequence length starting from 210 = 1024. (Note that because of how data is curated, at the longest sequence lengths more steps and tokens are spent proportionally. In particular, each stage up to length 217 processes the same number of tokens, but 4à as many tokens are processed at length 218, 8à as many at length 219, and 16à as many at length 220.) Unlike HyenaDNA, we always control for the number of tokens per gradient update, so the batch size is successively halved as the sequence lengths are doubled in each stage. 33 Table 13: (Great Apes DNA Classification.) Accuracy after fine-tuning on sequences of length 210 = 1024 up to 220 = 1048576 using pretrained models of the same context length. Random guessing is 20%. Params Accuracy (%) at Sequence Length 210 212 214 216 218 220 28.04 31.47 28.43 27.50 41.17 27.66 42.22 40.72 31.10 42.41 7M 30.00 29.01 31.48 43.73 56.60 Remark E.1. 
We also note that the schedule was not tuned, and we never experimented with turning off sequence length warmup for these pretraining experiments.', chunk_index=89, num_tokens=393, metadata={}), ResponseChunk(id='chunk_555f6986-48e3-40a8-8326-57556040b1ca', content='We later found that SLW did not help noticeably for audio pretraining at similar lengths (Section 4.4), and it is possible that it is not necessary for DNA pretraining either. # E.3.4 Species (Great Apes) Classification Models are causal and therefore only the last element (across the sequence length) of the modelâ s output is used for the classiï¬ cation head. Note that we control for the total number of elements in the loss function per gradient step. The pretraining objective includes all positions across the sequence length, so that ð ð ð ð ð _ð ð ð £ð à ð ð ð ð ð ð ð ð _ð ð ð ð ð ð is held constant; in other words, the batch size decreases as the sequence length increases. However, for a classiï¬ cation task, since only the last position enters the loss, the batch size itself is held constant. Note that this also means that ï¬ ne-tuning models with longer sequence lengths is more computationally expensive. Training consists of 10 epochs, each of which has 1024 gradient steps. Each gradient step uses batch size 64, which are all independently randomly drawn by uniformly picking a species, uniformly picking a chromosome, and then uniformly picking a contiguous segment of DNA. Following (Nguyen, Poli, et al. 2023), models with a maximum context length greater than 214 = 16384 use sequence length warmup with 1 epoch at length 214 = 16384, 1 epoch at length 215 = 32768, 1 epoch at length 216 = 65536, and so on up to the maximum sequence length. For example, the model with 220 = 1048576 context undergoes 6 epochs of sequence length warmup before 4 more epochs at its maximum sequence length. The learning rate for all Hyena models is ð ºð â ð', chunk_index=90, num_tokens=421, metadata={}), ResponseChunk(id='chunk_8124ee24-797b-49e9-863b-3764278ce657', content='», while the learning rate for all Mamba models is ð ·ð â ð º. These were found by performing learning rate sweeps for each model among {1ð â 5, 2ð â 5, 4ð â 5, 1ð â 4, 2ð â 4} for the smaller sequence lengths (210, 212, 214, 216), and these values were consistently found to be the best for each model. An abridged learning rate sweep was done at length 218, which agreed with these values, and a single run at length 220 was performed (as described above, the computational cost of these experiments is proportional to the sequence length). The learning rate followed a cosine decay schedule with warmup with 5 epochs of linear warmup to the maximum learning rate, and 5 epochs of cosine decay down to 1ð â 6. The unusually long learning rate warmup schedule was chosen because the sequence length warmup was also long (e.g. comprising 6 out of 10 epochs for the model with context length 220); we did not experiment with this choice. Results for the Species classiï¬ cation task are in Table 13. # E.4 Audio Details # E.4.1 YouTubeMix Audio Pretraining Model. We use a model with 3 blocks per stage (3 à 5 = 15 total Mamba blocks), pooling factor ð = 16, and outer dimension ð · = 64, for about 3.5M parameters. Dataset. The data is mu-law encoded at 8 bits, so the model is modeling discrete tokens with a vocab size of 256. The dataset consists of clips of up to 1 minute long, or length 960000, which is subsampled and divided into segments of any desired sequence length. 
Since the architecture involves two stages of pooling by a factor of 16, 34 Table 14:', chunk_index=91, num_tokens=403, metadata={}), ResponseChunk(id='chunk_5ab06cd1-8a6c-4f95-a3a0-d590071fe117', content='YouTubeMix length scaling sequence lengths and batch sizes. 468 à 2048 = 958464 234 à 2048 = 479232 117 à 2048 = 239616 59 à 2048 = 120832 30 à 2048 = 61440 15 à 2048 = 30720 8 à 2048 = 16384 4 à 2048 = 8192 1 2 4 8 16 32 64 128 958464 958464 958464 966656 983040 983040 1048576 1048576 Audio Waveforms - SSM Parameterization aso â â samp â â Mamba (s6) = â sy = sSeaive B/C ° 1.40 4 â â -selective A s ras | __Mamba-$4) B 1204 124 108 108 Sequence Length Audio Waveforms - SSM Parameterization â â Mamba ($6) 4 â â +complex = Solestive a | (Mamba-S4) 1.35 1.304 1.254 108 108 Sequence Length 1.48 21404 . é ag Figure 10: (Audio Pretraining (YouTubeMix) Ablations.) As a uniformly-sampled â continuousâ signal modality, audio wave- forms actually benefit from LTI models which have matching inductive bias. (Left) Homogenous models (all blocks have the same parameterization) (Right) Only the center U-Net blocks are ablated; the outer blocks are Mamba-S4. Purple line is same as figure on left. and we want the resulting sequence length to be a a multiple of 8 for hardware eï¬ ciency, the longest possible sequence is 468 à 2048 = 958464. The rest of our sequence lengths are deï¬', chunk_index=92, num_tokens=400, metadata={}), ResponseChunk(id='chunk_0049db3f-e2b3-4470-863b-f22a040551bb', content='ned by successively halving this and rounding up to the nearest multiple of 2048. Table 14 lists the speciï¬ cations used in Figure 7. Beyond the varying batch sizes, the number of valid segments in the training set varied between diï¬ erent sequence lengths (e.g. the number of training steps per epoch was not constant for diï¬ erent points in the graph), which may have contributed to kinks in the scaling curves. Training. Models were trained for 200ð ¾ training steps with a maximum learning rate of 0.002, 20ð ¾ (10%) warmup steps, and weight decay 0.1 (similar to our general pretraining recipe across domains). Additional Ablations: SSM Parameterizations. We investigate SSM parameterizations on long-form audio waveform pretraining in the setting of Figure 7. The setting is modiï¬ ed slightly to use larger models (8 layers and ð · = 64 for 6M params, the SaShiMi default), shorter sequences (211 = 2048 to 218 = 262144 instead of 213 to 220), lower LR (0.001 from 0.002), and shorter training cycles (100K instead of 200K steps). Figure 10 shows that the change from S4 â S6 (i.e. the selection mechanism) is not always beneï¬ cial. On long-form audio waveforms, it in fact signiï¬ cantly hampers performance, which may be intuitive from the point of view that audio is uniformly sampled and very smooth, and therefore beneï¬ ts from continuous linear time-invariant (LTI) methods. After ablating away the selection mechanism, note that the resulting model is the S4 layer inside the Mamba block. To disambiguate, we call this Mamba-S4 as opposed the default Mamba architecture Mamba-S6.', chunk_index=93, num_tokens=403, metadata={}), ResponseChunk(id='chunk_9f2adfb3-7954-4403-8b21-db062b37e9aa', content='However, on the right side, we keep the outer layers of the U-Net Mamba-S4 and ablate only the inner layers. 
The performance diï¬ erences shrink dramatically; this reinforces the hypothesis that layers closer to the raw audio signal should be LTI, but once they are â tokenizedâ and compressed by the outer layers, the inner layers no longer need to be LTI. In this setting however, the real-valued SSM still underperforms the complex-valued one. 35 # E.4.2 SC09 Speech Generation Autoregressive training largely followed the autoregressive language modeling protocol, such as â ¢ Weight decay 0.1 â ¢ Learning rate warmup for 10% of total steps â ¢ AdamW optimizer with ð ½ = (0.9, 0.95) â ¢ Gradient clip value 0.1 We used a learning rate of 0.002 and 200000 training steps at a batch size of 16. The large Mamba model in Table 4 has 15 layers per stage with an outer dimension of ð · = 96 and pooling factor 4. We note that this dataset is small (training went through 100 epochs) and for this large model, there was signiï¬ cant overï¬ tting of the BPB or NLL. However, automated metrics of generated samples continually improving throughout training. The models in the architecture ablations in Table 5 all have 8 layers per stage with an outer dimension of ð ³ = 64 and pooling factor 4. The S4+MLP block has roughly 2ð ·2 + 4ð ·2 parameters (expansion factor 2 in the MLP). The Transformer block has 4ð ·2 + 2ð ·2 parameters (expansion factor 1 in the MLP). The Mamba block has the usual â 6ð ·2 parameters.', chunk_index=94, num_tokens=403, metadata={}), ResponseChunk(id='chunk_4f568176-27dc-48f7-adb3-0d6f72e30543', content='All models have roughly 6M total parameters. # E.5 Efficiency Benchmark Scan Operation. We compare the core operation of selective SSMs, which is the parallel scan (Section 3.3), against convolution and attention, measured on an A100 80GB PCIe GPU. Note that these do not include the cost of other operations outside of this core operation, such as computing the convolutional kernel in global-convolution models, or computing the QKV projections in attention. As a baseline, we implement a standard parallel scan in PyTorch with no kernel fusion. This requires materializing the parameters A, B, C in HBM. Our scan implementation fuses the discretization step and the parallel scan, avoiding the cost of materializing all the large parameters in HBM. For convolution, we use the standard implementation in PyTorch, which separately performs FFTs on the inputs and the ï¬ lters, multiply them in frequency domain, then performs an inverse FFT to obtain the result. The theoretical complexity is ð (ð ¿ log(ð ¿)) for sequence length ð ¿. For attention, we compare against the fastest implementation that we are aware of (FlashAttention-2 (Dao 2023)), with causal mask. Note that FlashAttention-2 with causal mask is about 1.7à faster than without causal mask, since approximately only half of the attention entries are computed. We use batch size of 1 and increase the sequence length from 29 = 512, 210 â 1ð ¾, 211 â 2ð ¾, up to 219 â 500ð ¾ (some of the baselines run out of memory before reaching 500K). We use a model dimension of ð · = 1024 and state dimension ð = 16. We measure with BF16 inputs, which is the data type most commonly used for large scale training. End-to-end Inference.', chunk_index=95, num_tokens=399, metadata={}), ResponseChunk(id='chunk_1a552e60-3231-4013-8657-03c5ca6a7791', content='We measure the inference throughput of a Mamba 1.4B model and an untrained Mamba 6.9B model, against a standard Transformer (GPT3 architecture) at 1.3B and 6.7B size. 
We use the standard Transformer implementation in the Huggingface transformers library. We set the prompt length to be 2048 and the generation length to be 128. We vary the batch size from 1, 2, 4, 8, 16, 32, 64, to 128, and measure time time taken to generate 128 tokens. We then calculate the throughput (tokens/s) as batch size à 128â time taken. We repeat the measurements 3 times and take the average. Measurements are done on an A100 80GB PCIe GPU. Memory Benchmark. The memory usage simply scales proportionally to the size of the activation tensors, as with most deep sequence models. We report measurements of the training memory requirements of 125M models 36 Table 15: (Memory benchmark.) Mambaâ s memory footprint is comparable to the most optimized Transformer. Results for 125M models. Batch size Transformer (w/ FlashAttention-2) Mamba 1 2 4 8 16 32 4.6GB 5.2GB 6.9GB 11.5GB 20.7GB 34.5GB 4.8GB 5.8GB 7.3GB 12.3GB 23.1GB 38.2GB on 1 A100 80GB GPU. Each batch consists of sequences of length 2048. We compare to the most memory-eï¬ cient Transformer implementation we are aware of (with kernel fusion from torch.compile and with FlashAttention-2). Table 15 shows that Mambaâ', chunk_index=96, num_tokens=384, metadata={}), ResponseChunk(id='chunk_e5dfb946-cfee-4340-ba1c-5c7331c3b622', content='s memory requirement is comparable to a similar-sized Transformer with an extremely optimized implementation, and we expect further improvement in Mambaâ s memory footprint in the future. 37', chunk_index=97, num_tokens=34, metadata={})]))" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from aurelio_sdk import ChunkingOptions, ChunkResponse\n", "\n", @@ -125,24 +136,29 @@ "\n", "response_regex: ChunkResponse = await client.chunk(\n", " content=content, processing_options=chunking_options\n", - ")" + ")\n", + "\n", + "response_regex" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 19, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "status= usage=Usage(tokens=42937, pages=None, seconds=None) message=None processing_options=ChunkingOptions(max_chunk_length=400, chunker_type='regex', window_size=1, delimiters=[]) document=ResponseDocument(id='doc_d6f92e7e-e067-4c08-9530-e0a29f30aa0f', content='# Mamba: Linear-Time Sequence Modeling with Selective State Spaces\\n# Albert Gu*1 and Tri Dao*2\\n1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me\\n# Abstract\\nFoundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâ\\x80\\x99 computational ineï¬\\x83ciency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token. 
Second, even though this change prevents the use of eï¬\\x83cient convolutions, we design a hardware-aware parallel algorithm in recurrent mode. We integrate these selective SSMs into a simpliï¬\\x81ed end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5Ã\\x97 higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.\\n# 1 Introduction\\nFoundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an eï¬\\x80ective paradigm in modern machine learning. The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014). While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015) The eï¬\\x83cacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data. However, this property brings fundamental drawbacks: an inability to model anything outside of a ï¬\\x81nite window, and quadratic scaling with respect to the window length. An enormous body of research has appeared on more eï¬\\x83cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬\\x80ective. As of yet, none of these variants have been shown to be empirically eï¬\\x80ective at scale across domains.\\nRecently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling. These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960). This class of models can be computed very eï¬\\x83ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length. Additionally, they have principled\\nEqual contribution.\\n1\\nmechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021). Many ï¬\\x82avors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 2022; Saon, Gupta, and Cui 2023). 
However, they have been less eï¬\\x80ective at modeling discrete and information-dense data such as text.\\nWe propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.\\nSelection Mechanism. First, we identify a key limitation of prior models: the ability to eï¬\\x83ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs). Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input. This allows the model to ï¬\\x81lter out irrelevant information and remember relevant information indeï¬\\x81nitely.\\nHardware-aware Algorithm. This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬\\x83cient. We overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬\\x80erent levels of the GPU memory hierarchy. The resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3Ã\\x97 faster on A100 GPUs).\\nArchitecture. We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces.\\nSelective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬\\x83ciency together yield performance improvements on real data up to sequence length 1M.\\nWe empirically validate Mambaâ\\x80\\x99s potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬\\x81c task performance, on several types of modalities and settings:\\nâ\\x80¢ Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬\\x81nitely long (>1M tokens).\\nâ\\x80¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences.\\nâ\\x80¢ Language Modeling. Mamba is the ï¬\\x81rst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. 
With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). Our Mamba language model has 5Ã\\x97 generation throughput compared to Transformers of similar size, and Mamba-3Bâ\\x80\\x99s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B).\\nModel code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.\\n2\\n# Selective State Space Model\\n# with Hardware-aware State Expansion\\n# A\\nvuvy GPU SRAM Selection Mechanism es\\nSelection Mechanism\\nFigure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð\\x9d\\x90· = 5) of an input ð\\x9d\\x91¥ to output ð\\x9d\\x91¦ through a higher dimensional latent state â\\x84\\x8e (e.g. ð\\x9d\\x91\\x81 = 4). Prior SSMs avoid materializing this large effective state (ð\\x9d\\x90·ð\\x9d\\x91\\x81, times batch size ð\\x9d\\x90µ and sequence length ð\\x9d\\x90¿) through clever alternate computation paths requiring time-invariance: the (â\\x88\\x86, A, B, C) parameters are constant across time. Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy.\\n# 2 State Space Models\\nStructured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð\\x9d\\x91¥(ð\\x9d\\x91¡) â\\x88\\x88 â\\x84\\x9d â\\x86¦ ð\\x9d\\x91¦(ð\\x9d\\x91¡) â\\x88\\x88 â\\x84\\x9d through an implicit latent state â\\x84\\x8e(ð\\x9d\\x91¡) â\\x88\\x88 â\\x84\\x9dð\\x9d\\x91\\x81. Concretely, S4 models are deï¬\\x81ned with four parameters (â\\x88\\x86, A, B, C), which deï¬\\x81ne a sequence-to-sequence trans- formation in two stages.\\nâ\\x84\\x8eâ\\x80²(ð\\x9d\\x91¡) = Aâ\\x84\\x8e(ð\\x9d\\x91¡) + Bð\\x9d\\x91¥(ð\\x9d\\x91¡) ð\\x9d\\x91¦(ð\\x9d\\x91¡) = Câ\\x84\\x8e(ð\\x9d\\x91¡)\\n(1a) (1b) â\\x84\\x8eð\\x9d\\x91¡ = Aâ\\x84\\x8eð\\x9d\\x91¡â\\x88\\x921 + Bð\\x9d\\x91¥ð\\x9d\\x91¡ ð\\x9d\\x91¦ð\\x9d\\x91¡ = Câ\\x84\\x8eð\\x9d\\x91¡ (2a) (2b) ð\\x9d\\x91\\x98 ð\\x9d\\x91² = (Cð\\x9d\\x91©, Cð\\x9d\\x91¨ð\\x9d\\x91©, â\\x80¦ , Cð\\x9d\\x91¨ ð\\x9d\\x91¦ = ð\\x9d\\x91¥ â\\x88\\x97 ð\\x9d\\x91² ð\\x9d\\x91©, â\\x80¦ ) (3a) (3b)\\nDiscretization. The ï¬\\x81rst stage transforms the â\\x80\\x9ccontinuous parametersâ\\x80\\x9d (â\\x88\\x86, A, B) to â\\x80\\x9cdiscrete parametersâ\\x80\\x9d (A, B) through ï¬\\x81xed formulas A = ð\\x9d\\x91\\x93ð\\x9d\\x90´(â\\x88\\x86, A) and B = ð\\x9d\\x91\\x93ð\\x9d\\x90µ(â\\x88\\x86, A, B), where the pair (ð\\x9d\\x91\\x93ð\\x9d\\x90´, ð\\x9d\\x91\\x93ð\\x9d\\x90µ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬\\x81ned in equation (4).\\nA = exp(â\\x88\\x86A) B = (â\\x88\\x86A)â\\x88\\x921(exp(â\\x88\\x86A) â\\x88\\x92 I) â\\x8b\\x85 â\\x88\\x86B (4)\\nDiscretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023). 
It also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5. However, from a mechanical point of view discretization can simply be viewed as the ï¬\\x81rst step of the computation graph in the forward pass of an SSM. Alternate ï¬\\x82avors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about.\\nComputation. After the parameters have been transformed from (â\\x88\\x86, A, B, C) â\\x86¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3).\\n3\\nCommonly, the model uses the convolutional mode (3) for eï¬\\x83cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬\\x83cient autoregressive inference (where the inputs are seen one timestep at a time).\\nLinear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ\\x80\\x99s dynamics are constant through time. In other words (â\\x88\\x86, A, B, C), and consequently (A, B) as well, are ï¬\\x81xed for all time-steps. This property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models.\\nThus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental eï¬\\x83ciency constraints, discussed in Section 3.3. However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬\\x83ciency bottlenecks.\\nStructure and Dimensions. Finally, we note that structured SSMs are so named because computing them eï¬\\x83ciently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use. In this case, the A â\\x88\\x88 â\\x84\\x9dð\\x9d\\x91\\x81Ã\\x97ð\\x9d\\x91\\x81, B â\\x88\\x88 â\\x84\\x9dð\\x9d\\x91\\x81Ã\\x971, C â\\x88\\x88 â\\x84\\x9d1Ã\\x97ð\\x9d\\x91\\x81 matrices can all be represented by ð\\x9d\\x91\\x81 numbers. To operate over an input sequence ð\\x9d\\x91¥ of batch size ð\\x9d\\x90µ and length ð\\x9d\\x90¿ with ð\\x9d\\x90· channels, the SSM is applied independently to each channel. Note that in this case, the total hidden state has dimension ð\\x9d\\x90·ð\\x9d\\x91\\x81 per input, and computing it over the sequence length requires ð\\x9d\\x91\\x82(ð\\x9d\\x90µð\\x9d\\x90¿ð\\x9d\\x90·ð\\x9d\\x91\\x81) time and memory; this is the root of the fundamental eï¬\\x83ciency bottleneck addressed in Section 3.3.\\nGeneral State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state. It has been used to refer to many disparate concepts in diï¬\\x80erent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 
2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬\\x81lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning).\\nThroughout this entire paper we use the term â\\x80\\x9cSSMâ\\x80\\x9d to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary.\\nSSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\\nâ\\x80¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM.\\nâ\\x80¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer.\\nâ\\x80¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021).\\nâ\\x80¢ RetNet (Y. Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions.\\n4\\nâ\\x80¢ RWKV (B. Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)). Its main â\\x80\\x9cWKVâ\\x80\\x9d mechanism involves LTI recurrences and can be viewed as the ratio of two SSMs.\\nOther closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM.\\n# 3 Selective State Space Models\\nWe motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\\x83ciently. We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5).\\n# 3.1 Motivation: Selection as a Means of Compression\\nWe argue that a fundamental problem of sequence modeling is compressing context into a smaller state. 
In fact, we can view the tradeoï¬\\x80s of popular sequence models from this point of view. For example, attention is both eï¬\\x80ective and ineï¬\\x83cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬\\x83cient because they have a ï¬\\x81nite state, implying constant-time inference and linear-time training. However, their eï¬\\x80ectiveness is limited by how well this state has compressed the context.\\nTo understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\\nâ\\x80¢ The Selective Copying task modiï¬\\x81es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬\\x81lter out the irrelevant ones (white).\\nâ\\x80¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black).\\nThese tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬\\x80ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬\\x83culty with the Selective Copying task because of lack of content-awareness (Figure 2). More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\\nIn summary, the eï¬\\x83ciency vs. eï¬\\x80ectiveness tradeoï¬\\x80 of sequence models is characterized by how well they compress their state: eï¬\\x83cient models must have a small state, while eï¬\\x80ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬\\x81lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).\\n# Improving SSMs with Selection\\nOne method of incorporating a selection mechanism into models is by letting their parameters that aï¬\\x80ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the convolution kernel of a CNN) be input-dependent.\\n5\\nCopying Output noo am > mt HE nee Tt Solution\\n# Tetons\\n|\\n# oO S lective Copying\\n# aoe\\n# i)\\n# [coe\\n# Induction Heads\\n# EES\\n>\\n# fo\\nPerfectly solved by LTI (e.g. convolutional) models that do not need to look at the actual inputs\\nHi i Hl ] Bw H a H > BH\\nFigure 2: (Left) The standard version of the Copying task involves constant spacing between input and output elements and is easily solved by time-invariant models such as linear recurrences and global convolutions. 
(Right Top) The Selective Copying task has random spacing in between inputs and requires time-varying models that can selectively remember or ignore inputs depending on their content. (Right Bottom) The Induction Heads task is an example of associative recall that requires retrieving an answer based on context, a key ability for LLMs.\\nAlgorithm 2 SSM + Selection (S6) Input: ð\\x9d\\x91¥ â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³) Output: ð\\x9d\\x91¦ â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³) 1: A â\\x88¶ (ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b â\\x8a³ Represents structured ð\\x9d\\x91\\x81 Ã\\x97 ð\\x9d\\x91\\x81 matrix â\\x8a³ Represents structured ð\\x9d\\x91\\x81 Ã\\x97 ð\\x9d\\x91\\x81 matrix 2: B â\\x88¶ (ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b 3: C â\\x88¶ (ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b 4: â\\x88\\x86 â\\x88¶ (ð\\x9d\\x99³) â\\x86\\x90 ð\\x9d\\x9c\\x8fâ\\x88\\x86(ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b) 5: A, B â\\x88¶ (ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96½ð\\x9d\\x97\\x82ð\\x9d\\x97\\x8cð\\x9d\\x96¼ð\\x9d\\x97\\x8bð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x97\\x82ð\\x9d\\x97\\x93ð\\x9d\\x96¾(â\\x88\\x86, A, B) 6: ð\\x9d\\x91¦ â\\x86\\x90 ð\\x9d\\x96²ð\\x9d\\x96²ð\\x9d\\x96¬(A, B, C)(ð\\x9d\\x91¥) 2: B â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x91\\xa0ð\\x9d\\x90µ(ð\\x9d\\x91¥) 3: C â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x91\\xa0ð\\x9d\\x90¶(ð\\x9d\\x91¥) 4: â\\x88\\x86 â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³) â\\x86\\x90 ð\\x9d\\x9c\\x8fâ\\x88\\x86(ð\\x9d\\x96¯ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x96ºð\\x9d\\x97\\x86ð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x96¾ð\\x9d\\x97\\x8b+ð\\x9d\\x91\\xa0â\\x88\\x86(ð\\x9d\\x91¥)) 5: A, B â\\x88¶ (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³, ð\\x9d\\x99½) â\\x86\\x90 ð\\x9d\\x96½ð\\x9d\\x97\\x82ð\\x9d\\x97\\x8cð\\x9d\\x96¼ð\\x9d\\x97\\x8bð\\x9d\\x96¾ð\\x9d\\x97\\x8dð\\x9d\\x97\\x82ð\\x9d\\x97\\x93ð\\x9d\\x96¾(â\\x88\\x86, A, B) 6: ð\\x9d\\x91¦ â\\x86\\x90 ð\\x9d\\x96²ð\\x9d\\x96²ð\\x9d\\x96¬(A, B, C)(ð\\x9d\\x91¥) â\\x8a³ Time-invariant: recurrence or convolution â\\x8a³ Time-varying: recurrence (scan) only 7: return ð\\x9d\\x91¦ 7: return ð\\x9d\\x91¦\\nAlgorithms 1 and 2 illustrates the main selection mechanism that we use. The main diï¬\\x80erence is simply making several parameters â\\x88\\x86, B, C functions of the input, along with the associated changes to tensor shapes throughout. In particular, we highlight that these parameters now have a length dimension ð\\x9d\\x90¿, meaning that the model has changed from time-invariant to time-varying. (Note that shape annotations were described in Section 2). 
This loses the equivalence to convolutions (3) with implications for its eï¬\\x83ciency, discussed next.\\nWe speciï¬\\x81cally choose ð\\x9d\\x91\\xa0ð\\x9d\\x90µ(ð\\x9d\\x91¥) = ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x91\\x81(ð\\x9d\\x91¥), ð\\x9d\\x91\\xa0ð\\x9d\\x90¶(ð\\x9d\\x91¥) = ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x91\\x81(ð\\x9d\\x91¥), ð\\x9d\\x91\\xa0â\\x88\\x86(ð\\x9d\\x91¥) = ð\\x9d\\x96¡ð\\x9d\\x97\\x8bð\\x9d\\x97\\x88ð\\x9d\\x96ºð\\x9d\\x96½ð\\x9d\\x96¼ð\\x9d\\x96ºð\\x9d\\x97\\x8cð\\x9d\\x97\\x8dð\\x9d\\x90·(ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8b1(ð\\x9d\\x91¥)), and ð\\x9d\\x9c\\x8fâ\\x88\\x86 = ð\\x9d\\x97\\x8cð\\x9d\\x97\\x88ð\\x9d\\x96¿ð\\x9d\\x97\\x8dð\\x9d\\x97\\x89ð\\x9d\\x97\\x85ð\\x9d\\x97\\x8eð\\x9d\\x97\\x8c, where ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8bð\\x9d\\x91\\x91 is a parameterized projection to dimension ð\\x9d\\x91\\x91. The choice of ð\\x9d\\x91\\xa0â\\x88\\x86 and ð\\x9d\\x9c\\x8fâ\\x88\\x86 is due to a connection to RNN gating mechanisms explained in Section 3.5.\\n# 3.3 Efficient Implementation of Selective SSMs\\nHardware-friendly architectures such as convolutions (Krizhevsky, Sutskever, and Hinton 2012) and Transform- ers (Vaswani et al. 2017) enjoy widespread application. Here we aim to make selective SSMs eï¬\\x83cient on modern hardware (GPU) as well. The selection mechanism is quite natural, and earlier works attempted to incorporate special cases of selection, such as letting â\\x88\\x86 vary over time in recurrent SSMs (Gu, Dao, et al. 2020). However, as previously mentioned a core limitation in the usage of SSMs is their computational eï¬\\x83ciency, which was why S4 and all derivatives used LTI (non-selective) models, most commonly in the form of global convolutions.\\n# 3.3.1 Motivation of Prior Models\\nWe ï¬\\x81rst revisit this motivation and overview our approach to overcome limitations of prior methods.\\nâ\\x80¢ At a high level, recurrent models such as SSMs always balance a tradeoï¬\\x80 between expressivity and speed: as discussed in Section 3.1, models with larger hidden state dimension should be more eï¬\\x80ective but slower. Thus\\n6\\nwe want to maximize hidden state dimension without paying speed and memory costs.\\nâ\\x80¢ Note that the recurrent mode is more ï¬\\x82exible than the convolution mode, since the latter (3) is derived from expanding the former (2) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021). However, this would require computing and materializing the latent state â\\x84\\x8e with shape (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³, ð\\x9d\\x99½), much larger (by a factor of ð\\x9d\\x91\\x81, the SSM state dimension) than the input ð\\x9d\\x91¥ and output ð\\x9d\\x91¦ of shape (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³). 
Thus the more eï¬\\x83cient convolution mode was introduced which could bypass the state computation and materializes a convolution kernel (3a) of only (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³).\\nâ\\x80¢ Prior LTI SSMs leverage the dual recurrent-convolutional forms to increase the eï¬\\x80ective state dimension by a factor of ð\\x9d\\x91\\x81 (â\\x89\\x88 10 â\\x88\\x92 100), much larger than traditional RNNs, without eï¬\\x83ciency penalties.\\n# 3.3.2 Overview of Selective Scan: Hardware-Aware State Expansion\\nThe selection mechanism is designed to overcome the limitations of LTI models; at the same time, we therefore need to revisit the computation problem of SSMs. We address this with three classical techniques: kernel fusion, parallel scan, and recomputation. We make two main observations:\\nâ\\x80¢ The naive recurrent computation uses ð\\x9d\\x91\\x82(ð\\x9d\\x90µð\\x9d\\x90¿ð\\x9d\\x90·ð\\x9d\\x91\\x81) FLOPs while the convolutional computation uses ð\\x9d\\x91\\x82(ð\\x9d\\x90µð\\x9d\\x90¿ð\\x9d\\x90· log(ð\\x9d\\x90¿)) FLOPs, and the former has a lower constant factor. Thus for long sequences and not-too-large state dimension ð\\x9d\\x91\\x81, the recurrent mode can actually use fewer FLOPs.\\nâ\\x80¢ The two challenges are the sequential nature of recurrence, and the large memory usage. To address the latter, just like the convolutional mode, we can attempt to not actually materialize the full state â\\x84\\x8e.\\nThe main idea is to leverage properties of modern accelerators (GPUs) to materialize the state â\\x84\\x8e only in more eï¬\\x83cient levels of the memory hierarchy. In particular, most operations (except matrix multiplication) are bounded by memory bandwidth (Dao, Fu, Ermon, et al. 2022; Ivanov et al. 2021; Williams, Waterman, and Patterson 2009). This includes our scan operation, and we use kernel fusion to reduce the amount of memory IOs, leading to a signiï¬\\x81cant speedup compared to a standard implementation.\\nConcretely, instead of preparing the scan input (A, B) of size (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³, ð\\x9d\\x99½) in GPU HBM (high-bandwidth memory), we load the SSM parameters (â\\x88\\x86, A, B, C) directly from slow HBM to fast SRAM, perform the discretization and recurrence in SRAM, and then write the ï¬\\x81nal outputs of size (ð\\x9d\\x99±, ð\\x9d\\x99», ð\\x9d\\x99³) back to HBM.\\nTo avoid the sequential recurrence, we observe that despite not being linear it can still be parallelized with a work-eï¬\\x83cient parallel scan algorithm (Blelloch 1990; Martin and Cundy 2018; Smith, Warrington, and Linderman 2023).\\nFinally, we must also avoid saving the intermediate states, which are necessary for backpropagation. We carefully apply the classic technique of recomputation to reduce the memory requirements: the intermediate states are not stored but recomputed in the backward pass when the inputs are loaded from HBM to SRAM. As a result, the fused selective scan layer has the same memory requirements as an optimized transformer implementation with FlashAttention.\\nDetails of the fused kernel and recomputation are in Appendix D. The full Selective SSM layer and algorithm is illustrated in Figure 1.\\n# 3.4 A Simplified SSM Architecture\\nAs with structured SSMs, selective SSMs are standalone sequence transformations that can be ï¬\\x82exibly incorporated into neural networks. 
The H3 architecture is the basis for the most well-known SSM architectures (Section 2), which are generally comprised of a block inspired by linear attention interleaved with an MLP (multi-layer perceptron) block. We simplify this architecture by combining these two components into one, which is stacked homogenously (Figure 3). This is inspired by the gated attention unit (GAU) (Hua et al. 2022), which did something similar for attention.\\nThis architecture involves expanding the model dimension ð\\x9d\\x90· by a controllable expansion factor ð\\x9d\\x90¸. For each block, most of the parameters (3ð\\x9d\\x90¸ð\\x9d\\x90·2) are in the linear projections (2ð\\x9d\\x90¸ð\\x9d\\x90·2 for input projections, ð\\x9d\\x90¸ð\\x9d\\x90·2 for output projection) while the inner SSM contributes less. The number of SSM parameters (projections for â\\x88\\x86, B, C, and\\n7\\nLinear projection Sequence transformation Nonlinearity (activation multiplication) H3 ®@ Gated MLP â\\x80\\x94 Mamba\\n# or\\nFigure 3: (Architecture.) Our simplified block design combines the H3 block, which is the basis of most SSM architectures, with the ubiquitous MLP block of modern neural networks. Instead of interleaving these two blocks, we simply repeat the Mamba block homogenously. Compared to the H3 block, Mamba replaces the first multiplicative gate with an activation function. Compared to the MLP block, Mamba adds an SSM to the main branch. For ð\\x9d\\x9c\\x8e we use the SiLU / Swish activation (Hendrycks and Gimpel 2016; Ramachandran, Zoph, and Quoc V Le 2017).\\nthe matrix A) are much smaller in comparison. We repeat this block, interleaved with standard normalization and residual connections, to form the Mamba architecture. We always ï¬\\x81x to ð\\x9d\\x90¸ = 2 in our experiments and use two stacks of the block to match the 12ð\\x9d\\x90·2 parameters of a Transformerâ\\x80\\x99s interleaved MHA (multi-head attention) and MLP blocks. We use the SiLU / Swish activation function (Hendrycks and Gimpel 2016; Ramachandran, Zoph, and Quoc V Le 2017), motivated so that the Gated MLP becomes the popular â\\x80\\x9cSwiGLUâ\\x80\\x9d variant (Chowdhery et al. 2023; Shazeer 2020; Touvron et al. 2023). Finally, we additionally use an optional normalization layer (we choose LayerNorm (J. L. Ba, Kiros, and Hinton 2016)), motivated by RetNetâ\\x80\\x99s usage of a normalization layer in a similar location (Y. Sun et al. 2023).\\n# 3.5 Properties of Selection Mechanisms\\nThe selection mechanism is a broader concept that can be applied in diï¬\\x80erent ways, such as to more traditional RNNs or CNNs, to diï¬\\x80erent parameters (e.g. A in Algorithm 2), or using diï¬\\x80erent transformations ð\\x9d\\x91\\xa0(ð\\x9d\\x91¥).\\n# 3.5.1 Connection to Gating Mechanisms\\nWe highlight the most important connection: the classical gating mechanism of RNNs is an instance of our selection mechanism for SSMs. We note that the connection between RNN gating and the discretization of continuous-time systems is well established (Funahashi and Nakamura 1993; Tallec and Ollivier 2018). In fact, Theorem 1 is an improvement of Gu, Johnson, Goel, et al. (2021, Lemma 3.1) generalizing to the ZOH discretization and input-dependent gates (proof in Appendix C). More broadly, â\\x88\\x86 in SSMs can be seen to play a generalized role of the RNN gating mechanism. In line with prior work, we adopt the view that discretization of SSMs is the principled foundation of heuristic gating mechanisms.\\nTheorem 1. 
When ð\\x9d\\x91\\x81 = 1, A = â\\x88\\x921, B = 1, ð\\x9d\\x91\\xa0â\\x88\\x86 = ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8b(ð\\x9d\\x91¥), and ð\\x9d\\x9c\\x8fâ\\x88\\x86 = ð\\x9d\\x97\\x8cð\\x9d\\x97\\x88ð\\x9d\\x96¿ð\\x9d\\x97\\x8dð\\x9d\\x97\\x89ð\\x9d\\x97\\x85ð\\x9d\\x97\\x8eð\\x9d\\x97\\x8c, then the selective SSM recurrence (Algorithm 2) takes the form\\nð\\x9d\\x91\\x94ð\\x9d\\x91¡ = ð\\x9d\\x9c\\x8e(ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8b(ð\\x9d\\x91¥ð\\x9d\\x91¡)) â\\x84\\x8eð\\x9d\\x91¡ = (1 â\\x88\\x92 ð\\x9d\\x91\\x94ð\\x9d\\x91¡)â\\x84\\x8eð\\x9d\\x91¡â\\x88\\x921 + ð\\x9d\\x91\\x94ð\\x9d\\x91¡ð\\x9d\\x91¥ð\\x9d\\x91¡. (5)\\nAs mentioned in Section 3.2, our speciï¬\\x81c choices of ð\\x9d\\x91\\xa0â\\x88\\x86, ð\\x9d\\x9c\\x8fâ\\x88\\x86 is from this connection. In particular, note that if a given input ð\\x9d\\x91¥ð\\x9d\\x91¡ should be completely ignored (as necessary in the synthetic tasks), all ð\\x9d\\x90· channels should ignore it, and so we project the input down to 1 dimension before repeating/broadcasting with â\\x88\\x86.\\n8\\n# Interpretation of Selection Mechanisms\\nWe elaborate on two particular mechanistic eï¬\\x80ects of selection.\\nVariable Spacing. Selectivity allows ï¬\\x81ltering out irrelevant noise tokens that may occur between inputs of interest. This is exempliï¬\\x81ed by the Selective Copying task, but occurs ubiquitously in common data modalities, particularly for discrete data â\\x80\\x93 for example the presence of language ï¬\\x81llers such as â\\x80\\x9cumâ\\x80\\x9d. This property arises because the model can mechanistically ï¬\\x81lter out any particular input ð\\x9d\\x91¥ð\\x9d\\x91¡, for example in the gated RNN case (Theorem 1) when ð\\x9d\\x91\\x94ð\\x9d\\x91¡ â\\x86\\x92 0.\\nIt has been empirically observed that many sequence models do not improve with longer Filtering Context. context (F. Shi et al. 2023), despite the principle that more context should lead to strictly better performance. An explanation is that many sequence models cannot eï¬\\x80ectively ignore irrelevant context when necessary; an intuitive example are global convolutions (and general LTI models). On the other hand, selective models can simply reset their state at any time to remove extraneous history, and thus their performance in principle improves monotonicly with context length (e.g. Section 4.3.2).\\nIn settings where multiple independent sequences are stitched together, Transformers Boundary Resetting. can keep them separate by instantiating a particular attention mask, while LTI models will bleed information between the sequences. Selective SSMs can also reset their state at boundaries (e.g. â\\x88\\x86ð\\x9d\\x91¡ â\\x86\\x92 â\\x88\\x9e or Theorem 1 when ð\\x9d\\x91\\x94ð\\x9d\\x91¡ â\\x86\\x92 1). These settings may occur artiï¬\\x81cially (e.g. packing documents together to improve hardware utilization) or naturally (e.g. episode boundaries in reinforcement learning (Lu et al. 2023)).\\nAdditionally, we elaborate on eï¬\\x80ects of each selective parameter.\\nIn general, â\\x88\\x86 controls the balance between how much to focus or ignore the current input Interpretation of â\\x88\\x86. ð\\x9d\\x91¥ð\\x9d\\x91¡. It generalizes RNN gates (e.g. ð\\x9d\\x91\\x94ð\\x9d\\x91¡ in Theorem 1), mechanically, a large â\\x88\\x86 resets the state â\\x84\\x8e and focuses on the current input ð\\x9d\\x91¥, while a small â\\x88\\x86 persists the state and ignores the current input. 
SSMs (1)-(2) can be interpreted as a continuous system discretized by a timestep â\\x88\\x86, and in this context the intuition is that large â\\x88\\x86 â\\x86\\x92 â\\x88\\x9e represents the system focusing on the current input for longer (thus â\\x80\\x9cselectingâ\\x80\\x9d it and forgetting its current state) while a small â\\x88\\x86 â\\x86\\x92 0 represents a transient input that is ignored.\\nInterpretation of A. We remark that while the A parameter could also be selective, it ultimately aï¬\\x80ects the model only through its interaction with â\\x88\\x86 via A = exp(â\\x88\\x86A) (the discretization (4)). Thus selectivity in â\\x88\\x86 is enough to ensure selectivity in (A, B), and is the main source of improvement. We hypothesize that making A selective in addition to (or instead of) â\\x88\\x86 would have similar performance, and leave it out for simplicity.\\nInterpretation of B and C. As discussed in Section 3.1, the most important property of selectivity is ï¬\\x81ltering out irrelevant information so that a sequence modelâ\\x80\\x99s context can be compressed into an eï¬\\x83cient state. In an SSM, modifying B and C to be selective allows ï¬\\x81ner-grained control over whether to let an input ð\\x9d\\x91¥ð\\x9d\\x91¡ into the state â\\x84\\x8eð\\x9d\\x91¡ or the state into the output ð\\x9d\\x91¦ð\\x9d\\x91¡. These can be interpreted as allowing the model to modulate the recurrent dynamics based on content (input) and context (hidden states) respectively.\\n3.6 Additional Model Details Real vs. Complex. Most prior SSMs use complex numbers in their state â\\x84\\x8e, which is necessary for strong performance on many tasks (Gu, Goel, and Ré 2022). However, it has been empirically observed that completely real-valued SSMs seem to work ï¬\\x81ne, and possibly even better, in some settings (Ma et al. 2023). We use real values as the default, which work well for all but one of our tasks; we hypothesize that the complex-real tradeoï¬\\x80 is related to the continuous-discrete spectrum in data modalities, where complex numbers are helpful for continuous modalities (e.g. audio, video) but not discrete (e.g. text, DNA).\\n9\\nInitialization. Most prior SSMs also suggest special initializations, particularly in the complex-valued case, which can help in several settings such as low-data regimes. Our default initialization for the complex case is S4D-Lin and for the real case is S4D-Real (Gu, Gupta, et al. 2022), which is based on the HIPPO theory (Gu, Dao, et al. 2020). These deï¬\\x81ne the ð\\x9d\\x91\\x9b-th element of A as â\\x88\\x921â\\x88\\x952 + ð\\x9d\\x91\\x9bð\\x9d\\x91\\x96 and â\\x88\\x92(ð\\x9d\\x91\\x9b + 1) respectively. However, we expect many initializations to work ï¬\\x81ne, particularly in the large-data and real-valued SSM regimes; some ablations are considered in Section 4.6.\\nParameterization of â\\x88\\x86. We deï¬\\x81ned the selective adjustment to â\\x88\\x86 as ð\\x9d\\x91\\xa0â\\x88\\x86(ð\\x9d\\x91¥) = ð\\x9d\\x96¡ð\\x9d\\x97\\x8bð\\x9d\\x97\\x88ð\\x9d\\x96ºð\\x9d\\x96½ð\\x9d\\x96¼ð\\x9d\\x96ºð\\x9d\\x97\\x8cð\\x9d\\x97\\x8dð\\x9d\\x90·(ð\\x9d\\x96«ð\\x9d\\x97\\x82ð\\x9d\\x97\\x87ð\\x9d\\x96¾ð\\x9d\\x96ºð\\x9d\\x97\\x8b1(ð\\x9d\\x91¥)), which was motivated by the mechanics of â\\x88\\x86 (Section 3.5). We observe that it can be generalized from dimension 1 to a larger dimension ð\\x9d\\x9a\\x81. We set this to be a small fraction of ð\\x9d\\x99³, which uses a negligible number of parameters compared to the main Linear projections in the block. 
We additionally note that the broadcasting operation can instead be viewed as another Linear projection, initialized to a specific pattern of 1's and 0's; if this projection is trainable, this leads to the alternative s_∆(x) = Linear_D(Linear_R(x)), which can be viewed as a low-rank projection. In our experiments, the ∆ parameter (which can be viewed as a bias term) is initialized through τ_∆^{-1} following prior work on SSMs (Gu, Johnson, Timalsina, et al. 2023).

Remark 3.1. For brevity in our experimental results, we sometimes abbreviate selective SSMs as S6 models, because they are S4 models with a selection mechanism and computed with a scan.

# 4 Empirical Evaluation

In Section 4.1 we test Mamba's ability to solve the two synthetic tasks motivated in Section 3.1. We then evaluate on three domains, each evaluated on autoregressive pretraining as well as downstream tasks.

• Section 4.2: language model pretraining (scaling laws), and zero-shot downstream evaluation.
• Section 4.3: DNA sequence pretraining, and fine-tuning on a long-sequence classification task.
• Section 4.4: audio waveform pretraining, and the quality of autoregressively generated speech clips.

Finally, Section 4.5 shows Mamba's computational efficiency at both training and inference time, and Section 4.6 ablates various components of the architecture and selective SSMs.

# 4.1 Synthetic Tasks

Full experiment details for these tasks, including task details and training protocol, are in Appendix E.1.

# 4.1.1 Selective Copying

The Copying task is one of the most well-studied synthetic tasks for sequence modeling, originally designed to test the memorization abilities of recurrent models. As discussed in Section 3.1, LTI SSMs (linear recurrences and global convolutions) can easily solve this task by only keeping track of time instead of reasoning about the data; for example, by constructing a convolution kernel of exactly the right length (Figure 2). This was explicitly validated in earlier work on global convolutions (Romero et al. 2021). The Selective Copying task prevents this shortcut by randomizing the spacing between tokens. Note that this task has been introduced before as the Denoising task (Jing et al. 2019).

Note that many previous works argue that adding architecture gating (multiplicative interactions) can endow models with "data-dependence" and solve related tasks (Dao, Fu, Saab, et al. 2023; Poli et al. 2023). However, we find this explanation insufficient intuitively because such gating does not interact along the sequence axis, and cannot affect the spacing between tokens. In particular, architecture gating is not an instance of a selection mechanism (Appendix A).
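For intuition on the task itself, the following is a minimal sketch of how a Selective Copying example can be generated; it is a simplified version under assumed parameters (sequence length, vocabulary, and the use of token 0 as the filler), not the exact protocol of Appendix E.1.

```python
import numpy as np

def selective_copying_example(seq_len=64, n_memorize=8, vocab=10, rng=None):
    """Place n_memorize content tokens at random positions in a noise-filled
    prefix; the target is to reproduce the content tokens in order at the end.
    Token 0 serves as the noise/filler token (an assumption of this sketch)."""
    rng = rng or np.random.default_rng(0)
    content = rng.integers(1, vocab, size=n_memorize)             # tokens to copy
    positions = np.sort(rng.choice(seq_len, n_memorize, replace=False))
    inputs = np.zeros(seq_len, dtype=int)                         # noise everywhere
    inputs[positions] = content                                   # randomized spacing
    targets = content                                             # copy in order
    return inputs, targets

inputs, targets = selective_copying_example()
print(inputs)   # content tokens separated by variable-length runs of noise
print(targets)  # the model must output these, ignoring the noise tokens
```

Because the spacing between content tokens varies per example, a time-only strategy (e.g. a fixed convolution kernel) no longer suffices; the model must filter tokens based on their content.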
Table 1 confirms that gated architectures such as H3 and Mamba only partially improve performance, while the selection mechanism (modifying S4 to S6) easily solves this task, particularly when combined with these more powerful architectures.

Table 1: (Selective Copying.) Accuracy for combinations of architectures and inner sequence layers.

| Model | Arch. | Layer | Acc. |
|---|---|---|---|
| S4 | No gate | S4 | 18.3 |
| - | No gate | S6 | 97.0 |
| H3 | H3 | S4 | 57.0 |
| Hyena | H3 | Hyena | 30.1 |
| - | H3 | S6 | 99.7 |
| - | Mamba | S4 | 56.4 |
| - | Mamba | Hyena | 28.4 |
| Mamba | Mamba | S6 | 99.8 |

[Plot for Table 2: extrapolation accuracy versus test sequence length (log scale) for MHA-Absolute, MHA-RoPE, MHA-xPos, H3, Hyena, and Mamba, with a random-guessing baseline.]

Table 2: (Induction Heads.) Models are trained on sequence length 2^8 = 256, and tested on increasing sequence lengths from 2^6 = 64 up to 2^20 = 1048576. Full numbers in Table 11.

# 4.1.2 Induction Heads

Induction heads (Olsson et al. 2022) is a simple task from the mechanistic interpretability lens (Elhage et al. 2021) that is surprisingly predictive of the in-context learning ability of LLMs. It requires models to perform associative recall and copy: for example, if the model has seen a bigram such as "Harry Potter" in the sequence, then the next time "Harry" appears in the same sequence, the model should be able to predict "Potter" by copying from history.

Dataset. We train a 2-layer model on the induction heads task at sequence length 256, with a vocab size of 16, which is comparable to prior work on this task (Dao, Fu, Saab, et al. 2023) but with longer sequences. We additionally investigate generalization and extrapolation abilities by evaluating on a range of sequence lengths from 2^6 = 64 up to 2^20 = 1048576 at test time.

Models. Following established work on induction heads, we use 2-layer models, which allows attention to mechanistically solve the induction heads task (Olsson et al. 2022). We test both multi-head attention (8 heads, with various positional encodings) and SSM variants. We use a model dimension D of 64 for Mamba and 128 for the other models.

Results. Table 2 shows that Mamba (or more precisely, its selective SSM layer) has the ability to solve the task perfectly because of its ability to selectively remember the relevant token while ignoring everything else in between. It generalizes perfectly to million-length sequences, or 4000× longer than it saw during training, while no other method goes beyond 2×.

Among positional encoding variants for attention models, xPos (which was designed for length extrapolation) is slightly better than the others; also note that all attention models were only tested up to sequence length 2^14 = 16384 due to memory limitations. Among the other SSMs, H3 and Hyena are similar, contrary to the findings in Poli et al. (2023).

# 4.2 Language Modeling

We evaluate the Mamba architecture on standard autoregressive language modeling against other architectures, on both pretraining metrics (perplexity) and zero-shot evaluations. We set the model sizes (depth and width) to mirror GPT3 specifications. We use the Pile dataset (L. Gao, Biderman, et al. 2020), and follow the training recipe described in Brown et al. (2020).
All training details are in Appendix E.2.

# 4.2.1 Scaling Laws

For baselines, we compare against the standard Transformer architecture (GPT3 architecture), as well as the strongest Transformer recipe we know of (here referred to as Transformer++), based on the PaLM and LLaMa architectures (e.g. rotary embedding, SwiGLU MLP, RMSNorm instead of LayerNorm, no linear bias, and higher learning rates). We also compare against other recent subquadratic architectures (Figure 4). All model details are in Appendix E.2.

[Figure 4 panels: "Scaling Laws on The Pile (Sequence Length 2048)" and "Scaling Laws on The Pile (Sequence Length 8192)", plotting perplexity (log scale) against FLOPs (log scale) for Hyena, RWKV, Transformer, RetNet, H3, Transformer++, and Mamba.]

Figure 4: (Scaling Laws.) Models of size ≈ 125M to ≈ 1.3B parameters, trained on the Pile. Mamba scales better than all other attention-free models and is the first to match the performance of a very strong "Transformer++" recipe that has now become standard, particularly as the sequence length grows.

Figure 4 shows scaling laws under the standard Chinchilla (Hoffmann et al. 2022) protocol, on models from ≈ 125M to ≈ 1.3B parameters. Mamba is the first attention-free model to match the performance of a very strong Transformer recipe (Transformer++) that has now become standard, particularly as the sequence length grows. We note that full results on context length 8k are missing for the RWKV and RetNet baselines, prior strong recurrent models that can also be interpreted as SSMs, due to a lack of efficient implementation leading to out-of-memory or unrealistic computation requirements.

# 4.2.2 Downstream Evaluations

Table 3 shows the performance of Mamba on a range of popular downstream zero-shot evaluation tasks. We compare against the most well-known open source models at these sizes, most importantly Pythia (Biderman et al. 2023) and RWKV (B. Peng et al. 2023), which were trained with the same tokenizer, dataset, and training length (300B tokens) as our models. (Note that Mamba and Pythia are trained with context length 2048, while RWKV was trained with context length 1024.)

# 4.3 DNA Modeling

Motivated by the success of large language models, there has been recent exploration into using the foundation model paradigm for genomics. DNA has been likened to language in that it consists of sequences of discrete tokens with a finite vocabulary. It is also known for requiring long-range dependencies to model (Avsec et al. 2021). We investigate Mamba as an FM backbone for pretraining and fine-tuning in the same setting as recent works on long-sequence models for DNA (Nguyen, Poli, et al. 2023). In particular, we focus on two explorations of scaling laws across model size and sequence length (Figure 5), and a difficult downstream synthetic classification task requiring long context (Figure 6).

For pretraining, we largely follow a standard causal language modeling (next token prediction) setup for the training and model details (see also Appendix E.2). For the dataset, we largely follow the setup of HyenaDNA (Nguyen, Poli, et al. 2023),
which uses the HG38 dataset for pretraining, consisting of a single human genome with about 4.5 billion tokens (DNA base pairs) in the training split.

Table 3: (Zero-shot Evaluations.) Best results for each size in bold. We compare against open source LMs with various tokenizers, trained for up to 300B tokens. Pile refers to the validation split, comparing only against models trained on the same dataset and tokenizer (GPT-NeoX-20B). For each model size, Mamba is best-in-class on every single evaluation result, and generally matches baselines at twice the model size.

| Model | Token. | Pile ppl ↓ | LAMBADA ppl ↓ | LAMBADA acc ↑ | HellaSwag acc ↑ | PIQA acc ↑ | Arc-E acc ↑ | Arc-C acc ↑ | WinoGrande acc ↑ | Average acc ↑ |
|---|---|---|---|---|---|---|---|---|---|---|
| Hybrid H3-130M | GPT2 | — | 89.48 | 25.77 | 31.7 | 64.2 | 44.4 | 24.2 | 50.6 | 40.1 |
| Pythia-160M | NeoX | 29.64 | 38.10 | 33.0 | 30.2 | 61.4 | 43.2 | 24.1 | 51.9 | 40.6 |
| Mamba-130M | NeoX | 10.56 | 16.07 | 44.3 | 35.3 | 64.5 | 48.0 | 24.3 | 51.9 | 44.7 |
| Hybrid H3-360M | GPT2 | — | 12.58 | 48.0 | 41.5 | 68.1 | 51.4 | 24.7 | 54.1 | 48.0 |
| Pythia-410M | NeoX | 9.95 | 10.84 | 51.4 | 40.6 | 66.9 | 52.1 | 24.6 | 53.8 | 48.2 |
| Mamba-370M | NeoX | 8.28 | 8.14 | 55.6 | 46.5 | 69.5 | 55.1 | 28.0 | 55.3 | 50.0 |
| Pythia-1B | NeoX | 7.82 | 7.92 | 56.1 | 47.2 | 70.7 | 57.0 | 27.1 | 53.5 | 51.9 |
| Mamba-790M | NeoX | 7.33 | 6.02 | 62.7 | 55.1 | 72.1 | 61.2 | 29.5 | 56.1 | 57.1 |
| GPT-Neo 1.3B | GPT2 | — | 7.50 | 57.2 | 48.9 | 71.1 | 56.2 | 25.9 | 54.9 | 52.4 |
| Hybrid H3-1.3B | GPT2 | — | 11.25 | 49.6 | 52.6 | 71.3 | 59.2 | 28.1 | 56.9 | 53.0 |
| OPT-1.3B | OPT | — | 6.64 | 58.0 | 53.7 | 72.4 | 56.7 | 29.6 | 59.5 | 55.0 |
| Pythia-1.4B | NeoX | 7.51 | 6.08 | 61.7 | 52.1 | 71.0 | 60.5 | 28.5 | 57.2 | 55.2 |
| RWKV-1.5B | NeoX | 7.70 | 7.04 | 56.4 | 52.5 | 72.4 | 60.5 | 29.4 | 54.6 | 54.3 |
| Mamba-1.4B | NeoX | 6.80 | 5.04 | 64.9 | 59.1 | 74.2 | 65.5 | 32.8 | 61.5 | 59.7 |
| GPT-Neo 2.7B | GPT2 | — | 5.63 | 62.2 | 55.8 | 72.1 | 61.1 | 30.2 | 57.6 | 56.5 |
| Hybrid H3-2.7B | GPT2 | — | 7.92 | 55.7 | 59.7 | 73.3 | 65.6 | 32.3 | 61.4 | 58.0 |
| OPT-2.7B | OPT | — | 5.12 | 63.6 | 60.6 | 74.8 | 60.8 | 31.3 | 61.0 | 58.7 |
| Pythia-2.8B | NeoX | 6.73 | 5.04 | 64.7 | 59.3 | 74.0 | 64.1 | 32.9 | 59.7 | 59.1 |
| RWKV-3B | NeoX | 7.00 | 5.24 | 63.9 | 59.6 | 73.7 | 67.8 | 33.1 | 59.6 | 59.6 |
| Mamba-2.8B | NeoX | 6.22 | 4.23 | 69.2 | 66.1 | 75.2 | 69.7 | 36.3 | 63.5 | 63.3 |
| GPT-J-6B | GPT2 | — | 4.10 | 68.3 | 66.3 | 75.4 | 67.0 | 36.6 | 64.1 | 63.0 |
| OPT-6.7B | OPT | — | 4.25 | 67.7 | 67.2 | 76.3 | 65.6 | 34.9 | 65.5 | 62.9 |
| Pythia-6.9B | NeoX | 6.51 | 4.45 | 67.1 | 64.0 | 75.2 | 67.3 | 35.5 | 61.3 | 61.7 |
| RWKV-7.4B | NeoX | 6.31 | 4.38 | 67.2 | 65.5 | 76.1 | 67.8 | 37.5 | 61.0 | 62.5 |

# 4.3.1 Scaling: Model Size

In this experiment, we investigate the scaling properties of genomics foundation models with various model backbones (Figure 5, Left).

Training. To advantage the baselines, we train on a short sequence length of 1024; as shown in Section 4.3.2, we expect results to favor Mamba even more at longer sequence lengths. We fix a global batch size of 1024, for a total of 2^20 ≈ 1M tokens per batch. Models were trained for 10K gradient steps for a total of 10B tokens.

Results. Figure 5 (Left) shows that Mamba's pretraining perplexity improves smoothly with model size, and that Mamba scales better than both HyenaDNA and Transformer++. For example, at the largest model size of ≈ 40M parameters, the curve shows that Mamba can match the Transformer++ and HyenaDNA models with roughly 3× to 4× fewer parameters.

# 4.3.2 Scaling: Context Length

In the next DNA experiment, we investigate the scaling properties of models with respect to sequence length. We only compare the HyenaDNA and Mamba models, as quadratic attention becomes prohibitively expensive at longer sequence lengths.
We pretrain models on sequence lengths 2^10 = 1024, 2^12 = 4096, 2^14 = 16384, 2^16 = 65536, 2^18 = 262144, and 2^20 = 1048576. We fix a model size of 6 layers by width 128 (about 1.3M-1.4M parameters). Models were trained for 20K gradient steps for a total of ≈ 330B tokens. The longer sequence lengths used sequence length warmup similar to (Nguyen, Poli, et al. 2023).

Results. Figure 5 (Right) shows that Mamba is able to make use of longer context even up to extremely long sequences of length 1M, and its pretraining perplexity improves as the context increases. On the other hand, the HyenaDNA model gets worse with sequence length. This is intuitive from the discussion in Section 3.5 on properties of the selection mechanism. In particular, LTI models cannot selectively ignore information; from a convolutional perspective, a very long convolution kernel is aggregating all information across a long sequence, which may be very noisy. Note that while HyenaDNA claims to improve with longer context, their results do not control for computation time.

[Figure 5 panels: "Scaling Laws on the Human Genome (HG38)", perplexity against parameters (log scale) for HyenaDNA, Mamba, and Transformer++; and "Scaling Laws - Sequence Length (HG38)", perplexity against sequence length for HyenaDNA 1.4M, Mamba 1.4M, and Mamba 7M.]

Figure 5: (DNA Scaling Laws.) Pretraining on the HG38 (human genome) dataset. (Left) Fixing short context length 2^10 = 1024 and increasing size from ≈ 200K to ≈ 40M parameters, Mamba scales better than baselines. (Right) Fixing model size and increasing sequence lengths while keeping tokens/batch and total training tokens fixed. Unlike baselines, the selection mechanism of Mamba facilitates better performance with increasing context length.

[Figure 6 panel: "Finetuning Accuracy (Species DNA Classification)", accuracy against sequence length for HyenaDNA 1.4M, Mamba 1.4M, Mamba 7M, and a random baseline. Figure 7 panel: "Scaling Laws - Sequence Length (YouTubeMix)", bits per byte against sequence length for SA+FFN and Mamba.]

Figure 6: (Great Apes DNA Classification.) Accuracy after fine-tuning on sequences of length 2^10 = 1024 up to 2^20 = 1048576 using pretrained models of the same context length. Numerical results in Table 13.

Figure 7: (Audio Pretraining.) Mamba improves performance over prior state-of-the-art (SaShiMi) in autoregressive audio modeling, while improving up to minute-long context or million-length sequences (controlling for computation).

# 4.3.3 Synthetic Species Classification

We evaluate models on a downstream task of classifying between 5 different species by randomly sampling a contiguous segment of their DNA. This task is adapted from HyenaDNA, which used the species {human, lemur, mouse, pig, hippo}.
We modify the task to be significantly more challenging by classifying between the five great apes species {human, chimpanzee, gorilla, orangutan, bonobo}, which are known to share 99% of their DNA.

# 4.4 Audio Modeling and Generation

For the audio waveform modality, we compare primarily to the SaShiMi architecture and training protocols (Goel et al. 2022). This model comprises

1. a U-Net backbone with two stages of pooling by a factor p that doubles the model dimension D per stage,
2. alternating S4 and MLP blocks in each stage.

We consider replacing the S4+MLP blocks with Mamba blocks. Experiment details are in Appendix E.4.

# 4.4.1 Long-Context Autoregressive Pretraining

We evaluate pretraining quality (autoregressive next-sample prediction) on YouTubeMix (DeepSound 2017), a standard piano music dataset used by prior work consisting of 4 hours of solo piano music, sampled at a rate of 16000 Hz. Pretraining details largely follow the standard language modeling setup (Section 4.2). Figure 7 evaluates the effect of increasing training sequence lengths from 2^13 = 8192 to 2^20 ≈ 10^6, while keeping computation fixed. (There are some slight edge cases to the way the data is curated, which may lead to kinks in the scaling curves. For example, only minute-long clips were available, so the maximum sequence length is actually bounded by 60 s ⋅ 16000 Hz = 960000.)

Both Mamba and the SaShiMi (S4+MLP) baseline improve consistently with longer context lengths; Mamba is better throughout, and the gap widens at longer lengths. The main metric is bits per byte (BPB), which is a constant factor log(2) of the standard negative log-likelihood (NLL) loss for pretraining other modalities.

We note one important detail: this is the only experiment in this paper in which we switched from the real parameterization to complex (Section 3.6). We show additional ablations in Appendix E.4.

# 4.4.2 Autoregressive Speech Generation

SC09 is a benchmark speech generation dataset (Donahue, McAuley, and Puckette 2019; Warden 2018), consisting of 1-second clips sampled at 16000 Hz of the digits "zero" through "nine" with highly variable characteristics. We largely follow the autoregressive training setup and generation protocol of Goel et al. (2022).

Table 4 shows automated metrics of the Mamba-UNet model compared to a variety of baselines from Goel et al. (2022): WaveNet (Oord et al. 2016), SampleRNN (Mehri et al. 2017), WaveGAN (Donahue, McAuley, and Puckette 2019), DiffWave (Z. Kong et al. 2021), and SaShiMi. A small Mamba model outperforms the state-of-the-art (and much larger) GAN- and diffusion-based models. A larger model parameter-matched to the baselines further improves on fidelity metrics dramatically.

Table 5 takes the small Mamba model and investigates combinations of different architectures for the outer stages and center stage. It shows that Mamba is consistently better than S4+MLP in the outer blocks, and Mamba > S4+MLP > MHA+MLP in the center blocks.

Table 4: (SC09) Automated metrics for unconditional generation on a challenging dataset of fixed-length speech clips. (Top to Bottom) Autoregressive baselines, non-autoregressive baselines, Mamba, and dataset metrics.

| Model | Params | NLL ↓ | FID ↓ | IS ↑ | mIS ↑ | AM ↓ |
|---|---|---|---|---|---|---|
| SampleRNN | 35.0M | 2.042 | 8.96 | 1.71 | 3.02 | 1.76 |
| WaveNet | 4.2M | 1.925 | 5.08 | 2.27 | 5.80 | 1.47 |
| SaShiMi | 5.8M | 1.873 | 1.99 | 5.13 | 42.57 | 0.74 |
| WaveGAN | 19.1M | - | 2.03 | 4.90 | 36.10 | 0.80 |
| DiffWave | 24.1M | - | 1.92 | 5.26 | 51.21 | 0.68 |
| + SaShiMi | 23.0M | - | 1.42 | 5.94 | 69.17 | 0.59 |
| Mamba | 6.1M | 1.852 | 0.94 | 6.26 | 88.54 | 0.52 |
| Mamba | 24.3M | 1.860 | 0.67 | 7.33 | 144.9 | 0.36 |
| Train | - | - | 0.00 | 8.56 | 292.5 | 0.16 |
| Test | - | - | 0.02 | 8.33 | 257.6 | 0.19 |
Table 5: (SC09 Model Ablations) Models with 6M parameters. In SaShiMi's U-Net backbone, there are 8 center blocks operating on sequence length 1000, sandwiched on each side by 8 outer blocks on sequence length 4000, sandwiched by 8 outer blocks on sequence length 16000 (40 blocks total). The architecture of the 8 center blocks is ablated independently of the rest. Note that Transformers (MHA+MLP) were not tested in the more important outer blocks because of efficiency constraints.

| Outer | Center | NLL ↓ | FID ↓ | IS ↑ | mIS ↑ | AM ↓ |
|---|---|---|---|---|---|---|
| S4+MLP | MHA+MLP | 1.859 | 1.45 | 5.06 | 47.03 | 0.70 |
| S4+MLP | S4+MLP | 1.867 | 1.43 | 5.42 | 53.54 | 0.65 |
| S4+MLP | Mamba | 1.859 | 1.42 | 5.71 | 56.51 | 0.64 |
| Mamba | MHA+MLP | 1.850 | 1.37 | 5.63 | 58.23 | 0.62 |
| Mamba | S4+MLP | 1.853 | 1.07 | 6.05 | 73.34 | 0.55 |
| Mamba | Mamba | 1.852 | 0.94 | 6.26 | 88.54 | 0.52 |

# 4.5 Speed and Memory Benchmarks

We benchmark the speed of the SSM scan operation (state expansion N = 16), as well as the end-to-end inference throughput of Mamba, in Figure 8. Our efficient SSM scan is faster than the best attention implementation that we know of (FlashAttention-2 (Dao 2023)) beyond sequence length 2K, and up to 20-40× faster than a standard scan implementation in PyTorch. Mamba achieves 4-5× higher inference throughput than a Transformer of similar size, since without the KV cache it can use much higher batch sizes. For example, a Mamba-6.9B (untrained) would have higher inference throughput than a 5× smaller Transformer-1.3B. Details are in Appendix E.5, which additionally includes a benchmark of memory consumption.

[Figure 8 panels: "Scan vs Convolution vs Attention time (A100 80GB PCIe)", runtime against sequence length (512 to 512K) for FlashAttention-2, convolution, a standard PyTorch scan, and our scan; and "Inference throughput on A100 80GB (prompt length 2048)", throughput against batch size (1 to 128) for Mamba 6.9B and Transformer 6.7B models.]

Figure 8: (Efficiency Benchmarks.) (Left) Training: our efficient scan is 40× faster than a standard implementation. (Right) Inference: as a recurrent model, Mamba can achieve 5× higher throughput than Transformers.

# 4.6 Model Ablations

We perform a series of detailed ablations on components of our model, focusing on the setting of language modeling with size ≈ 350M models at Chinchilla token counts (same setting as Figure 4).

# 4.6.1 Architecture

Table 6 investigates the effects of the architecture (block) and its inner SSM layer (Figure 3).
We find that:

• Among previous non-selective (LTI) SSMs, which are equivalent to global convolutions, performance is very similar.

• Replacing the complex-valued S4 variant from previous work with a real-valued one does not affect performance much, suggesting that (at least for LM) real-valued SSMs may be a better choice when accounting for hardware efficiency.

• Replacing any of these with a selective SSM (S6) significantly improves performance, validating the motivation of Section 3.

• The Mamba architecture performs similarly to the H3 architecture (and seems slightly better when using a selective layer).

We also investigate interleaving the Mamba block with other blocks such as MLP (a traditional architecture) and MHA (a hybrid attention architecture) in Appendix E.2.2.

# 4.6.2 Selective SSM

Table 7 ablates the selective SSM layer by considering different combinations of selective ∆, B, and C parameters (Algorithm 2), showing that ∆ is the most important parameter due to its connection to RNN gating (Theorem 1).

Table 8 considers different initializations of the SSM, which have been shown to make a large difference in some data modalities and settings (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022). On language modeling, we find that simpler real-valued diagonal initializations (S4D-Real, row 3) instead of more standard complex-valued parameterizations (S4D-Lin, row 1) perform better. Random initializations also work well, consistent with findings from prior work (Mehta et al. 2023).

Table 9 and Table 10 consider varying the dimension of the ∆ and (B, C) projections respectively. Changing them from static to selective provides the most benefit, while increasing the dimensions further generally improves performance modestly with a small increase in parameter count.

Of particular note is the dramatic improvement of the selective SSM when the state size N is increased, with over a 1.0 perplexity improvement for a cost of only 1% additional parameters. This validates our core motivation in Sections 3.1 and 3.3.

Table 6: (Ablations: Architecture and SSM layer.) The Mamba block performs similarly to H3 while being simpler. In the inner layer, there is little difference among different parameterizations of LTI models, while selective SSMs (S6) provide a large improvement. More specifically, the S4 (real) variant is S4D-Real and the S4 (complex) variant is S4D-Lin.

| Model | Arch. | SSM Layer | Perplexity |
|---|---|---|---|
| Hyena | H3 | Hyena | 10.24 |
| H3 | H3 | S4 (complex) | 10.30 |
| - | H3 | S4 (real) | 10.34 |
| - | H3 | S6 | 8.95 |
| - | Mamba | Hyena | 10.75 |
| - | Mamba | S4 (complex) | 10.54 |
| - | Mamba | S4 (real) | 10.56 |
| Mamba | Mamba | S6 | 8.69 |

Table 7: (Ablations: Selective parameters.) ∆ is the most important parameter (Theorem 1), but using multiple selective parameters together synergizes.
[Table 7 body: perplexity for combinations of selective ∆, B, and C, ranging from 10.93 with no selective parameters, through 10.15, 9.98, and 9.81 for partial combinations, to 8.71 with all three selective.]

Table 8: (Ablations: Parameterization of A.) The more standard initializations based on S4D-Lin (Gu, Gupta, et al. 2022) perform worse than S4D-Real or a random initialization, when the SSM is selective.

| A_n Initialization | Field | Perplexity |
|---|---|---|
| A_n = −1/2 + n i | Complex | 9.16 |
| A_n = −1/2 | Real | 8.85 |
| A_n = −(n + 1) | Real | 8.71 |
| A_n ∼ exp(𝒩(0, 1)) | Real | 8.71 |

Table 9: (Ablations: Expressivity of ∆.) The selection mechanism of ∆ constructs it with a projection of the input. Projecting it even to dim. 1 provides a large increase in performance; increasing it further provides further improvements at the cost of a modest increase in parameters. State size fixed to N = 16.

| Size of ∆ proj. | Params (M) | Perplexity |
|---|---|---|
| - | 358.9 | 9.12 |
| 1 | 359.1 | 8.97 |
| 2 | 359.3 | 8.97 |
| 4 | 359.7 | 8.91 |
| 8 | 360.5 | 8.83 |
| 16 | 362.1 | 8.84 |
| 32 | 365.2 | 8.80 |
| 64 | 371.5 | 8.71 |

Table 10: (Ablations: SSM state dimension.) (Top) Constant B and C. (Bottom) Selective B and C. Increasing the SSM state dimension N, which can be viewed as an expansion factor on the dimension of the recurrent state, can significantly improve performance for a negligible cost in parameters/FLOPs, but only when B and C are also selective. Size of ∆ projection fixed to 64.

| B and C | State dimension N | Params (M) | Perplexity |
|---|---|---|---|
| Constant | 1 | 367.1 | 9.88 |
| Constant | 2 | 367.4 | 9.86 |
| Constant | 4 | 368.0 | 9.82 |
| Constant | 8 | 369.1 | 9.82 |
| Constant | 16 | 371.5 | 9.81 |
| Selective | 1 | 367.1 | 9.73 |
| Selective | 2 | 367.4 | 9.40 |
| Selective | 4 | 368.0 | 9.09 |
| Selective | 8 | 369.1 | 8.84 |
| Selective | 16 | 371.5 | 8.71 |

# 5 Discussion

We discuss related work, limitations, and some future directions.

Related Work. Appendix A discusses how the selection mechanism relates to similar concepts. Appendix B has an extended related work of SSMs and other related models.

No Free Lunch: Continuous-Discrete Spectrum. Structured SSMs were originally defined as discretizations of continuous systems (1), and have had a strong inductive bias toward continuous-time data modalities such as perceptual signals (e.g. audio, video). As discussed in Sections 3.1 and 3.5, the selection mechanism overcomes their weaknesses on discrete modalities such as text and DNA; but this conversely can impede their performance on data that LTI SSMs excel on. Our ablations on audio waveforms examine this tradeoff in more detail.

Downstream Affordances. Transformer-based foundation models (particularly LLMs) have a rich ecosystem of properties and modes of interaction with pretrained models, such as fine-tuning, adaptation, prompting, in-context learning, instruction tuning, RLHF, quantization, and so on. We are particularly interested in whether Transformer alternatives such as SSMs have similar properties and affordances.

Scaling. Our empirical evaluation is limited to small model sizes, below the threshold of most strong open source LLMs (e.g. Llama (Touvron et al. 2023)) as well as other recurrent models such as RWKV (B. Peng et al. 2023) and RetNet (Y. Sun et al. 2023), which have been evaluated at the 7B parameter scale and beyond. It remains to assess whether Mamba still compares favorably at these larger sizes.
We also note that scaling SSMs may involve further engineering challenges and adjustments to the model that are not discussed in this paper.\\n# 6 Conclusion\\nWe introduce a selection mechanism to structured state space models, allowing them to perform context-dependent reasoning while scaling linearly in sequence length. When incorporated into a simple attention-free architecture, Mamba achieves state-of-the-art results on a diverse set of domains, where it matches or exceeds the performance of strong Transformer models. We are excited about the broad applications of selective state space models to build foundation models for diï¬\\x80erent domains, especially in emerging modalities requiring long context such as genomics, audio, and video. Our results suggest that Mamba is a strong candidate to be a general sequence model backbone.\\n# Acknowledgments\\nWe thank Karan Goel, Arjun Desai, and Kush Bhatia for helpful feedback on the draft.\\n# References\\n[1] Martin Arjovsky, Amar Shah, and Yoshua Bengio. â\\x80\\x9cUnitary Evolution Recurrent Neural Networksâ\\x80\\x9d. In: The\\nInternational Conference on Machine Learning (ICML). 2016, pp. 1120â\\x80\\x931128. iga Avsec, Vikram Agarwal, Daniel Visentin, Joseph R Ledsam, Agnieszka Grabska-Barwinska, Kyle R Taylor, Yannis Assael, John Jumper, Pushmeet Kohli, and David R Kelley. â\\x80\\x9cEffective Gene Expression Prediction from Sequence by Integrating Long-range Interactionsâ\\x80\\x9d. In: Nature Methods 18.10 (2021), pp. 1196â\\x80\\x931203. Jimmy Ba, Geoffrey E Hinton, Volodymyr Mnih, Joel Z Leibo, and Catalin Ionescu. â\\x80\\x9cUsing Fast Weights to Attend to the Recent Pastâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS) 29 (2016). Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. â\\x80\\x9cLayer Normalizationâ\\x80\\x9d. In: arXiv preprint arXiv:1607.06450 (2016).\\n[2]\\n[3]\\n[4]\\n[5] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. â\\x80\\x9cNeural Machine Translation by Jointly Learning to Align and Translateâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2015.\\n[6] David Balduzzi and Muhammad Ghifary. â\\x80\\x9cStrongly-typed Recurrent Neural Networksâ\\x80\\x9d. In: International Con- ference on Machine Learning. PMLR. 2016, pp. 1292â\\x80\\x931300.\\n[7] Stella Biderman, Hailey Schoelkopf, Quentin Gregory Anthony, Herbie Bradley, Kyle OBrien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, et al. â\\x80\\x9cPythia: A Suite for Analyzing Large Language Models across Training and Scalingâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 2397â\\x80\\x932430.\\n[8] Yonatan Bisk, Rowan Zellers, Jianfeng Gao, Yejin Choi, et al. â\\x80\\x9cPIQA: Reasoning about Physical Commonsense in Natural Languageâ\\x80\\x9d. In: Proceedings of the AAAI conference on Artificial Intelligence. Vol. 34. 05. 2020, pp. 7432â\\x80\\x93 7439.\\n[9] Guy E Blelloch. â\\x80\\x9cPrefix Sums and Their Applicationsâ\\x80\\x9d. In: (1990). [10]\\nJames Bradbury, Stephen Merity, Caiming Xiong, and Richard Socher. â\\x80\\x9cQuasi-recurrent Neural Networksâ\\x80\\x9d. In: arXiv preprint arXiv:1611.01576 (2016).\\n18\\n[11] Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Nee- lakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. â\\x80\\x9cLanguage Models are Few-shot Learnersâ\\x80\\x9d. 
In: Advances in Neural Information Processing Systems (NeurIPS) 33 (2020), pp. 1877â\\x80\\x931901.\\n[12] Aydar Bulatov, Yuri Kuratov, and Mikhail S Burtsev. â\\x80\\x9cScaling Transformer to 1M tokens and Beyond with RMTâ\\x80\\x9d. In: arXiv preprint arXiv:2304.11062 (2023).\\n[13] Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. â\\x80\\x9cGenerating Long Sequences with Sparse Trans- formersâ\\x80\\x9d. In: arXiv preprint arXiv:1904.10509 (2019).\\n[14] Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Pe- ter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, et al. â\\x80\\x9cRethinking Attention with Performersâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2021.\\n[15] Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. â\\x80\\x9cPaLM: Scaling Language Modeling with Pathwaysâ\\x80\\x9d. In: Journal of Machine Learning Research 24.240 (2023), pp. 1â\\x80\\x93113. url: http://jmlr.org/ papers/v24/22-1144.html. Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. â\\x80\\x9cEmpirical Evaluation of Gated Re- current Neural Networks on Sequence Modelingâ\\x80\\x9d. In: arXiv preprint arXiv:1412.3555 (2014).\\n[17] Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. â\\x80\\x9cThink you have Solved Question Answering? Try ARC, the AI2 Reasoning Challengeâ\\x80\\x9d. In: arXiv preprint arXiv:1803.05457 (2018).\\n[18] Tri Dao. â\\x80\\x9cFlashAttention-2: Faster Attention with Better Parallelism and Work Partitioningâ\\x80\\x9d. In: (2023). [19] Tri Dao, Daniel Y Fu, Stefano Ermon, Atri Rudra, and Christopher Ré. â\\x80\\x9cFlashAttention: Fast and Memory- Efficient Exact Attention with IO-Awarenessâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS). 2022.\\n[20] Tri Dao, Daniel Y Fu, Khaled K Saab, Armin W Thomas, Atri Rudra, and Christopher Ré. â\\x80\\x9cHungry Hungry Hippos: Towards Language Modeling with State Space Modelsâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2023.\\n[21] Yann N Dauphin, Angela Fan, Michael Auli, and David Grangier. â\\x80\\x9cLanguage Modeling with Gated Convolu- tional Networksâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). PMLR. 2017, pp. 933â\\x80\\x93941.\\n# [22] DeepSound. SampleRNN. https://github.com/deepsound-project/samplernn-pytorch. 2017. [23]\\nJiayu Ding, Shuming Ma, Li Dong, Xingxing Zhang, Shaohan Huang, Wenhui Wang, and Furu Wei. â\\x80\\x9cLongNet: Scaling Transformers to 1,000,000,000 Tokensâ\\x80\\x9d. In: arXiv preprint arXiv:2307.02486 (2023).\\n[24] Chris Donahue, Julian McAuley, and Miller Puckette. â\\x80\\x9cAdversarial Audio Synthesisâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2019.\\n[25] Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. â\\x80\\x9cAn Image is Worth 16x16 Words: Transformers for Image Recognition at Scaleâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 
2020.\\n[26] Nelson Elhage, Neel Nanda, Catherine Olsson, Tom Henighan, Nicholas Joseph, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Nova DasSarma, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, and Chris Olah. â\\x80\\x9cA Mathematical Framework for Transformer Circuitsâ\\x80\\x9d. In: Transformer Circuits Thread (2021). https://transformer-circuits.pub/2021/framework/index.html. [27] Mahan Fathi, Jonathan Pilault, Pierre-Luc Bacon, Christopher Pal, Orhan Firat, and Ross Goroshin. â\\x80\\x9cBlock-\\nState Transformerâ\\x80\\x9d. In: arXiv preprint arXiv:2306.09539 (2023).\\n[28] Yassir Fathullah, Chunyang Wu, Yuan Shangguan, Junteng Jia, Wenhan Xiong, Jay Mahadeokar, Chunxi Liu, Yangyang Shi, Ozlem Kalinli, Mike Seltzer, et al. â\\x80\\x9cMulti-Head State Space Model for Sequence Modelingâ\\x80\\x9d. In: INTERSPEECH. 2023.\\n[29] Karl J Friston, Lee Harrison, and Will Penny. â\\x80\\x9cDynamic Causal Modellingâ\\x80\\x9d. In: Neuroimage 19.4 (2003), pp. 1273â\\x80\\x93 1302.\\n[30] Daniel Y Fu, Elliot L Epstein, Eric Nguyen, Armin W Thomas, Michael Zhang, Tri Dao, Atri Rudra, and Christo- pher Ré. â\\x80\\x9cSimple Hardware-efficient Long Convolutions for Sequence Modelingâ\\x80\\x9d. In: The International Confer- ence on Machine Learning (ICML) (2023).\\n[31] Ken-ichi Funahashi and Yuichi Nakamura. â\\x80\\x9cApproximation of Dynamical Systems by Continuous Time Recur- rent Neural Networksâ\\x80\\x9d. In: Neural Networks 6.6 (1993), pp. 801â\\x80\\x93806.\\n19\\n[32] Leo Gao, Stella Biderman, Sid Black, Laurence Golding, Travis Hoppe, Charles Foster, Jason Phang, Horace He, Anish Thite, Noa Nabeshima, Shawn Presser, and Connor Leahy. â\\x80\\x9cThe Pile: An 800GB Dataset of Diverse Text for Language Modelingâ\\x80\\x9d. In: arXiv preprint arXiv:2101.00027 (2020).\\n[33] Leo Gao, Jonathan Tow, Stella Biderman, Sid Black, Anthony DiPofi, Charles Foster, Laurence Golding, Jeffrey Hsu, Kyle McDonell, Niklas Muennighoff, Jason Phang, Laria Reynolds, Eric Tang, Anish Thite, Ben Wang, Kevin Wang, and Andy Zou. A Framework for Few-shot Language Model Evaluation. Version v0.0.1. Sept. 2021. doi: 10.5281/zenodo.5371628. url: https://doi.org/10.5281/zenodo.5371628.\\n[34] Karan Goel, Albert Gu, Chris Donahue, and Christopher Ré. â\\x80\\x9cItâ\\x80\\x99s Raw! Audio Generation with State-Space Modelsâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). 2022.\\n[35] Albert Gu, Tri Dao, Stefano Ermon, Atri Rudra, and Christopher Ré. â\\x80\\x9cHIPPO: Recurrent Memory with Optimal Polynomial Projectionsâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS). 2020.\\n[36] Albert Gu, Karan Goel, and Christopher Ré. â\\x80\\x9cEfficiently Modeling Long Sequences with Structured State Spacesâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2022.\\n[37] Albert Gu, Caglar Gulcehre, Tom Le Paine, Matt Hoffman, and Razvan Pascanu. â\\x80\\x9cImproving the Gating Mech- anism of Recurrent Neural Networksâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). 2020.\\n[38] Albert Gu, Ankit Gupta, Karan Goel, and Christopher Ré. â\\x80\\x9cOn the Parameterization and Initialization of Diag-\\nonal State Space Modelsâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS). 
2022.\\n[39] Albert Gu, Isys Johnson, Karan Goel, Khaled Saab, Tri Dao, Atri Rudra, and Christopher Ré. â\\x80\\x9cCombining Recur- rent, Convolutional, and Continuous-time Models with the Linear State Space Layerâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS). 2021.\\n[40] Albert Gu, Isys Johnson, Aman Timalsina, Atri Rudra, and Christopher Ré. â\\x80\\x9cHow to Train Your HIPPO: State Space Models with Generalized Basis Projectionsâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2023.\\n[41] Ankit Gupta, Albert Gu, and Jonathan Berant. â\\x80\\x9cDiagonal State Spaces are as Effective as Structured State Spacesâ\\x80\\x9d. In: Advances in Neural Information Processing Systems 35 (2022), pp. 22982â\\x80\\x9322994.\\n[42] David Ha, Andrew Dai, and Quoc V. Le. â\\x80\\x9cHyperNetworksâ\\x80\\x9d. In: The International Conference on Learning Rep- resentations (ICLR). 2017.\\n[43] Danijar Hafner, Timothy Lillicrap, Jimmy Ba, and Mohammad Norouzi. â\\x80\\x9cDream to Control: Learning Behav- iors by Latent Imaginationâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2020. [44] Ramin Hasani, Mathias Lechner, Tsun-Hsuan Wang, Makram Chahine, Alexander Amini, and Daniela Rus. â\\x80\\x9cLiquid Structural State-Space Modelsâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2023.\\n[45] Mikael Henaff, Arthur Szlam, and Yann LeCun. â\\x80\\x9cRecurrent Orthogonal Networks and Long-Memory Tasksâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). 2016.\\n[46] Dan Hendrycks and Kevin Gimpel. â\\x80\\x9cGaussian Error Linear Units (GELUs)â\\x80\\x9d. In: arXiv preprint arXiv:1606.08415 (2016).\\n[47] Sepp Hochreiter and Jürgen Schmidhuber. â\\x80\\x9cLong Short-Term Memoryâ\\x80\\x9d. In: Neural Computation 9.8 (1997),\\npp. 1735â\\x80\\x931780. Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. â\\x80\\x9cAn Empirical Analysis of Compute- Optimal Large Language Model Trainingâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS) 35 (2022), pp. 30016â\\x80\\x9330030.\\n48\\n[49] Weizhe Hua, Zihang Dai, Hanxiao Liu, and Quoc Le. â\\x80\\x9cTransformer Quality in Linear Timeâ\\x80\\x9d. In: The Interna- tional Conference on Machine Learning (ICML). PMLR. 2022, pp. 9099â\\x80\\x939117.\\n[50] Hassan Ismail Fawaz, Germain Forestier, Jonathan Weber, Lhassane Idoumghar, and Pierre-Alain Muller. â\\x80\\x9cDeep Learning for Time Series Classification: A Reviewâ\\x80\\x9d. In: Data Mining and Knowledge Discovery 33.4 (2019), pp. 917â\\x80\\x93963.\\n[51] Andrei Ivanov, Nikoli Dryden, Tal Ben-Nun, Shigang Li, and Torsten Hoefler. â\\x80\\x9cData Movement is All You Need: A Case Study on Optimizing Transformersâ\\x80\\x9d. In: Proceedings of Machine Learning and Systems 3 (2021), pp. 711â\\x80\\x93 732.\\n[52] Li Jing, Caglar Gulcehre, John Peurifoy, Yichen Shen, Max Tegmark, Marin Soljacic, and Yoshua Bengio. â\\x80\\x9cGated Orthogonal Recurrent Units: On Learning to Forgetâ\\x80\\x9d. In: Neural Computation 31.4 (2019), pp. 765â\\x80\\x93783. [53] Rudolph Emil Kalman. â\\x80\\x9cA New Approach to Linear Filtering and Prediction Problemsâ\\x80\\x9d. In: (1960).\\n20\\n[54] Angelos Katharopoulos, Apoorv Vyas, Nikolaos Pappas, and François Fleuret. 
â\\x80\\x9cTransformers are RNNs: Fast Autoregressive Transformers with Linear Attentionâ\\x80\\x9d. In: International Conference on Machine Learning. PMLR. 2020, pp. 5156â\\x80\\x935165.\\n[55] Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. â\\x80\\x9cDiffWave: A Versatile Diffusion Model for Audio Synthesisâ\\x80\\x9d. In: International Conference on Learning Representations. 2021.\\n[56] Chrysoula Kosma, Giannis Nikolentzos, and Michalis Vazirgiannis. â\\x80\\x9cTime-Parameterized Convolutional Neu- ral Networks for Irregularly Sampled Time Seriesâ\\x80\\x9d. In: arXiv preprint arXiv:2308.03210 (2023).\\n[57] Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. â\\x80\\x9cImageNet Classification with Deep Convolutional Neural Networksâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS) 25 (2012).\\n[58] Tao Lei. â\\x80\\x9cWhen Attention Meets Fast Recurrence: Training Language Models with Reduced Computeâ\\x80\\x9d. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing. 2021, pp. 7633â\\x80\\x937648. [59] Tao Lei, Yu Zhang, Sida I Wang, Hui Dai, and Yoav Artzi. â\\x80\\x9cSimple Recurrent Units for Highly Parallelizable\\nRecurrenceâ\\x80\\x9d. In: arXiv preprint arXiv:1709.02755 (2017).\\n[60] Mario Lezcano-Casado and David MartÃ\\xadnez-Rubio. â\\x80\\x9cCheap Orthogonal Constraints in Neural Networks: A Simple Parametrization of the Orthogonal and Unitary Groupâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). 2019.\\n[61] Yuhong Li, Tianle Cai, Yi Zhang, Deming Chen, and Debadeepta Dey. â\\x80\\x9cWhat Makes Convolutional Models Great on Long Sequence Modeling?â\\x80\\x9d In: The International Conference on Learning Representations (ICLR). 2023. [62] Vasileios Lioutas and Yuhong Guo. â\\x80\\x9cTime-aware Large Kernel Convolutionsâ\\x80\\x9d. In: The International Conference\\non Machine Learning (ICML). PMLR. 2020, pp. 6172â\\x80\\x936183.\\n[63] Chris Lu, Yannick Schroecker, Albert Gu, Emilio Parisotto, Jakob Foerster, Satinder Singh, and Feryal Behba- hani. â\\x80\\x9cStructured State Space Models for In-Context Reinforcement Learningâ\\x80\\x9d. In: Advances in Neural Informa- tion Processing Systems (NeurIPS). 2023.\\n[64] Shahar Lutati, Itamar Zimerman, and Lior Wolf. â\\x80\\x9cFocus Your Attention (with Adaptive IIR Filters)â\\x80\\x9d. In: arXiv preprint arXiv:2305.14952 (2023).\\n[65] Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. â\\x80\\x9cMega: Moving Average Equipped Gated Attentionâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2023.\\n[66] Eric Martin and Chris Cundy. â\\x80\\x9cParallelizing Linear Recurrent Neural Nets Over Sequence Lengthâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2018.\\n[67] Soroush Mehri, Kundan Kumar, Ishaan Gulrajani, Rithesh Kumar, Shubham Jain, Jose Sotelo, Aaron Courville, and Yoshua Bengio. â\\x80\\x9cSampleRNN: An Unconditional End-to-End Neural Audio Generation Modelâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2017.\\n[68] Harsh Mehta, Ankit Gupta, Ashok Cutkosky, and Behnam Neyshabur. â\\x80\\x9cLong Range Language Modeling via Gated State Spacesâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2023.\\n[69] Zakaria Mhammedi, Andrew Hellicar, Ashfaqur Rahman, and James Bailey. 
â\\x80\\x9cEfficient Orthogonal Parametri- sation of Recurrent Neural Networks using Householder Reflectionsâ\\x80\\x9d. In: International Conference on Machine Learning. PMLR. 2017, pp. 2401â\\x80\\x932409.\\n[70] Eric Nguyen, Karan Goel, Albert Gu, Gordon Downs, Preey Shah, Tri Dao, Stephen Baccus, and Christopher Ré. â\\x80\\x9cS4ND: Modeling Images and Videos as Multidimensional Signals with State Spacesâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS). 2022.\\n[71] Eric Nguyen, Michael Poli, Marjan Faizi, Armin Thomas, Callum Birch-Sykes, Michael Wornow, Aman Pa- tel, Clayton Rabideau, Stefano Massaroli, Yoshua Bengio, et al. â\\x80\\x9cHyenaDNA: Long-range Genomic Sequence Modeling at Single Nucleotide Resolutionâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS). 2023.\\n[72] Catherine Olsson, Nelson Elhage, Neel Nanda, Nicholas Joseph, Nova DasSarma, Tom Henighan, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Scott Johnston, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, and Chris Olah. â\\x80\\x9cIn-context Learning and Induction Headsâ\\x80\\x9d. In: Transformer Circuits Thread (2022). https://transformer-circuits.pub/2022/in-context-learning-and-induction- heads/index.html.\\n[73] Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalch- brenner, Andrew Senior, and Koray Kavukcuoglu. â\\x80\\x9cWaveNet: A Generative Model for Raw Audioâ\\x80\\x9d. In: arXiv preprint arXiv:1609.03499 (2016).\\n21\\n[74] Antonio Orvieto, Samuel L Smith, Albert Gu, Anushan Fernando, Caglar Gulcehre, Razvan Pascanu, and So- ham De. â\\x80\\x9cResurrecting Recurrent Neural Networks for Long Sequencesâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). 2023.\\n[75] Denis Paperno, Germán Kruszewski, Angeliki Lazaridou, Ngoc-Quan Pham, Raffaella Bernardi, Sandro Pezzelle, Marco Baroni, Gemma Boleda, and Raquel Fernández. â\\x80\\x9cThe LAMBADA Dataset: Word Prediction Requiring a Broad Discourse Contextâ\\x80\\x9d. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics. 2016, pp. 1525â\\x80\\x931534.\\n[76] Razvan Pascanu, Tomas Mikolov, and Yoshua Bengio. â\\x80\\x9cOn the Difficulty of Training Recurrent Neural Net- worksâ\\x80\\x9d. In: International Conference on Machine Learning. 2013, pp. 1310â\\x80\\x931318.\\n[77] Bo Peng, Eric Alcaide, Quentin Anthony, Alon Albalak, Samuel Arcadinho, Huanqi Cao, Xin Cheng, Michael Chung, Matteo Grella, Kranthi Kiran GV, et al. â\\x80\\x9cRWKV: Reinventing RNNs for the Transformer Eraâ\\x80\\x9d. In: arXiv preprint arXiv:2305.13048 (2023).\\n[78] Hao Peng, Nikolaos Pappas, Dani Yogatama, Roy Schwartz, Noah A Smith, and Lingpeng Kong. â\\x80\\x9cRandom Feature Attentionâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2021.\\n[79] Michael Poli, Stefano Massaroli, Eric Nguyen, Daniel Y Fu, Tri Dao, Stephen Baccus, Yoshua Bengio, Stefano Ermon, and Christopher Ré. â\\x80\\x9cHyena Hierarchy: Towards Larger Convolutional Language Modelsâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). 2023.\\n[80] Zhen Qin, Xiaodong Han, Weixuan Sun, Bowen He, Dong Li, Dongxu Li, Yuchao Dai, Lingpeng Kong, and Yiran Zhong. â\\x80\\x9cToeplitz Neural Network for Sequence Modelingâ\\x80\\x9d. 
In: The International Conference on Learning Representations (ICLR). 2023.\\n[81] Zhen Qin, Xiaodong Han, Weixuan Sun, Dongxu Li, Lingpeng Kong, Nick Barnes, and Yiran Zhong. â\\x80\\x9cThe devil in linear transformerâ\\x80\\x9d. In: arXiv preprint arXiv:2210.10340 (2022).\\n[82] Zhen Qin, Weixuan Sun, Hui Deng, Dongxu Li, Yunshen Wei, Baohong Lv, Junjie Yan, Lingpeng Kong, and Yiran Zhong. â\\x80\\x9cCosFormer: Rethinking Softmax in Attentionâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2022.\\n[83] Ali Rahimi and Benjamin Recht. â\\x80\\x9cRandom features for large-scale kernel machinesâ\\x80\\x9d. In: Advances in neural information processing systems 20 (2007).\\n[84] Prajit Ramachandran, Barret Zoph, and Quoc V Le. â\\x80\\x9cSwish: A Self-gated Activation Functionâ\\x80\\x9d. In: arXiv preprint arXiv:1710.05941 7.1 (2017), p. 5.\\n[85] David W Romero, Anna Kuzina, Erik J Bekkers, Jakub M Tomczak, and Mark Hoogendoorn. â\\x80\\x9cCKConv: Con- tinuous Kernel Convolution For Sequential Dataâ\\x80\\x9d. In: arXiv preprint arXiv:2102.02611 (2021).\\n[86] Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. â\\x80\\x9cWinogrande: An Adversarial Wino- grad Schema Challenge at Scaleâ\\x80\\x9d. In: Communications of the ACM 64.9 (2021), pp. 99â\\x80\\x93106.\\n[87] George Saon, Ankit Gupta, and Xiaodong Cui. â\\x80\\x9cDiagonal State Space Augmented Transformers for Speech Recognitionâ\\x80\\x9d. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE. 2023, pp. 1â\\x80\\x935. Imanol Schlag, Kazuki Irie, and Jürgen Schmidhuber. â\\x80\\x9cLinear Transformers are Secretly Fast Weight Program- mersâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). PMLR. 2021, pp. 9355â\\x80\\x939366. [89] Noam Shazeer. â\\x80\\x9cGLU Variants Improve Transformerâ\\x80\\x9d. In: arXiv preprint arXiv:2002.05202 (2020). [90] Freda Shi, Xinyun Chen, Kanishka Misra, Nathan Scales, David Dohan, Ed H Chi, Nathanael Schärli, and Denny Zhou. â\\x80\\x9cLarge Language Models can be Easily Distracted by Irrelevant Contextâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 31210â\\x80\\x9331227. Jiaxin Shi, Ke Alexander Wang, and Emily Fox. â\\x80\\x9cSequence Modeling with Multiresolution Convolutional Mem- oryâ\\x80\\x9d. In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 31312â\\x80\\x9331327. Jimmy TH Smith, Andrew Warrington, and Scott W Linderman. â\\x80\\x9cSimplified State Space Layers for Sequence Modelingâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2023. Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu. â\\x80\\x9cRoformer: Enhanced Trans- former with Rotary Position Embeddingâ\\x80\\x9d. In: arXiv preprint arXiv:2104.09864 (2021).\\n[93]\\n[94] Yutao Sun, Li Dong, Shaohan Huang, Shuming Ma, Yuqing Xia, Jilong Xue, Jianyong Wang, and Furu Wei. â\\x80\\x9cRetentive network: A successor to transformer for large language modelsâ\\x80\\x9d. In: arXiv preprint arXiv:2307.08621 (2023). Ilya Sutskever, Oriol Vinyals, and Quoc V Le. â\\x80\\x9cSequence to Sequence Learning with Neural Networksâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS) 27 (2014).\\n22\\n[96] Corentin Tallec and Yann Ollivier. â\\x80\\x9cCan Recurrent Neural Networks Warp Time?â\\x80\\x9d In: The International Con- ference on Learning Representations (ICLR). 
2018.\\n[97] Yi Tay, Mostafa Dehghani, Samira Abnar, Yikang Shen, Dara Bahri, Philip Pham, Jinfeng Rao, Liu Yang, Se- bastian Ruder, and Donald Metzler. â\\x80\\x9cLong Range Arena: A Benchmark for Efficient Transformersâ\\x80\\x9d. In: Inter- national Conference on Learning Representations (ICLR). 2021.\\n[98] Yi Tay, Mostafa Dehghani, Dara Bahri, and Donald Metzler. â\\x80\\x9cEfficient Transformers: A Surveyâ\\x80\\x9d. In: ACM Com- puting Surveys 55.6 (2022), pp. 1â\\x80\\x9328.\\n[99] Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Bap- tiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, et al. â\\x80\\x9cLlama: Open and Efficient Foundation Language Modelsâ\\x80\\x9d. In: arXiv preprint arXiv:2302.13971 (2023).\\n[100] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. â\\x80\\x9cAttention Is All You Needâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS). 2017.\\n[101] Eugene Vorontsov, Chiheb Trabelsi, Samuel Kadoury, and Chris Pal. â\\x80\\x9cOn Orthogonality and Learning Recur- rent Networks with Long Term Dependenciesâ\\x80\\x9d. In: International Conference on Machine Learning. PMLR. 2017, pp. 3570â\\x80\\x933578. Jue Wang, Wentao Zhu, Pichao Wang, Xiang Yu, Linda Liu, Mohamed Omar, and Raffay Hamid. â\\x80\\x9cSelective Structured State-Spaces for Long-form Video Understandingâ\\x80\\x9d. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2023, pp. 6387â\\x80\\x936397.\\n[102]\\n[103] Pete Warden. â\\x80\\x9cSpeech Commands: A Dataset for Limited-Vocabulary Speech Recognitionâ\\x80\\x9d. In: ArXiv abs/1804.03209 (2018).\\n[104] Samuel Williams, Andrew Waterman, and David Patterson. â\\x80\\x9cRoofline: An Insightful Visual Performance Model for Multicore Architecturesâ\\x80\\x9d. In: Communications of the ACM 52.4 (2009), pp. 65â\\x80\\x9376.\\n[105] Brandon Yang, Gabriel Bender, Quoc V Le, and Jiquan Ngiam. â\\x80\\x9cCondConv: Conditionally Parameterized Con- volutions for Efficient Inferenceâ\\x80\\x9d. In: Advances in Neural Information Processing Systems (NeurIPS) 32 (2019). [106] Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. â\\x80\\x9cHellaSwag: Can a Machine Really Finish Your Sentence?â\\x80\\x9d In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguis- tics. 2019.\\n[107] Shuangfei Zhai, Walter Talbott, Nitish Srivastava, Chen Huang, Hanlin Goh, Ruixiang Zhang, and Josh Susskind. â\\x80\\x9cAn Attention Free Transformerâ\\x80\\x9d. In: arXiv preprint arXiv:2105.14103 (2021).\\n[108] Michael Zhang, Khaled K Saab, Michael Poli, Tri Dao, Karan Goel, and Christopher Ré. â\\x80\\x9cEffectively Modeling Time Series with Simple Discrete State Spacesâ\\x80\\x9d. In: The International Conference on Learning Representations (ICLR). 2023.\\n[109] Lin Zheng, Chong Wang, and Lingpeng Kong. â\\x80\\x9cLinear complexity randomized self-attention mechanismâ\\x80\\x9d. In: International Conference on Machine Learning. PMLR. 2022, pp. 27011â\\x80\\x9327041.\\n[110] Simiao Zuo, Xiaodong Liu, Jian Jiao, Denis Charles, Eren Manavoglu, Tuo Zhao, and Jianfeng Gao. â\\x80\\x9cEfficient Long Sequence Modeling via State Space Augmented Transformerâ\\x80\\x9d. 
In: arXiv preprint arXiv:2212.08136 (2022).

# A Discussion: Selection Mechanism

Our selection mechanism is inspired by and related to concepts such as gating, hypernetworks, and data-dependence. It can also be viewed as related to "fast weights" (J. Ba et al. 2016), which connects classical RNNs with the mechanism of linear attention (Schlag, Irie, and Schmidhuber 2021). However, we believe that it is a distinct concept that is worth clarifying.

Gating. Gating originally referred to the gating mechanisms of RNNs such as the LSTM (Hochreiter and Schmidhuber 1997) and GRU (J. Chung et al. 2014), or the gated equation (5) in Theorem 1. This was interpreted as a particular mechanism for controlling whether to let an input into the hidden state of an RNN. In particular, this affects the propagation of signal through time and causes inputs to interact along the sequence length dimension.

However, the concept of gating has since been relaxed in popular usage to simply mean any multiplicative interaction (often with an activation function). For example, elementwise multiplicative components of neural network architectures (that do not interact along sequence length) are now commonly referred to as gated architectures (Hua et al. 2022; Mehta et al. 2023), despite a very different meaning than the original RNN sense. Thus we believe the original concept of RNN gating and the popular usage of multiplicative gating have very different semantic meanings.

Hypernetworks. Hypernetworks refer to neural networks whose parameters are themselves generated by smaller neural networks. The original idea (Ha, Dai, and Quoc V. Le 2017) used it in a narrow sense to define a large RNN whose recurrent parameters are generated by a smaller RNN.

Data-dependence. Similar to hypernetworks, data-dependence can refer to any notion where some parameters of the model depend on the data (Poli et al. 2023).

Example: GLU Activation. To illustrate the issues with these concepts, consider a simple diagonal linear layer y = Dx, where D is a diagonal weight parameter. Now suppose that D is itself generated from a linear transformation of x, with an optional nonlinearity: D = σ(Wx). Since it is diagonal, the multiplication becomes an elementwise product: y = σ(Wx) ∘ x.

This is a rather trivial transformation, yet it technically satisfies the common meanings of gating (since it has a multiplicative "branch"), hypernetworks (since the parameter D is generated by another layer), and data-dependence (since D depends on the data x). However, this in fact simply defines a GLU function, which is so simple that it is often considered just an activation function (Dauphin et al. 2017; Shazeer 2020) instead of a meaningful layer.
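As a concrete illustration (a minimal sketch, not code from the paper), the "hypernetwork" view and the elementwise-gate view of this diagonal layer compute exactly the same thing; the random weight W, the dimension, and the logistic nonlinearity are illustrative assumptions.

```python
# Sketch: the data-dependent diagonal layer y = D x with D = sigma(W x)
# reduces to an elementwise gate, i.e. a GLU-style activation.
import numpy as np

def sigma(z):
    return 1.0 / (1.0 + np.exp(-z))  # logistic nonlinearity

d = 4
rng = np.random.default_rng(0)
W = rng.standard_normal((d, d))
x = rng.standard_normal(d)

# "Hypernetwork" view: generate a diagonal weight matrix D from x, then apply it.
D = np.diag(sigma(W @ x))
y_hyper = D @ x

# Equivalent view: a simple multiplicative gate (GLU), y = sigma(W x) * x.
y_glu = sigma(W @ x) * x

assert np.allclose(y_hyper, y_glu)
```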
Selection. Thus, while selection mechanisms could be considered a special case of ideas such as architectural gating, hypernetworks, or data-dependence, so can an enormous range of other constructions: essentially anything with a multiplication, including standard attention mechanisms (Bahdanau, Cho, and Bengio 2015; Vaswani et al. 2017). We therefore find it uninformative to think of selection in those terms.

Instead, we view it as most closely related to the gating mechanism of traditional RNNs, which is a special case (Theorem 1) and also has a deeper history of connections to SSMs through variable (input-dependent) discretization of Δ (Funahashi and Nakamura 1993; Gu, Dao, et al. 2020; Tallec and Ollivier 2018). We also eschew the term "gating" in favor of selection to clarify the overloaded use of the former. More narrowly, we use selection to refer to the mechanistic action of a model to select or ignore inputs and facilitate data interaction along the sequence length (Section 3.1). Beyond selective SSMs and gated RNNs, other examples may include input-dependent convolutions (Kosma, Nikolentzos, and Vazirgiannis 2023; Lioutas and Guo 2020; Lutati, Zimerman, and Wolf 2023; Yang et al. 2019) and even attention.

# B Related Work

We overview several prior works related to our methods. Some of the most closely related models include recurrent layers such as S4, S5, and quasi-RNNs, as well as end-to-end architectures such as H3, RetNet, and RWKV.

# B.1 S4 Variants and Derivatives

We give a brief overview of some structured SSMs from past work, particularly those that have a relation to our method.

• S4 (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) introduced the first structured SSM, describing diagonal structure and diagonal plus low-rank (DPLR). It focused on efficient convolutional algorithms for DPLR SSMs due to a connection to continuous-time online memorization (HIPPO) (Gu, Dao, et al. 2020).

• DSS (Gupta, Gu, and Berant 2022) first discovered the empirical effectiveness of diagonal structured SSMs by approximating the HIPPO initialization. This was expanded on theoretically in S4D (Gu, Gupta, et al. 2022).

• S5 (Smith, Warrington, and Linderman 2023) independently discovered the diagonal SSM approximation, and is the first S4 model to be computed recurrently with the parallel scan. However, this required lowering the effective state dimension, which they accomplished by switching the SSM dimensions from a SISO (single-input single-output) to a MIMO (multi-input multi-output) formulation. Our proposed S6 shares the scan, but differs by (i) keeping the SISO dimensions, which provides a larger effective recurrent state, (ii) using a hardware-aware algorithm to overcome the computation issue, and (iii) adding the selection mechanism.

Lu et al. (2023) applied S5 to meta-RL in order to handle resetting the SSM state between episode trajectories. Their mechanism can be viewed as a particular hard-coded instance of a selection mechanism, where A is manually set to 0, instead of our learnable mechanism that depends on the input. It would be interesting to apply selective SSMs generically to this setting and probe whether the model learns to automatically reset its state on episode boundaries.

• Mega (Ma et al. 2023) introduced a simplification of S4 to be real- instead of complex-valued, giving it an interpretation of being an exponential moving average (EMA). They additionally make an interesting connection of the discretization step of SSMs to an EMA damping term.
Contrary to findings in the original S4 papers, this was the first model to show that real-valued SSMs are empirically effective in certain settings or when combined with different architectural components.

• Liquid S4 (Hasani et al. 2023) is also motivated by augmenting S4 with an input-dependent state transition. From this perspective it shares similarity to selection mechanisms, although in a limited form which is still computed convolutionally and close to LTI.

• SGConv (Y. Li et al. 2023), Hyena (Poli et al. 2023), LongConv (Fu et al. 2023), MultiresConv (J. Shi, K. A. Wang, and Fox 2023), and Toeplitz Neural Network (Qin, Han, W. Sun, He, et al. 2023) all focus on the convolutional representation of S4 and create global or long convolution kernels with different parameterizations. However, these methods cannot do fast autoregressive inference directly.

Notably, all of these methods, and all other structured SSMs that we are aware of, have been non-selective and usually strictly LTI (linear time invariant).

# B.2 SSM Architectures

We use SSM architectures or state space neural networks (SSNN) to refer to deep neural network architectures incorporating one of the previous SSMs as a black-box layer.

• GSS (Mehta et al. 2023) was the first gated neural network architecture incorporating SSMs. It is motivated by the gated attention unit (GAU) of Hua et al. (2022) and looks quite similar to our block, except with additional projections. Most importantly, its projection contracts the model dimension to reduce the state size of the SSM, while ours expands the model dimension in order to increase the state size, based on the motivation in Section 3.1.

• Mega (Ma et al. 2023) combined the EMA simplification of S4 described above into a hybrid architecture using an efficient attention approximation.

• H3 (Dao, Fu, Saab, et al. 2023) is motivated by combining S4 with linear attention (Katharopoulos et al. 2020). It is the first to generalize this formulation of linear attention to more general recurrences, which is also the basis of later architectures.

• Selective S4 (J. Wang et al. 2023) incorporates S4 as a black box to generate a binary mask which is multiplied on the input. While sharing the "selection" name, we consider this an architectural modification that is closer to architectural gating than a selection mechanism (Appendix A). For example, we hypothesize that it would not solve the Selective Copying task because simply masking out the irrelevant inputs does not affect the spacing between the relevant ones (indeed, the Selective Copying task can even be viewed as coming pre-masked if the noise tokens are embedded to 0).

• RetNet (Y. Sun et al. 2023) is also based on linear attention and very similar to H3, but reduces the inner S4 layer to a special case where the state dimension is N = 1. Although not framed as such, its recurrence can be viewed as a special case of a linear SSM. Its primary source of improvement is using a linear attention with large head dimension, which can be viewed as another method to perform input-dependent state expansion. Using a larger head dimension in the context of linear attention variants was first done by H3, but not extensively used since this requires a proportional amount of extra computation.
RetNet avoids this with an alternate way to parallelize the computation with a variant of standard multi-head attention instead of convolutions, made feasible by their particular special case of SSMs which acts as a simple EMA.

• RWKV (B. Peng et al. 2023) is another recent RNN designed for language modeling. It is based on AFT (Attention-Free Transformer (S. Zhai et al. 2021)), another variant of linear attention. Its main "WKV" mechanism involves LTI recurrences and can be seen as the ratio of two SSMs.

We also highlight the gated attention unit (GAU) from Hua et al. (2022), which was motivated by combining the Transformer's MHA and MLP blocks together and was an inspiration for our architecture (Section 3.4) combining the H3 and MLP blocks.

# B.3 Relationship to RNNs

RNNs and SSMs are broadly related, as they both involve the concept of recurrence on a latent state.

Several older RNNs such as the strongly typed RNN (Balduzzi and Ghifary 2016), quasi-RNN (QRNN) (Bradbury et al. 2016), and simple recurrent unit (SRU) (Lei 2021; Lei et al. 2017) involve forms of gated RNNs without time-wise nonlinearities. Because of the connections between gating mechanisms and selection mechanisms, these can be viewed as cases of selective SSMs, and are thus more powerful in a sense than the family of LTI structured SSMs above. The main differences are:

• They do not use state expansion (N = 1) or selective B, C parameters, both of which are important for performance (Section 4.6).

• They use a heuristic gating mechanism, which we generalize as a consequence of the selection mechanism + discretization (Theorem 1). The connections to principled SSM theory provide better parameterizations and initializations (Section 3.6).

Additionally, older RNNs famously suffered from efficiency issues and the vanishing gradients problem (Pascanu, Mikolov, and Bengio 2013), both caused by their sequential nature. The latter could be solved for some of the above RNNs by leveraging the parallel scan (Martin and Cundy 2018), but the former was difficult without theory later developed for SSMs. For example, modern structured SSMs differ in more careful parameterization of the recurrent dynamics inspired by classical SSM theory, e.g. through discretization (Gu, Johnson, Goel, et al. 2021; Gu, Johnson, Timalsina, et al. 2023) or direct analysis (Orvieto et al. 2023).

We also note that there is a long line of work on orthogonal RNNs (Arjovsky, Shah, and Bengio 2016; Henaff, Szlam, and LeCun 2016; Lezcano-Casado and Martínez-Rubio 2019; Mhammedi et al. 2017; Vorontsov et al. 2017), which are motivated by constraining the A transition matrix to be orthogonal or unitary in order to control its eigenvalues and prevent the vanishing gradient problem. However, these had other limitations; we believe that these stem from the fact that orthogonal/unitary RNNs are also LTI. For example, they are almost always evaluated on the Copying task, which they can solve perfectly, but they have been observed to struggle on the Selective Copying task (Jing et al. 2019).

# B.4 Linear Attention

The Linear Attention (LA) framework (Katharopoulos et al. 2020) is an important result popularizing kernel attention and showing how it relates to recurrent autoregressive models. Many variants have proposed alternative kernels and other modifications. Random Feature Attention (RFA) (H. Peng et al. 2021) chooses the kernel feature map to approximate softmax attention (i.e. the exp feature map) using the random Fourier feature approximation of Gaussian kernels (Rahimi and Recht 2007).
Performer (Choromanski et al. 2021) finds an approximation to the exponential kernel involving only positive features, which also allows the softmax normalization term. TransNormer (Qin, Han, W. Sun, D. Li, et al. 2022) showed that the LA denominator term can be unstable and proposed replacing it with a LayerNorm. cosFormer (Qin, W. Sun, et al. 2022) augments RFA with a cosine reweighting mechanism that incorporates positional information to emphasize locality. Linear Randomized Attention (Zheng, C. Wang, and L. Kong 2022) generalizes RFA from the perspective of importance sampling, extending it to provide better estimates of the full softmax kernel (rather than just the exp-transformed numerator).

Aside from kernel attention, many other variants of efficient attention exist; the survey of Tay, Dehghani, Bahri, et al. (2022) offers an extensive categorization of many of these.

# B.5 Long Context Models

Long context has become a popular subject, and several recent models have claimed to scale to longer and longer sequences. However, these claims are often made from a computational standpoint and have not been extensively validated. These include:

• Recurrent Memory Transformer (Bulatov, Kuratov, and Burtsev 2023), a lightweight wrapper around a Transformer backbone. It showed the ability to generalize up to 1M sequences, but only on synthetic memorization tasks; their main result is similar to our Induction Heads extrapolation experiment (Table 2).

• LongNet (Ding et al. 2023), which claimed to scale to 1B length but only evaluated on lengths < 100K for actual tasks.

• Hyena and HyenaDNA (Nguyen, Poli, et al. 2023; Poli et al. 2023), which claimed to leverage up to 1M context. However, their experiments trained on proportionally more data at longer contexts, making it hard to conclude whether quality improvements at 1M context are due to context length or due to more data and computation.

• Sparse Transformer (Child et al. 2019) showed a proof of concept of using a strided sparse attention Transformer to model audio waveforms of length 2^20 = 1048576, although it did not discuss performance tradeoffs when controlling for computation and model size.

In contrast, we believe this work presents one of the first approaches to meaningfully demonstrate increasing performance with longer context.

# C Mechanics of Selective SSMs

Proof of Theorem 1. Consider a selective SSM (Algorithm 2) with N = 1, A = -1, B = 1, s_Δ = Linear(x), and τ_Δ = softplus.
The corresponding continuous-time SSM (1) is

h'(t) = -h(t) + x(t),

which is also called a leaky integrator.

The discretization step size is

Δ_t = τ_Δ(Parameter + s_Δ(x_t))
    = softplus(Parameter + Linear(x_t))
    = softplus(Linear(x_t)),

where we observe that the parameter can be viewed as a learnable bias and folded into the linear projection.

Now applying the zero-order hold (ZOH) discretization formulas:

Ā_t = exp(Δ A) = 1 / (1 + exp(Linear(x_t))) = σ(-Linear(x_t)) = 1 - σ(Linear(x_t))
B̄_t = (Δ A)^(-1) (exp(Δ A) - I) · Δ B = -(exp(Δ A) - I) = 1 - Ā_t = σ(Linear(x_t)).

Thus the final discrete recurrence (2a) is

g_t = σ(Linear(x_t))
h_t = (1 - g_t) h_{t-1} + g_t x_t,

as desired.
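A small numerical sanity check of this derivation (a sketch, not code from the paper): applying the ZOH formulas with N = 1, A = -1, B = 1 and Δ_t = softplus(z_t) reproduces the gated update with gate g_t = σ(z_t). Here z_t stands in for Linear(x_t).

```python
# Verify numerically that the ZOH-discretized step equals the gated-RNN update.
import numpy as np

def softplus(z):
    return np.log1p(np.exp(z))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
z = rng.standard_normal(100)   # pre-activations z_t = Linear(x_t) (assumed)
x = rng.standard_normal(100)   # inputs x_t
A, B = -1.0, 1.0

h_ssm, h_gate = 0.0, 0.0
for z_t, x_t in zip(z, x):
    delta = softplus(z_t)
    A_bar = np.exp(delta * A)                    # ZOH: exp(Delta A)
    B_bar = (np.exp(delta * A) - 1.0) / A * B    # ZOH: (Delta A)^-1 (exp(Delta A) - I) Delta B
    h_ssm = A_bar * h_ssm + B_bar * x_t

    g = sigmoid(z_t)                             # gate from Theorem 1
    h_gate = (1.0 - g) * h_gate + g * x_t

    assert np.isclose(h_ssm, h_gate)
```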
# D Hardware-aware Algorithm For Selective SSMs

Without input-dependent selectivity, SSMs can be efficiently implemented as a convolution (Dao, Fu, Saab, et al. 2023; Gu, Goel, and Ré 2022), which leverages the fast Fourier transform (FFT) as a primitive. With selectivity, SSMs are no longer equivalent to convolution, but we leverage the parallel associative scan. While SSM scans are theoretically efficient (O(BLDN) FLOPs, scaling linearly in L), training foundation models with selective SSMs requires them to be efficient on modern hardware (GPUs) as well. We describe how we use kernel fusion and recomputation to make the SSM scan fast and memory-efficient. We evaluate the speed of our scan implementation compared to convolution and attention in Section 4.5, showing that it is up to 7× faster than attention at sequence length 32K, and is as memory-efficient as the best attention implementation (FlashAttention).

Speed. On modern hardware accelerators (GPUs), most operations (except matrix multiplication) are bounded by memory bandwidth (Dao, Fu, Ermon, et al. 2022; Ivanov et al. 2021; Williams, Waterman, and Patterson 2009). This is the case with our scan operation, and we use kernel fusion to reduce the amount of memory IOs, leading to a significant speedup compared to a standard implementation.

The standard way to implement the scan algorithm in Section 3.2 is to prepare the scan input Ā, B̄ of size (B, L, D, N) in GPU HBM (high-bandwidth memory, commonly referred to as GPU memory), call a parallel associative scan implementation to write the scan output of size (B, L, D, N) to GPU HBM, and then multiply that scan output with C to produce an output of size (B, L, D). However, this requires a number of memory reads/writes on the order of O(BLDN). We can instead fuse the discretization step, the scan, and the multiplication with C into one kernel:

1. We read O(BLD + DN) bytes of memory (Δ, A, B, C) from slow HBM to fast SRAM.
2. We discretize to produce Ā, B̄ of size (B, L, D, N) in SRAM.
3. We perform a parallel associative scan, yielding intermediate states of size (B, L, D, N) in SRAM.
4. We multiply and sum with C, producing outputs of size (B, L, D), and write them to HBM.

This way, we reduce IOs by a factor of O(N) (the state dimension), which in practice speeds up the operation by 20-40 times (Section 4.5).

Table 11: (Induction heads.) Models are trained on sequence length 2^8 = 256 and tested on sequence lengths from 2^6 = 64 up to 2^20 = 1048576. ✓ denotes perfect generalization accuracy, while ✗ denotes out of memory. Test accuracy is in %.

Model      Params  2^6   2^7   2^8    2^9   2^10  2^11  2^12  2^13  2^14  2^15  2^16  2^17  2^18  2^19  2^20
MHA-Abs    137K    ✓     99.6  100.0  58.6  26.6  18.8  9.8   10.9  7.8   ✗     ✗     ✗     ✗     ✗     ✗
MHA-RoPE   137K    ✓     ✓     100.0  83.6  31.3  18.4  8.6   9.0   5.5   ✗     ✗     ✗     ✗     ✗     ✗
MHA-xPos   137K    ✓     ✓     100.0  99.6  67.6  25.4  7.0   9.0   7.8   ✗     ✗     ✗     ✗     ✗     ✗
H3         153K    ✓     ✓     100.0  80.9  39.5  23.8  14.8  8.2   5.9   6.6   8.2   4.7   8.2   6.3   7.4
Hyena      69M*    97.7  ✓     100.0  ✓     44.1  12.5  6.6   5.1   7.0   5.9   6.6   6.6   5.9   6.3   9.8
Mamba      74K     ✓     ✓     100.0  ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓     ✓
* Most of the parameters are in learnable positional encodings.

For sequence lengths L that are too long to fit in SRAM (which is much smaller than HBM), we split the sequences into chunks and perform the fused scan on each chunk. As long as we have the intermediate scan states, we can continue the scan with the next chunk.
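The following reference-style sketch (not the fused kernel described above) illustrates the computation being fused: the discretized recurrence is a simple scan over time, and it can be processed chunk by chunk as long as the final state of each chunk is carried into the next. Shapes, names, and the use of NumPy are assumptions for illustration.

```python
# Reference (unfused) selective scan for a single batch element, with
# discretized parameters assumed to be precomputed. The fused kernel computes
# the same quantities without materializing h in HBM.
import numpy as np

def selective_scan_chunk(A_bar, B_bar, C, x, h0):
    # A_bar, B_bar: (L, D, N); C: (L, N); x: (L, D); h0: (D, N)
    L, D, N = A_bar.shape
    h = h0
    y = np.zeros((L, D))
    for t in range(L):
        h = A_bar[t] * h + B_bar[t] * x[t][:, None]  # h_t = A_bar_t h_{t-1} + B_bar_t x_t
        y[t] = h @ C[t]                              # y_t = C_t h_t (contract over N)
    return y, h

rng = np.random.default_rng(0)
L, D, N, chunk = 64, 4, 16, 16
A_bar = rng.uniform(0.5, 1.0, (L, D, N))
B_bar = rng.standard_normal((L, D, N))
C = rng.standard_normal((L, N))
x = rng.standard_normal((L, D))

# Full-length scan and chunked scan with carried state give identical outputs.
y_full, _ = selective_scan_chunk(A_bar, B_bar, C, x, np.zeros((D, N)))
h = np.zeros((D, N))
pieces = []
for s in range(0, L, chunk):
    y_piece, h = selective_scan_chunk(
        A_bar[s:s+chunk], B_bar[s:s+chunk], C[s:s+chunk], x[s:s+chunk], h
    )
    pieces.append(y_piece)
assert np.allclose(y_full, np.concatenate(pieces))
```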
Memory. We describe how we use the classical technique of recomputation to reduce the total amount of memory required to train selective SSM layers.

From the way we fuse the forward pass, we do not save the intermediate states of size (B, L, D, N) in order to avoid memory blowup. However, these intermediate states are necessary for the backward pass to compute gradients. We instead recompute those intermediate states in the backward pass. Since the inputs Δ, A, B, C and output gradient read from HBM to SRAM are of size O(BLN + DN), and the input gradients are also of size O(BLN + DN), recomputation avoids the cost of reading O(BLND) elements from HBM. This means that recomputing the SSM states in the backward pass speeds up the computation compared to storing them and reading them from HBM.

Beyond optimizing the memory requirement of just the scan operation, we also use recomputation to optimize the memory requirement of the entire selective SSM block (input projection, convolution, activation, scan, output projection). In particular, we do not save intermediate activations that take a lot of memory but are fast to recompute (e.g. the output of an activation function or a short convolution). As a result, the selective SSM layer has the same memory requirement as an optimized Transformer implementation with FlashAttention. In particular, each attention layer (FlashAttention) stores around 12 bytes of activations per token, and each MLP layer stores around 20 bytes of activations per token, for a total of 32 bytes (assuming mixed-precision training in FP16 or BF16). Each selective SSM stores around 16 bytes of activations per token. Hence two layers of selective SSMs have around the same activation memory as an attention layer and an MLP layer.

# E Experimental Details and Additional Results

# E.1 Synthetic Tasks

Selective Copying. Our setting uses sequences of length 4096, with a vocabulary of 16 possible tokens (including the white "noise" token from Figure 2), and requires models to memorize 16 "data" tokens. We use 2-layer models with a model dimension of D = 64. Models are trained for 400K steps at a constant learning rate of 0.0001 with a batch size of 64. (A data-generation sketch is given at the end of this subsection.)

Induction Heads. Training consists of randomly generating data at every step, with a batch size of 8. We choose an "epoch" size of 8192 steps, and track the accuracy on fixed validation sets (also randomly generated) of each target sequence length. For the MHA-Abs and Mamba models, results are reported after the 25th epoch (8192 × 25 = 204800 steps). For the MHA-RoPE and MHA-xPos models, results are reported after the 50th epoch (8192 × 50 = 409600 steps). For the LTI H3 and Hyena models, results are reported after the 10th epoch (81920 steps), because they had converged by then and failed to improve further.

We use the Adam optimizer with no weight decay. All models are trained at constant learning rates 2e-4 and 1e-3, and the better result is reported for each model (2e-4 for all models except Mamba). The attention and Hyena models did not learn at LR 1e-3. H3 learned at both LRs, but interestingly generalized better to shorter sequences at the smaller LR of 2e-4. Mamba learned at both LRs, but extrapolated better at the larger LR of 1e-3.
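A minimal sketch of how Selective Copying examples could be generated under the setup above; the exact construction, token-id conventions, and target format used in the paper are not reproduced here, so they are assumptions.

```python
# Illustrative Selective Copying data generator: 16 data tokens scattered at
# random positions in a length-4096 sequence of noise tokens; the model must
# reproduce the data tokens in order while ignoring the noise.
import numpy as np

def selective_copying_example(rng, seq_len=4096, vocab_size=16, num_data=16, noise_token=0):
    # Data tokens are drawn from the non-noise part of the vocabulary.
    data_tokens = rng.integers(1, vocab_size, size=num_data)
    # Scatter them at random (sorted) positions; everything else is noise.
    positions = np.sort(rng.choice(seq_len, size=num_data, replace=False))
    inputs = np.full(seq_len, noise_token, dtype=np.int64)
    inputs[positions] = data_tokens
    # Targets: the data tokens in order of appearance.
    targets = data_tokens.copy()
    return inputs, targets

rng = np.random.default_rng(0)
x, y = selective_copying_example(rng)
print(x.shape, y.shape)  # (4096,) (16,)
```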
# E.2 Language Modeling

# E.2.1 Scaling Law Details

All models were trained on the Pile.

Model Sizes. Table 12 specifies the model sizes we use for scaling laws. This is taken directly from the GPT3 specifications (Brown et al. 2020), with very minor modifications. First, we changed the batch size of the 1.3B model from 1M tokens to 0.5M tokens, since we did not use enough parallelization to require the larger batch size. Second, we changed the number of training steps and total tokens to roughly match Chinchilla scaling laws (Hoffmann et al. 2022), which specify that training tokens should increase proportionally to model size.

Table 12: (Scaling Law Model Sizes.) Our model sizes and hyperparameters for scaling experiments. (Model dimension and number of heads apply only to Transformer models.)

Params  n_layers  d_model  n_heads / d_head  Training steps  Learning rate  Batch size    Tokens
125M    12        768      12 / 64           4800            6e-4           0.5M tokens   2.5B
350M    24        1024     16 / 64           13500           3e-4           0.5M tokens   7B
760M    24        1536     16 / 96           29000           2.5e-4         0.5M tokens   15B
1.3B    24        2048     32 / 64           50000           2e-4           0.5M tokens   26B

Training Recipes. All models used the AdamW optimizer with:

• gradient clip value 1.0
• weight decay 0.1
• no dropout
• linear learning rate warmup with cosine decay

By default, the peak learning rate is the GPT3 specification.

We give several models an "improved recipe", inspired by changes adopted by popular large language models such as PaLM (Chowdhery et al. 2023) and LLaMa (Touvron et al. 2023). These include the following (an optimizer/schedule sketch follows the list):

• linear learning rate warmup with cosine decay to 1e-5, with a peak value of 5× the GPT3 value
• no linear bias terms
• RMSNorm instead of LayerNorm
• AdamW hyperparameters β = (0.9, 0.95) (the GPT3 value) instead of the PyTorch default of β = (0.9, 0.999)
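A configuration sketch of the optimizer and learning-rate schedule implied by the bullets above, written with standard PyTorch utilities (the text does not prescribe a framework; the peak and minimum learning rates, warmup fraction, and step count are placeholders, and architectural items such as RMSNorm and bias removal are not shown).

```python
# Optimizer + warmup/cosine schedule sketch for the recipe above.
import math
import torch

def build_optimizer_and_schedule(model, peak_lr=6e-4, min_lr=1e-5,
                                 total_steps=4800, warmup_frac=0.1):
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=peak_lr,
        betas=(0.9, 0.95),   # GPT3 value, instead of the PyTorch default (0.9, 0.999)
        weight_decay=0.1,
    )
    warmup_steps = int(warmup_frac * total_steps)

    def lr_lambda(step):
        if step < warmup_steps:                                   # linear warmup
            return step / max(1, warmup_steps)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        cosine = 0.5 * (1.0 + math.cos(math.pi * progress))       # cosine decay
        return (min_lr + (peak_lr - min_lr) * cosine) / peak_lr   # decay to min_lr

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    return optimizer, scheduler

# In the training loop, gradients are clipped before each optimizer step:
#   torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
```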
Architecture and Training Details. Our models are:

• Transformer: The standard Transformer based on GPT3 (Table 12).

• Transformer++: A Transformer with an improved architecture, namely rotary positional encodings (Su et al. 2021) and SwiGLU MLP (Shazeer 2020), and the improved training recipe above.

• Hyena: Interleaving a Hyena block (the H3 block with S4 replaced by a global convolution parameterized by an MLP) with standard MLP blocks. The MLP blocks have expansion factor 2 instead of 4, and the number of layers is correspondingly increased by 1.5× to preserve parameter count.

• H3++: The H3 architecture with a few modifications, including (i) using the same "thin" Hyena dimensions above, (ii) the improved training recipe above, and (iii) a linear attention head dimension of 8.

• RWKV: The default RWKV model from B. Peng et al. (2023), including its modified MLP block. We also used as much of its specified training recipe as possible, such as increasing the learning rates by 2× or 3× on certain parameters.

• RetNet: The default RetNet model from Y. Sun et al. (2023). We also gave it the improved training recipe above.

• Mamba: The standard Mamba architecture, with the improved training recipe.

# E.2.2 Additional Scaling Law Ablations

We perform additional ablations on the architecture using the same protocol as the 2k context length scaling laws in Figure 4 (Left).

Mamba Architecture: Interleaving Blocks. We test the effect of different architectural blocks combined with the Mamba block. We focus on the viewpoint that the Mamba block is simply the standard SwiGLU block with an extra Conv → SSM path added (a schematic sketch of this viewpoint is given below). This leads to two natural ablations:

• What if the Mamba block is interleaved with a standard MLP block, instead of stacked homogenously? This can also be interpreted as taking Mamba and removing half of the SSMs.

• What if the Mamba block is interleaved with MHA (multi-head attention) blocks? This can also be interpreted as taking a Transformer with SwiGLU MLPs (i.e. what we call Transformer++) and simply adding SSMs to the MLP blocks.

Figure 9 (Right) shows these variants compared to the original (homogenous) Mamba architecture. Interestingly, neither change matters too much. The Mamba-MLP architecture is only slightly worse, and still better than all models except Transformer++. The Mamba-MHA architecture is only slightly better, which is somewhat surprising in light of the fact that many recent works have found that combining (LTI) SSMs with attention can lead to substantial improvements (Dao, Fu, Saab, et al. 2023; Fathi et al. 2023; Fathullah et al. 2023; Saon, Gupta, and Cui 2023; Zuo et al. 2022).
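The following is a schematic sketch, for illustration only, of the viewpoint above: a SwiGLU-style gated block in which one branch additionally passes through a short causal convolution and an SSM before the multiplicative gate. Module names, dimensions, and the conv/activation ordering are assumptions, and the SSM itself is left as a placeholder rather than the selective scan.

```python
# Schematic gated block with an extra Conv -> SSM path on one branch.
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedSSMBlockSketch(nn.Module):
    def __init__(self, d_model, d_inner, d_conv=4, ssm=None):
        super().__init__()
        self.in_proj = nn.Linear(d_model, 2 * d_inner, bias=False)   # expand into two branches
        self.conv = nn.Conv1d(d_inner, d_inner, d_conv, groups=d_inner,
                              padding=d_conv - 1)                     # short depthwise conv
        self.ssm = ssm if ssm is not None else nn.Identity()          # placeholder for the selective SSM
        self.out_proj = nn.Linear(d_inner, d_model, bias=False)

    def forward(self, x):                       # x: (batch, length, d_model)
        u, gate = self.in_proj(x).chunk(2, dim=-1)
        u = self.conv(u.transpose(1, 2))[..., : x.size(1)].transpose(1, 2)  # trim to causal length
        u = self.ssm(F.silu(u))                 # Conv -> activation -> SSM path
        y = u * F.silu(gate)                    # multiplicative (SwiGLU-style) gate
        return self.out_proj(y)

block = GatedSSMBlockSketch(d_model=64, d_inner=128)
out = block(torch.randn(2, 16, 64))
print(out.shape)  # torch.Size([2, 16, 64])
```

With `ssm` as the identity, this collapses to an ordinary gated (SwiGLU-like) block, which is exactly the Mamba-MLP ablation viewpoint described above.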
H3 Architecture: Training Recipes. Next we ablate differences between the Hyena and H3++ models, our weakest and strongest models outside of Transformer++ and Mamba, particularly to isolate the effect of training recipes.

• Hyena: The Hyena block with its original architecture and GPT3 training recipe (same as Figure 4).

• Hyena+: The same architecture but with the improved training recipe described above.

• H3+: The same architecture as Hyena+ but with the Hyena convolution kernel swapped out for an S4D convolution kernel.

• H3++: The same as H3+, but with a linear attention head dimension of 8. This increases computation inside the SSM recurrence but does not increase parameters.

Our general convention is that "Model+" represents the base model with the improved training recipe, and "Model++" also allows for architectural changes.

Figure 9 (Right) shows that:

• A large improvement is achieved by the improved training recipe, which was used for many of the models in the main Figure 4 (RetNet, H3++, Transformer++, Mamba).
• The choice of the inner LTI SSM does not matter (e.g. Hyena vs. S4), consistent with findings throughout this paper.
• The head dimension expansion improves performance, consistent with one of our main themes that expanded state dimension improves performance for SSMs (Section 3).

Figure 9: (Scaling laws: extra ablations.) Scaling laws on The Pile (sequence length 2048) for the ablations described above; the x-axis is FLOPs (log scale).

# E.2.3 Downstream Evaluation Details

This pretraining procedure is the same as the scaling law protocol, but extended to 300B tokens. For the 1.3B model, we use a batch size of 1M tokens to be consistent with the GPT3 specifications. We report the perplexity on the Pile validation set, and for this metric only compare to models trained on the same dataset and with the same tokenizer, in particular Pythia and RWKV.

For downstream evaluation, we use the LM evaluation harness from EleutherAI (L. Gao, Tow, et al. 2021), as done by most work in this area. We evaluate on the following tasks/datasets that measure common sense reasoning:

• LAMBADA (Paperno et al. 2016)
• HellaSwag (Zellers et al. 2019)
• PIQA (Bisk et al. 2020)
• ARC-challenge (P. Clark et al. 2018)
• ARC-easy: an easy subset of ARC-challenge
• WinoGrande (Sakaguchi et al. 2021)

We report accuracy for LAMBADA, WinoGrande, PIQA, and ARC-easy, and accuracy normalized by sequence length for HellaSwag and ARC-challenge (since normalized accuracy is higher for almost all models on these tasks).

# E.3 DNA Modeling

# E.3.1 Pretraining Details

We describe the dataset and training procedure of the HG38 pretraining task in more detail.

The dataset follows the splits from the prior Enformer work on genomics (Avsec et al. 2021); the training split contains a total of S = 34021 segments of length 2^17 = 131072 that cover the genome, for a total of approximately 4.5 billion tokens (DNA base pairs). These segments are pairs of (chromosome number, starting index, ending index), and can be extended if necessary (e.g. to get longer segments). We deviate from HyenaDNA when the training sequence length is not 2^17. HyenaDNA always takes a fixed sub-segment (e.g. the beginning or middle of the prescribed segment), and thus for any training sequence length each epoch is fixed to 34021 samples and does not necessarily go through the whole genome.
On the other hand, we use the entire training data:

• When the context length L is less than (or equal to) 2^17, we divide each segment into non-overlapping sub-segments of length L, so that there are S × 2^17 / L total samples and S × 2^17 ≈ 4.5B tokens per epoch.

• When the context length L is greater than 2^17, we turn each segment into two samples, one that begins with the prescribed segment and one that ends with the prescribed segment. Thus each epoch has 2S samples and 2SL tokens per epoch. For example, at sequence length 2^18 = 262144 there are 4× as many tokens as the default, and at sequence length 2^20 there are 16× as many tokens.

Other training details generally follow the same protocol as our language modeling experiments (Appendix E.2). For example, we use AdamW with (β1, β2) = (0.9, 0.95), no dropout, and weight decay 0.1. We use a cosine learning rate scheduler with linear warmup for 10% of total steps.

# E.3.2 Scaling: Model Size Details

Models. The models we consider are:

• Transformer++: a Transformer with improved architecture, notably the usage of RoPE positional encodings (Su et al. 2021). Informally, we found these to be noticeably better than the vanilla positional encodings from (Vaswani et al. 2017).

• HyenaDNA: the Hyena model from Nguyen, Poli, et al. (2023) and Poli et al. (2023), which is roughly a Transformer with the MHA block replaced by an H3 block using a global convolution parameterized by an MLP.

• Mamba: the standard Mamba architecture.

Model Sizes. We use the following model sizes.

Blocks            4     5     6     7     8     10     12
Model dimension   64    96    128   192   256   384    512
Params (approx.)  250K  700K  1.4M  3.5M  7.0M  19.3M  40.7M

Note that the number of blocks for Mamba is doubled, because one Transformer "layer" includes both the MHA and MLP blocks (and similarly for Hyena), which requires two Mamba blocks to match parameters (Section 3.4).

Training. For each model (Transformer++, HyenaDNA, Mamba), we swept the learning rate across {1e-3, 2e-3, 4e-3, 8e-3}. The optimal Transformer and HyenaDNA learning rates were 2e-3 across all sizes. The optimal Mamba learning rate was 8e-3; note that Mamba performed better than the baselines with matched learning rates (2e-3), but was more stable and improved even more at higher learning rates. (Furthermore, as this LR is at the upper end of the sweep, it is possible that our results are still suboptimal.)

Note that, in contrast to standard LM scaling laws (Table 12), our LR is held constant across model sizes for simplicity. The optimal LR should go down for larger models, but we did not find a noticeable effect at the small model sizes (at most a few million parameters) we considered.

# E.3.3 Scaling: Context Length Details

We use a total batch size of 2^24 ≈ 16M tokens per training step, for every sequence length (e.g. at length 2^20 there are 16 segments per batch and at length 2^10 there are 16384 segments per batch). This is a large batch size relative to the model size by usual LM standards, but note that a batch size of 2^23 is the minimum possible on a machine with 8 GPUs and sequence length of 2^20, and that HyenaDNA used much larger batches of 2^28.
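A small arithmetic sketch of this batch construction: with a fixed budget of 2^24 tokens per gradient step, the number of segments per batch halves each time the sequence length doubles.

```python
# Segments per batch under a fixed token budget of 2**24 tokens per step.
tokens_per_step = 2 ** 24  # ~16M tokens
for log_len in range(10, 21):
    seq_len = 2 ** log_len
    segments = tokens_per_step // seq_len
    print(f"L = 2^{log_len} = {seq_len:>7d}  ->  {segments:>5d} segments per batch")
# e.g. L = 2^20 gives 16 segments per batch and L = 2^10 gives 16384, matching the text.
```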
The learning rate used was 0.008 for Mamba and 0.001 for HyenaDNA; we initially attempted to use the same learning rate of 0.002 from the previous section for HyenaDNA, but found that it was unstable at the longest context length.

Sequence Length Warmup. Following (Nguyen, Poli, et al. 2023), we use sequence length warmup (SLW) during pretraining. We choose a simple schedule of 2 epochs at each power-of-two sequence length, starting from 2^10 = 1024. (Note that because of how the data is curated, proportionally more steps and tokens are spent at the longest sequence lengths. In particular, each stage up to length 2^17 processes the same number of tokens, but 4× as many tokens are processed at length 2^18, 8× as many at length 2^19, and 16× as many at length 2^20.)

Unlike HyenaDNA, we always control for the number of tokens per gradient update, so the batch size is successively halved as the sequence lengths are doubled in each stage.

Remark E.1. We also note that the schedule was not tuned, and we never experimented with turning off sequence length warmup for these pretraining experiments. We later found that SLW did not help noticeably for audio pretraining at similar lengths (Section 4.4), and it is possible that it is not necessary for DNA pretraining either.

# E.3.4 Species (Great Apes) Classification

Models are causal, and therefore only the last element (across the sequence length) of the model's output is used for the classification head. Note that we control for the total number of elements in the loss function per gradient step. The pretraining objective includes all positions across the sequence length, so that batch_size × sequence_length is held constant; in other words, the batch size decreases as the sequence length increases. However, for a classification task, since only the last position enters the loss, the batch size itself is held constant. Note that this also means that fine-tuning models with longer sequence lengths is more computationally expensive.

Training consists of 10 epochs, each of which has 1024 gradient steps. Each gradient step uses batch size 64, where each sample is independently randomly drawn by uniformly picking a species, uniformly picking a chromosome, and then uniformly picking a contiguous segment of DNA. Following (Nguyen, Poli, et al. 2023), models with a maximum context length greater than 2^14 = 16384 use sequence length warmup with 1 epoch at length 2^14 = 16384, 1 epoch at length 2^15 = 32768, 1 epoch at length 2^16 = 65536, and so on up to the maximum sequence length.
For example, the model with 2^20 = 1048576 context undergoes 6 epochs of sequence length warmup before 4 more epochs at its maximum sequence length.

The learning rate for all Hyena models is 4e-5, while the learning rate for all Mamba models is 1e-4. These were found by performing learning rate sweeps for each model among {1e-5, 2e-5, 4e-5, 1e-4, 2e-4} for the smaller sequence lengths (2^10, 2^12, 2^14, 2^16), and these values were consistently found to be the best for each model. An abridged learning rate sweep was done at length 2^18, which agreed with these values, and a single run at length 2^20 was performed (as described above, the computational cost of these experiments is proportional to the sequence length). The learning rate followed a cosine decay schedule with 5 epochs of linear warmup to the maximum learning rate and 5 epochs of cosine decay down to 1e-6. The unusually long learning rate warmup schedule was chosen because the sequence length warmup was also long (e.g. comprising 6 out of 10 epochs for the model with context length 2^20); we did not experiment with this choice.

Results for the Species classification task are in Table 13.

Table 13: (Great Apes DNA Classification.) Accuracy after fine-tuning on sequences of length 2^10 = 1024 up to 2^20 = 1048576, using pretrained models of the same context length. Random guessing is 20%.

# E.4 Audio Details

# E.4.1 YouTubeMix Audio Pretraining

Model. We use a model with 3 blocks per stage (3 × 5 = 15 total Mamba blocks), pooling factor p = 16, and outer dimension D = 64, for about 3.5M parameters.

Dataset. The data is mu-law encoded at 8 bits, so the model is modeling discrete tokens with a vocab size of 256. The dataset consists of clips of up to 1 minute long, or length 960000, which is subsampled and divided into segments of any desired sequence length. Since the architecture involves two stages of pooling by a factor of 16,
and we want the resulting sequence length to be a multiple of 8 for hardware efficiency, the longest possible sequence is 468 × 2048 = 958464. The rest of our sequence lengths are defined by successively halving this and rounding up to the nearest multiple of 2048.

Table 14 lists the specifications used in Figure 7. Beyond the varying batch sizes, the number of valid segments in the training set varied between different sequence lengths (e.g. the number of training steps per epoch was not constant for different points in the graph), which may have contributed to kinks in the scaling curves.

Table 14: YouTubeMix length scaling sequence lengths and batch sizes.

Sequence length        Batch size  Tokens per batch
468 × 2048 = 958464    1           958464
234 × 2048 = 479232    2           958464
117 × 2048 = 239616    4           958464
59 × 2048 = 120832     8           966656
30 × 2048 = 61440      16          983040
15 × 2048 = 30720      32          983040
8 × 2048 = 16384       64          1048576
4 × 2048 = 8192        128         1048576

Training. Models were trained for 200K training steps with a maximum learning rate of 0.002, 20K (10%) warmup steps, and weight decay 0.1 (similar to our general pretraining recipe across domains).

Additional Ablations: SSM Parameterizations. We investigate SSM parameterizations on long-form audio waveform pretraining in the setting of Figure 7. The setting is modified slightly to use larger models (8 layers and D = 64 for 6M params, the SaShiMi default), shorter sequences (2^11 = 2048 to 2^18 = 262144 instead of 2^13 to 2^20), lower LR (0.001 from 0.002), and shorter training cycles (100K instead of 200K steps).

Figure 10: (Audio Pretraining (YouTubeMix) Ablations.) As a uniformly-sampled "continuous" signal modality, audio waveforms actually benefit from LTI models which have matching inductive bias. (Left) Homogenous models (all blocks have the same parameterization). (Right) Only the center U-Net blocks are ablated; the outer blocks are Mamba-S4. The purple line is the same as in the figure on the left.

Figure 10 shows that the change from S4 → S6 (i.e. the selection mechanism) is not always beneficial. On long-form audio waveforms, it in fact significantly hampers performance, which may be intuitive from the point of view that audio is uniformly sampled and very smooth, and therefore benefits from continuous linear time-invariant (LTI) methods. After ablating away the selection mechanism, note that the resulting model is the S4 layer inside the Mamba block. To disambiguate, we call this Mamba-S4, as opposed to the default Mamba architecture Mamba-S6.

However, on the right side, we keep the outer layers of the U-Net Mamba-S4 and ablate only the inner layers. The performance differences shrink dramatically; this reinforces the hypothesis that layers closer to the raw audio signal should be LTI, but once they are "tokenized" and compressed by the outer layers, the inner layers no longer need to be LTI. In this setting, however, the real-valued SSM still underperforms the complex-valued one.

# E.4.2 SC09 Speech Generation

Autoregressive training largely followed the autoregressive language modeling protocol, such as:

• weight decay 0.1
• learning rate warmup for 10% of total steps
• AdamW optimizer with β = (0.9, 0.95)
• gradient clip value 0.1

We used a learning rate of 0.002 and 200000 training steps at a batch size of 16.

The large Mamba model in Table 4 has 15 layers per stage with an outer dimension of D = 96 and pooling factor 4. We note that this dataset is small (training went through 100 epochs) and for this large model there was significant overfitting of the BPB or NLL. However, automated metrics of generated samples continually improved throughout training.

The models in the architecture ablations in Table 5 all have 8 layers per stage with an outer dimension of D = 64 and pooling factor 4. The S4+MLP block has roughly 2D^2 + 4D^2 parameters (expansion factor 2 in the MLP). The Transformer block has 4D^2 + 2D^2 parameters (expansion factor 1 in the MLP). The Mamba block has the usual ≈ 6D^2 parameters. All models have roughly 6M total parameters.
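A quick arithmetic check of these per-block counts (a sketch; the formulas are the approximations quoted above and ignore biases, norms, and the comparatively small SSM-specific parameters).

```python
# Approximate parameters per block at D = 64, showing the blocks are parameter-matched.
D = 64
per_block = {
    "S4+MLP": 2 * D**2 + 4 * D**2,                 # expansion factor 2 in the MLP
    "Transformer (MHA+MLP)": 4 * D**2 + 2 * D**2,  # expansion factor 1 in the MLP
    "Mamba": 6 * D**2,
}
for name, params in per_block.items():
    print(f"{name:>22s}: ~{params} parameters per block")
# All three come out to ~6 * 64^2 = 24576 parameters per block, consistent with the
# models totalling roughly the same (~6M) number of parameters.
```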
# E.5 Efficiency Benchmark

Scan Operation. We compare the core operation of selective SSMs, which is the parallel scan (Section 3.3), against convolution and attention, measured on an A100 80GB PCIe GPU. Note that these measurements do not include the cost of other operations outside of this core operation, such as computing the convolutional kernel in global-convolution models, or computing the QKV projections in attention.

As a baseline, we implement a standard parallel scan in PyTorch with no kernel fusion. This requires materializing the parameters A, B, C in HBM.

Our scan implementation fuses the discretization step and the parallel scan, avoiding the cost of materializing all the large parameters in HBM.

For convolution, we use the standard implementation in PyTorch, which separately performs FFTs on the inputs and the filters, multiplies them in the frequency domain, then performs an inverse FFT to obtain the result. The theoretical complexity is O(L log L) for sequence length L.

For attention, we compare against the fastest implementation that we are aware of (FlashAttention-2 (Dao 2023)), with causal mask. Note that FlashAttention-2 with causal mask is about 1.7× faster than without causal mask, since approximately only half of the attention entries are computed. We use a batch size of 1 and increase the sequence length from 2^9 = 512, 2^10 ≈ 1K, 2^11 ≈ 2K, up to 2^19 ≈ 500K (some of the baselines run out of memory before reaching 500K). We use a model dimension of D = 1024 and state dimension N = 16. We measure with BF16 inputs, which is the data type most commonly used for large scale training.

End-to-end Inference. We measure the inference throughput of a Mamba 1.4B model and an untrained Mamba 6.9B model, against a standard Transformer (GPT3 architecture) at 1.3B and 6.7B size. We use the standard Transformer implementation in the Hugging Face transformers library.

We set the prompt length to be 2048 and the generation length to be 128. We vary the batch size over 1, 2, 4, 8, 16, 32, 64, and 128, and measure the time taken to generate 128 tokens. We then calculate the throughput (tokens/s) as batch size × 128 / time taken. We repeat the measurements 3 times and take the average. Measurements are done on an A100 80GB PCIe GPU.
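A sketch of this throughput measurement; `generate` is a placeholder for any model's generation call (not a specific library API), and the dummy generator below only makes the sketch self-contained.

```python
# Throughput = batch_size * generated_tokens / wall-clock time, averaged over repeats.
import time

def benchmark_throughput(generate, batch_size, prompt_len=2048, gen_len=128, repeats=3):
    times = []
    for _ in range(repeats):
        start = time.time()
        generate(batch_size=batch_size, prompt_len=prompt_len, max_new_tokens=gen_len)
        times.append(time.time() - start)
    avg = sum(times) / len(times)
    return batch_size * gen_len / avg   # tokens per second

if __name__ == "__main__":
    # Stand-in generator so the sketch runs end to end.
    dummy = lambda batch_size, prompt_len, max_new_tokens: time.sleep(0.01)
    for bs in [1, 2, 4, 8, 16, 32, 64, 128]:
        print(bs, f"{benchmark_throughput(dummy, bs):.0f} tokens/s")
```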
Memory Benchmark. The memory usage simply scales proportionally to the size of the activation tensors, as with most deep sequence models. We report measurements of the training memory requirements of 125M models on one A100 80GB GPU. Each batch consists of sequences of length 2048. We compare to the most memory-efficient Transformer implementation we are aware of (with kernel fusion from torch.compile and with FlashAttention-2).

Table 15: (Memory benchmark.) Mamba's memory footprint is comparable to the most optimized Transformer. Results for 125M models.

Batch size  Transformer (w/ FlashAttention-2)  Mamba
1           4.6GB                              4.8GB
2           5.2GB                              5.8GB
4           6.9GB                              7.3GB
8           11.5GB                             12.3GB
16          20.7GB                             23.1GB
32          34.5GB                             38.2GB

Table 15 shows that Mamba's memory requirement is comparable to a similar-sized Transformer with an extremely optimized implementation, and we expect further improvement in Mamba's memory footprint in the future.
An enormous body of research has appeared on more eï¬ cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬ ective. As of yet, none of these variants have been shown to be empirically eï¬ ective at scale across domains. Recently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling.', chunk_index=2, num_tokens=366, metadata={}), ResponseChunk(id='chunk_39635a09-8d44-4c24-b034-10704e15e699', content='These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960). This class of models can be computed very eï¬ ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length. Additionally, they have principled Equal contribution. 1 mechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021). Many ï¬ avors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 2022; Saon, Gupta, and Cui 2023). However, they have been less eï¬ ective at modeling discrete and information-dense data such as text. We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length. Selection Mechanism. First, we identify a key limitation of prior models: the ability to eï¬ ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs).', chunk_index=3, num_tokens=368, metadata={}), ResponseChunk(id='chunk_ac9b2781-284a-4eb0-8d8f-29d9aab95998', content='Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input. This allows the model to ï¬ lter out irrelevant information and remember relevant information indeï¬ nitely. Hardware-aware Algorithm. This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬ cient. We overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬ erent levels of the GPU memory hierarchy. The resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3à faster on A100 GPUs). Architecture. We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. 
Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and efficiency together yield performance improvements on real data up to sequence length 1M.', chunk_index=4, num_tokens=395, metadata={}), ResponseChunk(id='chunk_f11aca83-3e86-448b-9532-855c93b8999b', content='We empirically validate Mamba’s potential as a general sequence FM backbone, in both pretraining quality and domain-specific task performance, on several types of modalities and settings: • Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indefinitely long (>1M tokens). • Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transformers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences. • Language Modeling. Mamba is the first linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). Our Mamba language model has 5× generation throughput compared to Transformers of similar size, and Mamba-3B’s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B). Model code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba. Figure 1: (Overview.) Structured SSMs independently map each channel (e.g.', chunk_index=5, num_tokens=398, metadata={}), ResponseChunk(id='chunk_d4ee6195-6047-4e68-a031-38aa81cea513', content='D = 5) of an input x to output y through a higher dimensional latent state h (e.g. N = 4). Prior SSMs avoid materializing this large effective state (DN, times batch size B and sequence length L) through clever alternate computation paths requiring time-invariance: the (Δ, A, B, C) parameters are constant across time. Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy. # 2 State Space Models Structured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. 
They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence x(t) ∈ ℝ ↦ y(t) ∈ ℝ through an implicit latent state h(t) ∈ ℝ^N. Concretely, S4 models are defined with four parameters (Δ, A, B, C), which define a sequence-to-sequence transformation in two stages. h′(t) = Ah(t) + Bx(t) (1a)   y(t) = Ch(t) (1b)   h_t = Āh_{t-1} + B̄x_t (2a)   y_t = Ch_t (2b)   K̄ = (CB̄, CĀB̄, …, CĀ^k B̄, …) (3a)   y = x ∗ K̄ (3b) Discretization. The first stage transforms the “', chunk_index=6, num_tokens=421, metadata={}), ResponseChunk(id='chunk_5ed80602-1a9d-4713-8610-b030888a931b', content='continuous parameters” (Δ, A, B) to “discrete parameters” (Ā, B̄) through fixed formulas Ā = f_A(Δ, A) and B̄ = f_B(Δ, A, B), where the pair (f_A, f_B) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) defined in equation (4). Ā = exp(ΔA)   B̄ = (ΔA)^{-1}(exp(ΔA) - I) · ΔB (4) Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023). It also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5. However, from a mechanical point of view discretization can simply be viewed as the first step of the computation graph in the forward pass of an SSM. Alternate flavors of SSMs can bypass the discretization step and parameterize (Ā, B̄) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation. After the parameters have been transformed from (Δ, A, B, C) ↦ (Ā, B̄, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3). Commonly, the model uses the convolutional mode (3) for effi', chunk_index=7, num_tokens=405, metadata={}), ResponseChunk(id='chunk_30fcbdd3-5bf9-4a62-8711-b68475ab2961', content='cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for efficient autoregressive inference (where the inputs are seen one timestep at a time). Linear Time Invariance (LTI). An important property of equations (1) to (3) is that the model’s dynamics are constant through time. In other words (Δ, A, B, C), and consequently (Ā, B̄) as well, are fixed for all time-steps. This property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models. Thus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental efficiency constraints, discussed in Section 3.3. However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the efficiency bottlenecks. Structure and Dimensions. Finally, we note that structured SSMs are so named because computing them efficiently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 
2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use. In this case, the A â â ð à ð , B â â ð à 1, C â â 1à ð matrices can all be represented by ð numbers. To operate over an input sequence ð ¥ of batch size ð µ and length ð', chunk_index=8, num_tokens=403, metadata={}), ResponseChunk(id='chunk_680bad0d-9fe2-4d78-867b-c5173078cdae', content='¿ with ð · channels, the SSM is applied independently to each channel. Note that in this case, the total hidden state has dimension ð ·ð per input, and computing it over the sequence length requires ð (ð µð ¿ð ·ð ) time and memory; this is the root of the fundamental eï¬ ciency bottleneck addressed in Section 3.3. General State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state. It has been used to refer to many disparate concepts in diï¬ erent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬ lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning). Throughout this entire paper we use the term â SSMâ to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary. SSM Architectures.', chunk_index=9, num_tokens=400, metadata={}), ResponseChunk(id='chunk_a8f2e857-2ee5-430e-bc18-105773887558', content='SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines. â ¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM. â ¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer. â ¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021). â ¢ RetNet (Y. Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions. 4 â ¢ RWKV (B. Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)). Its main â WKVâ mechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. 
Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B).', chunk_index=10, num_tokens=378, metadata={}), ResponseChunk(id='chunk_84967934-1624-4c40-9ce9-0cd6f318c024', content='We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬ ciently. We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5). # 3.1 Motivation: Selection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state. In fact, we can view the tradeoï¬ s of popular sequence models from this point of view. For example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬ cient because they have a ï¬ nite state, implying constant-time inference and linear-time training. However, their eï¬ ectiveness is limited by how well this state has compressed the context. To understand this principle, we focus on two running examples of synthetic tasks (Figure 2). â ¢ The Selective Copying task modiï¬', chunk_index=11, num_tokens=399, metadata={}), ResponseChunk(id='chunk_9e96742c-513f-42a0-94a7-ce80366c2dd5', content='es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬ lter out the irrelevant ones (white). â ¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black). These tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2). More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels. In summary, the eï¬ ciency vs. 
eï¬ ectiveness tradeoï¬ of sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).', chunk_index=12, num_tokens=402, metadata={}), ResponseChunk(id='chunk_3be0d952-e791-467c-999a-033e6aa0d2ca', content='# Improving SSMs with Selection One method of incorporating a selection mechanism into models is by letting their parameters that aï¬ ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the convolution kernel of a CNN) be input-dependent. 5 Copying Output noo am > mt HE nee Tt Solution # Tetons | # oO S lective Copying # aoe # i) # [coe # Induction Heads # EES > # fo Perfectly solved by LTI (e.g. convolutional) models that do not need to look at the actual inputs Hi i Hl ] Bw H a H > BH Figure 2: (Left) The standard version of the Copying task involves constant spacing between input and output elements and is easily solved by time-invariant models such as linear recurrences and global convolutions. (Right Top) The Selective Copying task has random spacing in between inputs and requires time-varying models that can selectively remember or ignore inputs depending on their content. (Right Bottom) The Induction Heads task is an example of associative recall that requires retrieving an answer based on context, a key ability for LLMs. Algorithm 2 SSM + Selection (S6) Input: ð ¥ â ¶ (ð ±, ð », ð ³) Output: ð ¦ â ¶ (ð ±, ð », ð ³) 1: A â ¶ (ð ³, ð ½) â ð ¯ð ºð ð ºð ð ¾ð ð ¾ð â ³ Represents structured ð à ð matrix â ³ Represents structured ð à ð matrix 2: B â ¶ (ð ³, ð ½) â ð ¯ð ºð ð ºð ð ¾ð ð ¾ð 3: C â ¶ (ð ³, ð ½) â ð ¯ð ºð ð ºð ð ¾ð ð ¾ð 4: â â', chunk_index=13, num_tokens=443, metadata={}), ResponseChunk(id='chunk_42ebb06a-521a-4644-85d6-51accc0b128f', content='¶ (ð ³) â ð â (ð ¯ð ºð ð ºð ð ¾ð ð ¾ð ) 5: A, B â ¶ (ð ³, ð ½) â ð ½ð ð ð ¼ð ð ¾ð ð ð ð ¾(â , A, B) 6: ð ¦ â ð ²ð ²ð ¬(A, B, C)(ð ¥) 2: B â ¶ (ð ±, ð », ð ½) â ð ð µ(ð ¥) 3: C â ¶ (ð ±, ð », ð ½) â ð ð ¶(ð ¥) 4: â â ¶ (ð ±, ð », ð ³) â ð â (ð ¯ð ºð ð ºð ð ¾ð ð ¾ð +ð â (ð ¥)) 5: A, B â ¶ (ð ±, ð », ð ³, ð ½) â ð ½ð ð ð ¼ð ð ¾ð ð ð ð ¾(â , A, B) 6: ð ¦ â ð ²ð ²ð ¬(A, B, C)(ð ¥) â ³ Time-invariant: recurrence or convolution â ³ Time-varying: recurrence (scan) only 7: return ð ¦ 7: return ð ¦ Algorithms 1 and 2 illustrates the main selection mechanism that we use. The main diï¬ erence is simply making several parameters â , B, C functions of the input, along with the associated changes to tensor shapes throughout. In particular, we highlight that these parameters now have a length dimension ð ¿, meaning that the model has changed from time-invariant to time-varying. (Note that shape annotations were described in Section 2). 
This loses the equivalence to convolutions (3) with implications for its eï¬ ciency, discussed next.', chunk_index=14, num_tokens=459, metadata={}), ResponseChunk(id='chunk_ed6ddb18-add1-4770-86e8-82dec84b092c', content='We speciï¬ cally choose ð ð µ(ð ¥) = ð «ð ð ð ¾ð ºð ð (ð ¥), ð ð ¶(ð ¥) = ð «ð ð ð ¾ð ºð ð (ð ¥), ð â (ð ¥) = ð ¡ð ð ð ºð ½ð ¼ð ºð ð ð ·(ð «ð ð ð ¾ð ºð 1(ð ¥)), and ð â = ð ð ð ¿ð ð ð ð ð , where ð «ð ð ð ¾ð ºð ð is a parameterized projection to dimension ð . The choice of ð â and ð â is due to a connection to RNN gating mechanisms explained in Section 3.5. # 3.3 Efficient Implementation of Selective SSMs Hardware-friendly architectures such as convolutions (Krizhevsky, Sutskever, and Hinton 2012) and Transform- ers (Vaswani et al. 2017) enjoy widespread application. Here we aim to make selective SSMs eï¬ cient on modern hardware (GPU) as well. The selection mechanism is quite natural, and earlier works attempted to incorporate special cases of selection, such as letting â vary over time in recurrent SSMs (Gu, Dao, et al. 2020). However, as previously mentioned a core limitation in the usage of SSMs is their computational eï¬ ciency, which was why S4 and all derivatives used LTI (non-selective) models, most commonly in the form of global convolutions. # 3.3.1 Motivation of Prior Models We ï¬ rst revisit this motivation and overview our approach to overcome limitations of prior methods. â ¢ At a high level, recurrent models such as SSMs always balance a tradeoï¬', chunk_index=15, num_tokens=425, metadata={}), ResponseChunk(id='chunk_cd508d1b-20d7-482d-b820-b0e7cfd510d7', content='between expressivity and speed: as discussed in Section 3.1, models with larger hidden state dimension should be more eï¬ ective but slower. Thus 6 we want to maximize hidden state dimension without paying speed and memory costs. â ¢ Note that the recurrent mode is more ï¬ exible than the convolution mode, since the latter (3) is derived from expanding the former (2) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021). However, this would require computing and materializing the latent state â with shape (ð ±, ð », ð ³, ð ½), much larger (by a factor of ð , the SSM state dimension) than the input ð ¥ and output ð ¦ of shape (ð ±, ð », ð ³). Thus the more eï¬ cient convolution mode was introduced which could bypass the state computation and materializes a convolution kernel (3a) of only (ð ±, ð », ð ³). â ¢ Prior LTI SSMs leverage the dual recurrent-convolutional forms to increase the eï¬ ective state dimension by a factor of ð (â 10 â 100), much larger than traditional RNNs, without eï¬ ciency penalties. # 3.3.2 Overview of Selective Scan: Hardware-Aware State Expansion The selection mechanism is designed to overcome the limitations of LTI models; at the same time, we therefore need to revisit the computation problem of SSMs. We address this with three classical techniques: kernel fusion, parallel scan, and recomputation. We make two main observations: â ¢ The naive recurrent computation uses ð (ð µð ¿ð ·ð ) FLOPs while the convolutional computation uses ð (ð µð ¿ð · log(ð', chunk_index=16, num_tokens=399, metadata={}), ResponseChunk(id='chunk_cc7ea622-61b7-43f3-8956-47071689a762', content='¿)) FLOPs, and the former has a lower constant factor. Thus for long sequences and not-too-large state dimension ð , the recurrent mode can actually use fewer FLOPs. â ¢ The two challenges are the sequential nature of recurrence, and the large memory usage. 
To address the latter, just like the convolutional mode, we can attempt to not actually materialize the full state â . The main idea is to leverage properties of modern accelerators (GPUs) to materialize the state â only in more eï¬ cient levels of the memory hierarchy. In particular, most operations (except matrix multiplication) are bounded by memory bandwidth (Dao, Fu, Ermon, et al. 2022; Ivanov et al. 2021; Williams, Waterman, and Patterson 2009). This includes our scan operation, and we use kernel fusion to reduce the amount of memory IOs, leading to a signiï¬ cant speedup compared to a standard implementation. Concretely, instead of preparing the scan input (A, B) of size (ð ±, ð », ð ³, ð ½) in GPU HBM (high-bandwidth memory), we load the SSM parameters (â , A, B, C) directly from slow HBM to fast SRAM, perform the discretization and recurrence in SRAM, and then write the ï¬ nal outputs of size (ð ±, ð », ð ³) back to HBM. To avoid the sequential recurrence, we observe that despite not being linear it can still be parallelized with a work-eï¬ cient parallel scan algorithm (Blelloch 1990; Martin and Cundy 2018; Smith, Warrington, and Linderman 2023). Finally, we must also avoid saving the intermediate states, which are necessary for backpropagation.', chunk_index=17, num_tokens=389, metadata={}), ResponseChunk(id='chunk_8a1ea63f-0902-4066-8e5f-527e615a8808', content='We carefully apply the classic technique of recomputation to reduce the memory requirements: the intermediate states are not stored but recomputed in the backward pass when the inputs are loaded from HBM to SRAM. As a result, the fused selective scan layer has the same memory requirements as an optimized transformer implementation with FlashAttention. Details of the fused kernel and recomputation are in Appendix D. The full Selective SSM layer and algorithm is illustrated in Figure 1. # 3.4 A Simplified SSM Architecture As with structured SSMs, selective SSMs are standalone sequence transformations that can be ï¬ exibly incorporated into neural networks. The H3 architecture is the basis for the most well-known SSM architectures (Section 2), which are generally comprised of a block inspired by linear attention interleaved with an MLP (multi-layer perceptron) block. We simplify this architecture by combining these two components into one, which is stacked homogenously (Figure 3). This is inspired by the gated attention unit (GAU) (Hua et al. 2022), which did something similar for attention. This architecture involves expanding the model dimension ð · by a controllable expansion factor ð ¸. For each block, most of the parameters (3ð ¸ð ·2) are in the linear projections (2ð ¸ð ·2 for input projections, ð ¸ð ·2 for output projection) while the inner SSM contributes less. The number of SSM parameters (projections for â , B, C, and 7 Linear projection Sequence transformation Nonlinearity (activation multiplication) H3 ®@ Gated MLP â Mamba # or Figure 3: (Architecture.) Our simplified block design combines the H3 block, which is the basis of most SSM architectures, with the ubiquitous MLP block of modern neural networks. Instead of interleaving these two blocks, we simply repeat the Mamba block homogenously.', chunk_index=18, num_tokens=400, metadata={}), ResponseChunk(id='chunk_bf4aa797-48e2-4534-92c1-cd5b44ca4efa', content='Compared to the H3 block, Mamba replaces the first multiplicative gate with an activation function. Compared to the MLP block, Mamba adds an SSM to the main branch. 
For ð we use the SiLU / Swish activation (Hendrycks and Gimpel 2016; Ramachandran, Zoph, and Quoc V Le 2017). the matrix A) are much smaller in comparison. We repeat this block, interleaved with standard normalization and residual connections, to form the Mamba architecture. We always ï¬ x to ð ¸ = 2 in our experiments and use two stacks of the block to match the 12ð ·2 parameters of a Transformerâ s interleaved MHA (multi-head attention) and MLP blocks. We use the SiLU / Swish activation function (Hendrycks and Gimpel 2016; Ramachandran, Zoph, and Quoc V Le 2017), motivated so that the Gated MLP becomes the popular â SwiGLUâ variant (Chowdhery et al. 2023; Shazeer 2020; Touvron et al. 2023). Finally, we additionally use an optional normalization layer (we choose LayerNorm (J. L. Ba, Kiros, and Hinton 2016)), motivated by RetNetâ s usage of a normalization layer in a similar location (Y. Sun et al. 2023). # 3.5 Properties of Selection Mechanisms The selection mechanism is a broader concept that can be applied in diï¬ erent ways, such as to more traditional RNNs or CNNs, to diï¬ erent parameters (e.g. A in Algorithm 2), or using diï¬ erent transformations ð (ð ¥). # 3.5.1 Connection to Gating Mechanisms', chunk_index=19, num_tokens=389, metadata={}), ResponseChunk(id='chunk_c87057ef-5a12-43b5-aed2-8f3633e354e3', content='We highlight the most important connection: the classical gating mechanism of RNNs is an instance of our selection mechanism for SSMs. We note that the connection between RNN gating and the discretization of continuous-time systems is well established (Funahashi and Nakamura 1993; Tallec and Ollivier 2018). In fact, Theorem 1 is an improvement of Gu, Johnson, Goel, et al. (2021, Lemma 3.1) generalizing to the ZOH discretization and input-dependent gates (proof in Appendix C). More broadly, â in SSMs can be seen to play a generalized role of the RNN gating mechanism. In line with prior work, we adopt the view that discretization of SSMs is the principled foundation of heuristic gating mechanisms. Theorem 1. When ð = 1, A = â 1, B = 1, ð â = ð «ð ð ð ¾ð ºð (ð ¥), and ð â = ð ð ð ¿ð ð ð ð ð , then the selective SSM recurrence (Algorithm 2) takes the form ð ð ¡ = ð (ð «ð ð ð ¾ð ºð (ð ¥ð ¡)) â ð ¡ = (1 â ð ð ¡)â ð ¡â 1 + ð ð ¡ð ¥ð ¡. (5) As mentioned in Section 3.2, our speciï¬ c choices of ð â , ð â is from this connection. In particular, note that if a given input ð ¥ð ¡ should be completely ignored (as necessary in the synthetic tasks), all ð · channels should ignore it, and so we project the input down to 1 dimension before repeating/broadcasting with â . 8 # Interpretation of Selection Mechanisms We elaborate on two particular mechanistic eï¬ ects of selection. Variable Spacing.', chunk_index=20, num_tokens=424, metadata={}), ResponseChunk(id='chunk_0ad9aad4-a836-4ee3-b319-b058e5262425', content='Selectivity allows ï¬ ltering out irrelevant noise tokens that may occur between inputs of interest. This is exempliï¬ ed by the Selective Copying task, but occurs ubiquitously in common data modalities, particularly for discrete data â for example the presence of language ï¬ llers such as â umâ . This property arises because the model can mechanistically ï¬ lter out any particular input ð ¥ð ¡, for example in the gated RNN case (Theorem 1) when ð ð ¡ â 0. It has been empirically observed that many sequence models do not improve with longer Filtering Context. context (F. Shi et al. 2023), despite the principle that more context should lead to strictly better performance. 
An explanation is that many sequence models cannot eï¬ ectively ignore irrelevant context when necessary; an intuitive example are global convolutions (and general LTI models). On the other hand, selective models can simply reset their state at any time to remove extraneous history, and thus their performance in principle improves monotonicly with context length (e.g. Section 4.3.2). In settings where multiple independent sequences are stitched together, Transformers Boundary Resetting. can keep them separate by instantiating a particular attention mask, while LTI models will bleed information between the sequences. Selective SSMs can also reset their state at boundaries (e.g. â ð ¡ â â or Theorem 1 when ð ð ¡ â 1). These settings may occur artiï¬ cially (e.g. packing documents together to improve hardware utilization) or naturally (e.g. episode boundaries in reinforcement learning (Lu et al. 2023)). Additionally, we elaborate on eï¬ ects of each selective parameter. In general, â controls the balance between how much to focus or ignore the current input Interpretation of â . ð ¥ð ¡.', chunk_index=21, num_tokens=398, metadata={}), ResponseChunk(id='chunk_a9864c86-75c6-4de0-bb35-b36bee8e5dd1', content='It generalizes RNN gates (e.g. ð ð ¡ in Theorem 1), mechanically, a large â resets the state â and focuses on the current input ð ¥, while a small â persists the state and ignores the current input. SSMs (1)-(2) can be interpreted as a continuous system discretized by a timestep â , and in this context the intuition is that large â â â represents the system focusing on the current input for longer (thus â selectingâ it and forgetting its current state) while a small â â 0 represents a transient input that is ignored. Interpretation of A. We remark that while the A parameter could also be selective, it ultimately aï¬ ects the model only through its interaction with â via A = exp(â A) (the discretization (4)). Thus selectivity in â is enough to ensure selectivity in (A, B), and is the main source of improvement. We hypothesize that making A selective in addition to (or instead of) â would have similar performance, and leave it out for simplicity. Interpretation of B and C. As discussed in Section 3.1, the most important property of selectivity is ï¬ ltering out irrelevant information so that a sequence modelâ s context can be compressed into an eï¬ cient state. In an SSM, modifying B and C to be selective allows ï¬ ner-grained control over whether to let an input ð ¥ð ¡ into the state â ð ¡ or the state into the output ð ¦ð ¡. These can be interpreted as allowing the model to modulate the recurrent dynamics based on content (input) and context (hidden states) respectively. 3.6 Additional Model Details Real vs. Complex. Most prior SSMs use complex numbers in their state â', chunk_index=22, num_tokens=378, metadata={}), ResponseChunk(id='chunk_358b7a88-dc66-4cae-ac67-41da45319e8e', content=', which is necessary for strong performance on many tasks (Gu, Goel, and Ré 2022). However, it has been empirically observed that completely real-valued SSMs seem to work ï¬ ne, and possibly even better, in some settings (Ma et al. 2023). We use real values as the default, which work well for all but one of our tasks; we hypothesize that the complex-real tradeoï¬ is related to the continuous-discrete spectrum in data modalities, where complex numbers are helpful for continuous modalities (e.g. audio, video) but not discrete (e.g. text, DNA). 9 Initialization. 
Most prior SSMs also suggest special initializations, particularly in the complex-valued case, which can help in several settings such as low-data regimes. Our default initialization for the complex case is S4D-Lin and for the real case is S4D-Real (Gu, Gupta, et al. 2022), which is based on the HIPPO theory (Gu, Dao, et al. 2020). These deï¬ ne the ð -th element of A as â 1â 2 + ð ð and â (ð + 1) respectively. However, we expect many initializations to work ï¬ ne, particularly in the large-data and real-valued SSM regimes; some ablations are considered in Section 4.6. Parameterization of â . We deï¬ ned the selective adjustment to â as ð â (ð ¥) = ð ¡ð ð ð ºð ½ð ¼ð ºð ð ð ·(ð «ð ð ð ¾ð ºð 1(ð ¥)), which was motivated by the mechanics of â (Section 3.5). We observe that it can be generalized from dimension 1 to a larger dimension ð . We set this to be a small fraction of ð', chunk_index=23, num_tokens=414, metadata={}), ResponseChunk(id='chunk_ec32cf2b-97c0-4b0e-8500-6d09d2df8579', content='³, which uses a negligible number of parameters compared to the main Linear projections in the block. We additionally note that the broadcasting operation can instead be viewed as another Linear projection, initialized to a speciï¬ c pattern of 1â s and 0â s; if this projection is trainable, this leads to the alternative ð â (ð ¥) = ð «ð ð ð ¾ð ºð ð ·(ð «ð ð ð ¾ð ºð ð (ð ¥)), which can be viewed as a low-rank projection. In our experiments, the â parameter (which can be viewed as a bias term) is initialized to ð â 1 â following prior work on SSMs (Gu, Johnson, Timalsina, et al. 2023). Remark 3.1. For brevity in our experimental results, we sometimes abbreviate selective SSMs as S6 models, because they are S4 models with a selection mechanism and computed with a scan. # 4 Empirical Evaluation In Section 4.1 we test Mambaâ s ability to solve the two synthetic tasks motivated in Section 3.1. We then evaluate on three domains, each evaluated on autoregressive pretraining as well as downstream tasks. Section 4.2: language model pretraining (scaling laws), and zero-shot downstream evaluation. Section 4.3: DNA sequence pretraining, and ï¬ ne-tuning on a long-sequence classiï¬ cation task. Section 4.4: audio waveform pretraining, and the quality of autoregressively generated speech clips. Finally, Section 4.5 shows Mambaâ s computational eï¬ ciency at both training and inference time, and Section 4.6 ablates various components of the architecture and selective SSMs. # 4.1 Synthetic Tasks', chunk_index=24, num_tokens=394, metadata={}), ResponseChunk(id='chunk_d6c894ab-cf3b-4d73-adad-941d166d315d', content='Full experiment details for these tasks including task details and training protocol are in Appendix E.1. # 4.1.1 Selective Copying The Copying task is one of the most well-studied synthetic tasks for sequence modeling, originally designed to test the memorization abilities of recurrent models. As discussed in Section 3.1, LTI SSMs (linear recurrences and global convolutions) can easily solve this task by only keeping track of time instead of reasoning about the data; for example, by constructing a convolution kernel of exactly the right length (Figure 2). This was explicitly validated in earlier work on global convolutions (Romero et al. 2021). The Selective Copying task prevents this shortcut by randomizing the spacing between tokens. Note that this task has been introduced before as the Denoising task (Jing et al. 2019). 
Note that many previous works argue that adding architecture gating (multiplicative interactions) can endow models with â data-dependenceâ and solve related tasks (Dao, Fu, Saab, et al. 2023; Poli et al. 2023). However, we ï¬ nd this explanation insuï¬ cient intuitively because such gating does not interact along the sequence axis, and cannot aï¬ ect the spacing between tokens. In particular architecture gating is not an instance of a selection mechanism (Appendix A). Table 1 conï¬ rms that gated architectures such as H3 and Mamba only partially improve performance, while the selection mechanism (modifying S4 to S6) easily solves this task, particularly when combined with these more powerful architectures. 10 Model Arch. Layer Acc.', chunk_index=25, num_tokens=347, metadata={}), ResponseChunk(id='chunk_c7df38c1-b4cc-4063-89f5-71dce42209e9', content=\"S4 - No gate No gate S4 S6 18.3 97.0 H3 Hyena - H3 H3 H3 S4 Hyena S6 57.0 30.1 99.7 - - Mamba Mamba Mamba Mamba Hyena S4 S6 56.4 28.4 99.8 Induction Heads Extrapolation Extrapolation 1.05 ' â â Mua-Absotute 08] ; â â MHA-RoPE i =~ MHA-xPos 6) i â HB oa = byena ' Random 1 ran benath 0.0 , ; ; : , 10° 10° 108 10° 10° Test Sequence Length > g 8 Table 1: (Selective Copying.) Accuracy for combinations of architectures and inner sequence layers. Table 2: (Induction Heads.) Models are trained on sequence length 28 = 256, and tested on increasing sequence lengths of 26 = 64 up to 220 = 1048576. Full numbers in Table 11. # 4.1.2 Induction Heads Induction heads (Olsson et al. 2022) is a simple task from the mechanistic interpretability lens (Elhage et al. 2021) that is surprisingly predictive of the in-context learning ability of LLMs. It requires models to perform associative recall and copy: for example, if the model has seen a bigram such as â Harry Potterâ in the sequence, then the next time â Harryâ appears in the same sequence, the model should be able to predict â Potterâ by copying from history. Dataset.\", chunk_index=26, num_tokens=356, metadata={}), ResponseChunk(id='chunk_9460c41e-950c-4e74-aa78-80a0025b79f4', content='We train a 2-layer model on the induction heads task at sequence length 256, with a vocab size of 16, which is comparable to prior work on this task (Dao, Fu, Saab, et al. 2023) but with longer sequences. We additionally investigate generalization and extrapolation abilities by evaluating on a range of sequence lengths from 26 = 64 up to 220 = 1048576 at test time. Models. Following established work on induction heads, we use 2 layer models, which allows attention to mechanistically solve the induction heads task (Olsson et al. 2022). We test both multi-head attention (8 heads, with various positional encodings) and SSM variants. We use a model dimension ð · of 64 for Mamba and 128 for the other models. Results. Table 2 shows that Mambaâ or more precisely, its selective SSM layerâ has the ability to solve the task perfectly because of its ability to selectively remember the relevant token while ignoring everything else in between. It generalizes perfectly to million-length sequences, or 4000à longer than it saw during training, while no other method goes beyond 2à . Out of positional encoding variants for attention models, xPos (which was designed for length extrapolation) is slightly better than the others; also note that all attention models were only tested up to sequence length 214 = 16384 due to memory limitations. Out of other SSMs, H3 and Hyena are similar, contrary to the ï¬ ndings in Poli et al. (2023). 
# 4.2 Language Modeling We evaluate the Mamba architecture on standard autoregressive language modeling against other architectures, on both pretraining metrics (perplexity) and zero-shot evaluations. We set the model sizes (depth and width) to mirror GPT3 speciï¬ cations.', chunk_index=27, num_tokens=391, metadata={}), ResponseChunk(id='chunk_6136fa91-721d-45a0-bdb5-38423b262605', content='We use the Pile dataset (L. Gao, Biderman, et al. 2020), and follow the training recipe described in Brown et al. (2020). All training details are in Appendix E.2. # 4.2.1 Scaling Laws For baselines, we compare against the standard Transformer architecture (GPT3 architecture), as well as the strongest Transformer recipe we know of (here referred to as Transformer++), based on the PaLM and LLaMa 11 Scaling Laws on The Pile (Sequence Length 2048) Scaling Laws on The Pile (Sequence Length 8192) 2x10\" 2x10 Hyena Hyena RWKV s RWKV â â Transformer Fy â â Transformer fd RetNet 2 â â RetNet 3+ 2 â HH wd â = Transformers |, | â â Transformert+ â â Mamba zg â â Mamba 2 2 S a 6x 10° 1 7 6x 10° 1 7 10\"? 102 10 107° FLOPs (log scale) FLOPs (log scale) s 8 fd 2 2 > 3 2 2 S a Figure 4: (Scaling Laws.) Models of size â 125ð to â 1.3ð µ parameters, trained on the Pile. Mamba scales better than all other attention-free models and is the first to match the performance of a very strong â Transformer++â recipe that has now become standard, particularly as the sequence length grows. architectures (e.g. rotary embedding, SwiGLU MLP, RMSNorm instead of LayerNorm, no linear bias, and higher learning rates). We also compare against other recent subquadratic architectures (Figure 4). All model details are in Appendix E.2.', chunk_index=28, num_tokens=388, metadata={}), ResponseChunk(id='chunk_aeff4d17-5ec3-46ce-b9b9-68810db8bdbf', content='Figure 4 shows scaling laws under the standard Chinchilla (Hoï¬ mann et al. 2022) protocol, on models from â 125ð to â 1.3ð µ parameters. Mamba is the ï¬ rst attention-free model to match the performance of a very strong Transformer recipe (Transformer++) that has now become standard, particularly as the sequence length grows. We note that full results on context length 8k are missing for the RWKV and RetNet baselines, prior strong recurrent models that can also be interpreted as SSMs, due to a lack of eï¬ cient implementation leading to out-of-memory or unrealistic computation requirements. # 4.2.2 Downstream Evaluations Table 3 shows the performance of Mamba on a range of popular downstream zero-shot evaluation tasks. We compare against the most well-known open source models at these sizes, most importantly Pythia (Biderman et al. 2023) and RWKV (B. Peng et al. 2023) which were trained with the same tokenizer, dataset, and training length (300B tokens) as our models. (Note that Mamba and Pythia are trained with context length 2048, while RWKV was trained with context length 1024.) # 4.3 DNA Modeling Motivated by the success of large language models, there has been recent exploration into using the foundation model paradigm for genomics. DNA has been likened to language in that it consists of sequences of discrete tokens with a ï¬ nite vocab. It is also known for requiring long-range dependencies to model (Avsec et al. 2021). We investigate Mamba as a FM backbone for pretraining and ï¬ ne-tuning in the same setting as recent works on long-sequence models for DNA (Nguyen, Poli, et al. 
2023).', chunk_index=29, num_tokens=388, metadata={}), ResponseChunk(id='chunk_3fd3e2f8-9352-4053-ab28-86b9494099da', content='In particular, we focus on two explorations of scaling laws across model size and sequence length (Figure 5), and a diï¬ cult downstream synthetic classiï¬ cation task requiring long context (Figure 6). For pretraining, we largely follow a standard causal language modeling (next token prediction) setup for the training and model details (see also Appendix E.2). For the dataset, we largely follow the setup of HyenaDNA (Nguyen, Poli, et al. 2023), which uses the HG38 dataset for pretraining consisting of a single human genome with about 4.5 billion tokens (DNA base pairs) in the training split. # 4.3.1 Scaling: Model Size In this experiment, we investigate the scaling properties of genomics foundation models with various model backbones (Figure 5 Left). Training. To advantage the baselines, we train on a short sequence length of 1024; as shown in Section 4.3.2, we expect results to favor Mamba even more at longer sequence lengths. We ï¬ x a global batch size of 1024, for a 12 Table 3: (Zero-shot Evaluations.) Best results for each size in bold. We compare against open source LMs with various tokenizers, trained for up to 300B tokens. Pile refers to the validation split, comparing only against models trained on the same dataset and tokenizer (GPT-NeoX-20B). For each model size, Mamba is best-in-class on every single evaluation result, and generally matches baselines at twice the model size. Model Token. Pile ppl â LAMBADA LAMBADA HellaSwag ppl â acc â acc â acc â acc â acc â acc â Hybrid H3-130M GPT2 â', chunk_index=30, num_tokens=374, metadata={}), ResponseChunk(id='chunk_8360e651-c636-4a2b-b8df-3f4a43ce0fb1', content='Pythia-160M Mamba-130M NeoX NeoX 29.64 10.56 89.48 38.10 16.07 25.77 33.0 44.3 31.7 30.2 35.3 64.2 61.4 64.5 44.4 43.2 48.0 24.2 24.1 24.3 50.6 51.9 51.9 40.1 40.6 44.7 Hybrid H3-360M GPT2 â Pythia-410M Mamba-370M NeoX NeoX 9.95 8.28 12.58 10.84 8.14 48.0 51.4 55.6 41.5 40.6 46.5 68.1 66.9 69.5 51.4 52.1 55.1 24.7 24.6 28.0 54.1 53.8 55.3 48.0 48.2 50.0 Pythia-1B Mamba-790M NeoX NeoX 7.82 7.33 7.92 6.02 56.1 62.7 47.2 55.1 70.7 72.1 57.0 61.2 27.1 29.5 53.5 56.1 51.9 57.1 GPT-Neo 1.3B Hybrid H3-1.3B OPT-1.3B Pythia-1.4B RWKV-1.5B Mamba-1.4B GPT2 â GPT2 â â', chunk_index=31, num_tokens=389, metadata={}), ResponseChunk(id='chunk_0f09303b-e20f-4dae-8eb0-b076627d4817', content='OPT 7.51 NeoX 7.70 NeoX NeoX 6.80 7.50 11.25 6.64 6.08 7.04 5.04 57.2 49.6 58.0 61.7 56.4 64.9 48.9 52.6 53.7 52.1 52.5 59.1 71.1 71.3 72.4 71.0 72.4 74.2 56.2 59.2 56.7 60.5 60.5 65.5 25.9 28.1 29.6 28.5 29.4 32.8 54.9 56.9 59.5 57.2 54.6 61.5 52.4 53.0 55.0 55.2 54.3 59.7 GPT-Neo 2.7B Hybrid H3-2.7B OPT-2.7B Pythia-2.8B RWKV-3B Mamba-2.8B GPT2 â GPT2 â â', chunk_index=32, num_tokens=263, metadata={}), ResponseChunk(id='chunk_1bcfd648-e879-4bcd-9c09-4ee718b415d8', content='OPT 6.73 NeoX 7.00 NeoX NeoX 6.22 5.63 7.92 5.12 5.04 5.24 4.23 62.2 55.7 63.6 64.7 63.9 69.2 55.8 59.7 60.6 59.3 59.6 66.1 72.1 73.3 74.8 74.0 73.7 75.2 61.1 65.6 60.8 64.1 67.8 69.7 30.2 32.3 31.3 32.9 33.1 36.3 57.6 61.4 61.0 59.7 59.6 63.5 56.5 58.0 58.7 59.1 59.6 63.3 GPT-J-6B OPT-6.7B Pythia-6.9B RWKV-7.4B GPT2 OPT NeoX NeoX â â 6.51 6.31 4.10 4.25 4.45 4.38 68.3 67.7 67.1 67.2 66.3 67.2 64.0 65.5 75.4 76.3 75.2 76.1 67.0 65.6 67.3 67.8 36.6 34.9 35.5 37.5 64.1 65.5 61.3 61.0 63.0 62.9 61.7 62.5 total of 220 â 1ð tokens per batch.', chunk_index=33, num_tokens=396, metadata={}), 
ResponseChunk(id='chunk_6c0d0381-53dc-4cb1-b934-c6d344f66954', content='Models were trained for 10ð ¾ gradient steps for a total of 10ð µ tokens. Results. Figure 5 (Left) shows that Mambaâ s pretraining perplexity improves smoothly with model size, and that Mamba scales better than both HyenaDNA and Transformer++. For example, at the largest model size of â 40ð parameters, the curve shows that Mamba can match the Transformer++ and HyenaDNA models with roughly 3à to 4à fewer parameters. # 4.3.2 Scaling: Context Length In the next DNA experiment, we investigate the scaling properties of models with respect to sequence length. We only compare the HyenaDNA and Mamba models, as quadratic attention becomes prohibitively expensive at longer sequence lengths. We pretrain models on sequence lengths 210 = 1024, 212 = 4096, 214 = 16384, 216 = 65536, 218 = 262144, 220 = 1048576. We ï¬ x a model size of 6 layers by width 128 (about 1.3M-1.4M parameters). Models were trained for 20ð ¾ gradient steps for a total of â 330ð µ tokens. The longer sequence lengths used sequence length warmup similar to (Nguyen, Poli, et al. 2023). Results. Figure 5 (Right) shows that Mamba is able to make use of longer context even up to extremely long sequences of length 1M, and its pretraining perplexity improves as the context increases. On the other hand, the HyenaDNA model gets worse with sequence length. This is intuitive from the discussion in Section 3.5 on properties of the selection mechanism. In particular, LTI models cannot selectively ignore information; from a convolutional perspective, a very long convolution kernel is aggregating all information across a long sequence 13', chunk_index=34, num_tokens=395, metadata={}), ResponseChunk(id='chunk_e713d3d3-5b93-4b16-b689-e39ad7877965', content='Scaling Laws on the Human Genome (HG38) Scaling Laws - Sequence Length (HG38) â â HyenaDNa 1.4m â = Mamba 1.4M â â Mamba 7M ae â â HyenaDNA 3.00 4 â Mamba â â Transformert+ 2.98 | Perplexity Perplexity 2.80 4 284 2.754 274 r T r r r ; 10° 107 103 10 105 10° Parameters (log scale) Sequence Length Figure 5: (DNA Scaling Laws.) Pretraining on the HG38 (human genome) dataset. (Left) Fixing short context length 210 = 1024 and increasing size from â 200ð ¾ to â 40ð parameters, Mamba scales better than baselines. (Right) Fixing model size and increasing sequence lengths while keeping tokens/batch and total training tokens fixed. Unlike baselines, the selection mechanism of Mamba facilitates better performance with increasing context length. Finetuning Accuracy (Species DNA Classification) 0.8] â â HyenaDNA1.4M 0.7-| â â Mamba 1.4m â â Mamba 7M mag] â â Random g 5 os 3 â 8 oA 034 024 --------------------------------- T T T T 103 10¢ 108 10 Sequence Length Scaling Laws - Sequence Length (YouTubeMix) 1.475 â â SA+FEN 1.450 4 â â Mamba @ 1.4254 2 1.400 4 5 o 1.375 4 © 1.3504 1.325 4 1.300 T T T 10* 10° 10 Sequence Length', chunk_index=35, num_tokens=385, metadata={}), ResponseChunk(id='chunk_5106bbd0-f6d9-465d-87dc-b8e0844562d5', content='Figure 6: (Great Apes DNA Classification.) Accuracy after fine-tuning on sequences of length 210 = 1024 up to 220 = 1048576 using pretrained models of the same context length. Nu- merical results in Table 13. Figure 7: (Audio Pretraining.) Mamba improves performance over prior state-of-the-art (Sashimi) in autoregressive audio mod- eling, while improving up to minute-long context or million- length sequences (controlling for computation). which may be very noisy. 
Note that while HyenaDNA claims to improve with longer context, their results do not control for computation time. # 4.3.3 Synthetic Species Classification We evaluate models on a downstream task of classifying between 5 diï¬ erent species by randomly sampling a contigu- ous segment of their DNA. This task is adapted from HyenaDNA, which used the species {human, lemur, mouse, pig, hippo}. We modify the task to be signiï¬ cantly more challenging by classifying between the ï¬ ve great apes species {human, chimpanzee, gorilla, orangutan, bonobo}, which are known to share 99% of their DNA. # 4.4 Audio Modeling and Generation For the audio waveform modality, we compare primarily to the SaShiMi architecture and training protocols (Goel et al. 2022). This model comprises 1. a U-Net backbone with two stages of pooling by a factor ð that doubles the model dimension ð · per stage, 2. alternating S4 and MLP blocks in each stage. We consider replacing the S4+MLP blocks with Mamba blocks. Experiment details are in Appendix E.4. # 4.4.1 Long-Context Autoregressive Pretraining', chunk_index=36, num_tokens=380, metadata={}), ResponseChunk(id='chunk_11145d8c-d370-4680-bb26-7f052d3b638e', content='We evaluate pretraining quality (autoregressive next-sample prediction) on YouTubeMix (DeepSound 2017), a standard piano music dataset used by prior work consisting of 4 hours of solo piano music, sampled at a rate of 14 16000 Hz Pretraining details largely follow the standard language modeling setup (Section 4.2). Figure 7 evaluates the eï¬ ect of increasing training sequence lengths from 213 = 8192 to 220 â 106, while keeping computation ï¬ xed. (There are some slight edge cases to the way the data is curated, which may lead to kinks in the scaling curves. For example, only minute-long clips were available so the maximum sequence length is actually bounded by 60ð â 16000ð »ð § = 960000.) Both Mamba and the SaShiMi (S4+MLP) baseline improve consistently with longer context lengths; Mamba is better throughout, and the gap widens at longer lengths. The main metric is bits per byte (BPB), which is a constant factor log(2) of the standard negative log-likelihood (NLL) loss for pretraining other modalities. We note one important detail: this is the only experiment in this paper in which we switched from the real parameterization to complex (Section 3.6). We show additional ablations in Appendix E.4. # 4.4.2 Autoregressive Speech Generation SC09 is a benchmark speech generation dataset (Donahue, McAuley, and Puckette 2019; Warden 2018), consisting of 1-second clips sampled at 16000 Hz of the digits â zeroâ through â nineâ with highly variable characteristics. We largely follow the autoregressive training setup and generation protocol of Goel et al. (2022).', chunk_index=37, num_tokens=380, metadata={}), ResponseChunk(id='chunk_7cdbbce2-9b9f-423c-b363-723f12106708', content='Table 4 shows automated metrics of the Mamba-UNet model compared to a variety of baselines from Goel et al. (2022): WaveNet (Oord et al. 2016), SampleRNN (Mehri et al. 2017), WaveGAN (Donahue, McAuley, and Puckette 2019), Diï¬ Wave (Z. Kong et al. 2021), and SaShiMi. A small Mamba model outperforms the state-of-the-art (and much larger) GAN- and diï¬ usion- based models. A larger model parameter-matched to the baselines further improves on ï¬ delity metrics dramatically. Table 5 takes the small Mamba model and investigates combinations of diï¬ erent architectures for the outer stages and center stage. 
It shows that Mamba is consistently better than S4+MLP in the outer blocks, and Mamba > S4+MLP > MHA+MLP in the center blocks. Table 4: (SC09) Automated metrics for unconditional generation on a challenging dataset of fixed-length speech clips. (Top to Bottom) Autoregressive baselines, non-autoregressive baselines, Mamba, and dataset metrics. Table 5: (SC09 Model Ablations) Models with 6M parameters. In SaShiMiâ s U-Net backbone, there are 8 center blocks operat- ing on sequence length 1000, sandwiched on each side by 8 outer blocks on sequence length 4000, sandwiched by 8 outer blocks on sequence length 16000 (40 blocks total). The architecture of the 8 center blocks are ablated independently of the rest. Note that Transformers (MHA+MLP) were not tested in the more im- portant outer blocks because of efficiency constraints. Model Params NLL â FID â IS â', chunk_index=38, num_tokens=399, metadata={}), ResponseChunk(id='chunk_4182a889-6adb-43aa-8e80-441783320df5', content='mIS â AM â SampleRNN WaveNet SaShiMi 35.0M 4.2M 5.8M 2.042 1.925 1.873 8.96 5.08 1.99 1.71 2.27 5.13 3.02 5.80 42.57 1.76 1.47 0.74 WaveGAN DiffWave + SaShiMi Mamba Mamba Train Test 19.1M 24.1M 23.0M 6.1M 24.3M - - - - - 1.852 1.860 - - 2.03 1.92 1.42 0.94 0.67 0.00 0.02 4.90 5.26 5.94 6.26 7.33 8.56 8.33 36.10 51.21 69.17 88.54 144.9 292.5 257.6 0.80 0.68 0.59 0.52 0.36 0.16 0.19 Outer Center S4+MLP MHA+MLP S4+MLP S4+MLP Mamba Mamba Mamba Mamba S4+MLP MHA+MLP S4+MLP Mamba NLL â 1.859 1.867 1.859 1.850 1.853 1.852 FID â 1.45 1.43 1.42 1.37 1.07 0.94 IS â 5.06 5.42 5.71 5.63 6.05 6.26 mIS â', chunk_index=39, num_tokens=386, metadata={}), ResponseChunk(id='chunk_c20f0ed8-c2e8-4024-ac91-795e0443f80f', content='47.03 53.54 56.51 58.23 73.34 88.54 AM â 0.70 0.65 0.64 0.62 0.55 0.52 4.5 Speed and Memory Benchmarks We benchmark the speed of the SSM scan operation (state expansion ð = 16), as well as the end-to-end inference throughput of Mamba, in Figure 8. Our eï¬ cient SSM scan is faster than the best attention implementation that we know of (FlashAttention-2 (Dao 2023)) beyond sequence length 2K, and up to 20-40à faster than a standard scan implementation in PyTorch. Mamba achieves 4-5à higher inference throughput than a Transformer of similar size, since without the KV cache it can use much higher batch sizes. For example, a Mamba-6.9B (untrained) would have higher inference throughput than a 5à smaller Transformer-1.3B. Details in Appendix E.5, which additionally includes a benchmark of memory consumption. 15 Scan vs Convolution vs Attention time (A100 80GB PCle) Inference throughput on A100 80GB (prompt length 2048) â Flashattention-2 ame ee ES 1000-1 â convolution @ 1500] mm Mamba 6.98 wwe â â Scan (PyTorch) Py mmm Transformer 6.78 100 4 â â Scan (ours) Ei % 00M 2 a tod S 1000 B us Ff = 2 500 â = pad oid r S12 1k 2k «= 4k BKK 32K GK 128k 256K 512k 1 2 Hi A 16 32 oa 128 Sequence length Batch size @ = ~ £', chunk_index=40, num_tokens=400, metadata={}), ResponseChunk(id='chunk_80e62a41-3cd4-46a1-974e-e52855d53a91', content='Figure 8: (Efficiency Benchmarks.) (Left) Training: our efficient scan is 40à faster than a standard implementation. (Right) Inference: as a recurrent model, Mamba can achieve 5à higher throughput than Transformers. # 4.6 Model Ablations We perform a series of detailed ablations on components of our model, focusing on the setting of language modeling with size â 350M models at Chinchilla token counts (same setting as Figure 4). 
# 4.6.1 Architecture Table 6 investigates the eï¬ ects of the architecture (block) and its inner SSM layer (Figure 3). We ï¬ nd that â ¢ Among previous non-selective (LTI) SSMs, which are equivalent to global convolutions, performance is very similar. â ¢ Replacing the complex-valued S4 variant from previous work with a real-valued one does not aï¬ ect performance much, suggesting that (at least for LM) real-valued SSMs may be a better choice when accounting for hardware eï¬ ciency. â ¢ Replacing any of these with a selective SSM (S6) signiï¬ cantly improves performance, validating the motivation of Section 3. â ¢ The Mamba architecture performs similarly to the H3 architecture (and seems slightly better when using a selective layer). We also investigate interleaving the Mamba block with other blocks such as MLP (a traditional architecture) MHA (a hybrid attention architecture) in Appendix E.2.2. # 4.6.2 Selective SSM Table 7 ablates the selective SSM layer by considering diï¬ erent combinations of selective â , B, and C param- eters (Algorithm 2), showing that â is the most important parameter due to its connection to RNN gating (Theorem 1). Table 8 considers diï¬', chunk_index=41, num_tokens=401, metadata={}), ResponseChunk(id='chunk_727f7ae9-578a-46bc-be4a-faddc85158b1', content='erent initializations of the SSM, which have been shown to make a large diï¬ erence in some data modalities and settings (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022). On language modeling, we ï¬ nd that simpler real-valued diagonal initializations (S4D-Real, row 3) instead of more standard complex-valued parameterizations (S4D-Lin, row 1) perform better. Random initializations also work well, consistent with ï¬ ndings from prior work (Mehta et al. 2023). Table 9 and Table 10 consider varying the dimension of the â and (B, C) projections respectively. Changing them from static to selective provides the most beneï¬ t, while increasing the dimensions further generally improves performance modestly with a small increase in parameter count. Of particular note is the dramatic improvement of the selective SSM when the state size ð is increased, with over a 1.0 perplexity improvement for a cost of only 1% additional parameters. This validates our core motivation in Sections 3.1 and 3.3. 16 Table 6: (Ablations: Architecture and SSM layer.) The Mamba block performs similarly to H3 while being simpler. In the inner layer, there is little difference among different parameterizations of LTI models, while selective SSMs (S6) provide a large improvement. More specifically, the S4 (real) variant is S4D-Real and the S4 (complex) variant is S4D-Lin. Model Arch. SSM Layer Perplexity Model Arch.', chunk_index=42, num_tokens=351, metadata={}), ResponseChunk(id='chunk_00aa512e-4ffa-48ce-9a81-7650f9032a59', content='SSM Layer Perplexity Hyena H3 H3 H3 H3 - H3 - Hyena S4 (complex) S4 (real) S6 10.24 10.30 10.34 8.95 Mamba Hyena - Mamba - - Mamba Mamba Mamba S4 (complex) S4 (real) S6 10.75 10.54 10.56 8.69 Table 7: (Ablations: Selective parameters.) â is the most im- portant parameter (Theorem 1), but using multiple selective pa- rameters together synergizes. Table 8: (Ablations: Parameterization of A.) The more standard initializations based on S4D-Lin (Gu, Gupta, et al. 2022) perform worse than S4D-Real or a random initializa- tion, when the SSM is selective. 
Table 9: (Ablations: Expressivity of Δ.) The selection mechanism of Δ constructs it with a projection of the input. Projecting it even to dim. 1 provides a large increase in performance; increasing it further provides further improvements at the cost of a modest increase in parameters. State size fixed to N = 16.

| Size of Δ proj. | Params (M) | Perplexity |
|---|---|---|
| - | 358.9 | 9.12 |
| 1 | 359.1 | 8.97 |
| 2 | 359.3 | 8.97 |
| 4 | 359.7 | 8.91 |
| 8 | 360.5 | 8.83 |
| 16 | 362.1 | 8.84 |
| 32 | 365.2 | 8.80 |
| 64 | 371.5 | 8.71 |

Table 10: (Ablations: SSM state dimension.) (Top) Constant B and C. (Bottom) Selective B and C. Increasing the SSM state dimension N, which can be viewed as an expansion factor on the dimension of the recurrent state, can significantly improve performance for a negligible cost in parameters/FLOPs, but only when B and C are also selective. Size of Δ projection fixed to 64.

(Constant B and C)
| State dimension N | Params (M) | Perplexity |
|---|---|---|
| 1 | 367.1 | 9.88 |
| 2 | 367.4 | 9.86 |
| 4 | 368.0 | 9.82 |
| 8 | 369.1 | 9.82 |
| 16 | 371.5 | 9.81 |

(Selective B and C)
| State dimension N | Params (M) | Perplexity |
|---|---|---|
| 1 | 367.1 | 9.73 |
| 2 | 367.4 | 9.40 |
| 4 | 368.0 | 9.09 |
| 8 | 369.1 | 8.84 |
| 16 | 371.5 | 8.71 |

# 5 Discussion

We discuss related work, limitations, and some future directions.

Related Work. Appendix A discusses how the selection mechanism relates to similar concepts. Appendix B has an extended related work of SSMs and other related models.

No Free Lunch: Continuous-Discrete Spectrum. Structured SSMs were originally defined as discretizations of continuous systems (1), and have had a strong inductive bias toward continuous-time data modalities such as perceptual signals (e.g. audio, video). As discussed in Sections 3.1 and 3.5, the selection mechanism overcomes their weaknesses on discrete modalities such as text and DNA; but this conversely can impede their performance on data that LTI SSMs excel on. Our ablations on audio waveforms examine this tradeoff in more detail.

Downstream Affordances. Transformer-based foundation models (particularly LLMs) have a rich ecosystem of properties and modes of interaction with pretrained models, such as fine-tuning, adaptation, prompting, in-context learning, instruction tuning, RLHF, quantization, and so on. We are particularly interested in whether Transformer alternatives such as SSMs have similar properties and affordances.

Scaling. Our empirical evaluation is limited to small model sizes, below the threshold of most strong open source LLMs (e.g. Llama (Touvron et al. 2023)) as well as other recurrent models such as RWKV (B. Peng et al. 2023) and RetNet (Y. Sun et al. 2023), which have been evaluated at the 7B parameter scale and beyond. It remains to assess whether Mamba still compares favorably at these larger sizes. We also note that scaling SSMs may involve further engineering challenges and adjustments to the model that are not discussed in this paper.

# 6 Conclusion

We introduce a selection mechanism to structured state space models, allowing them to perform context-dependent reasoning while scaling linearly in sequence length.
When incorporated into a simple attention-free architecture, Mamba achieves state-of-the-art results on a diverse set of domains, where it matches or exceeds the performance of strong Transformer models. We are excited about the broad applications of selective state space models to build foundation models for diï¬ erent domains, especially in emerging modalities requiring long context such as genomics, audio, and video.', chunk_index=45, num_tokens=392, metadata={}), ResponseChunk(id='chunk_297cd6f8-60fb-40ce-b3b8-36f220ca68c5', content='Our results suggest that Mamba is a strong candidate to be a general sequence model backbone. # Acknowledgments We thank Karan Goel, Arjun Desai, and Kush Bhatia for helpful feedback on the draft. # References [1] Martin Arjovsky, Amar Shah, and Yoshua Bengio. â Unitary Evolution Recurrent Neural Networksâ . In: The International Conference on Machine Learning (ICML). 2016, pp. 1120â 1128. iga Avsec, Vikram Agarwal, Daniel Visentin, Joseph R Ledsam, Agnieszka Grabska-Barwinska, Kyle R Taylor, Yannis Assael, John Jumper, Pushmeet Kohli, and David R Kelley. â Effective Gene Expression Prediction from Sequence by Integrating Long-range Interactionsâ . In: Nature Methods 18.10 (2021), pp. 1196â 1203. Jimmy Ba, Geoffrey E Hinton, Volodymyr Mnih, Joel Z Leibo, and Catalin Ionescu. â Using Fast Weights to Attend to the Recent Pastâ . In: Advances in Neural Information Processing Systems (NeurIPS) 29 (2016). Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. â Layer Normalizationâ . In: arXiv preprint arXiv:1607.06450 (2016). [2] [3] [4] [5] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. â Neural Machine Translation by Jointly Learning to Align and Translateâ . In: The International Conference on Learning Representations (ICLR). 2015. [6] David Balduzzi and Muhammad Ghifary. â Strongly-typed Recurrent Neural Networksâ . In: International Con- ference on Machine Learning.', chunk_index=46, num_tokens=393, metadata={}), ResponseChunk(id='chunk_6082fdca-7dc4-4e76-998b-4914064ef3ee', content='PMLR. 2016, pp. 1292â 1300. [7] Stella Biderman, Hailey Schoelkopf, Quentin Gregory Anthony, Herbie Bradley, Kyle OBrien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, et al. â Pythia: A Suite for Analyzing Large Language Models across Training and Scalingâ . In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 2397â 2430. [8] Yonatan Bisk, Rowan Zellers, Jianfeng Gao, Yejin Choi, et al. â PIQA: Reasoning about Physical Commonsense in Natural Languageâ . In: Proceedings of the AAAI conference on Artificial Intelligence. Vol. 34. 05. 2020, pp. 7432â 7439. [9] Guy E Blelloch. â Prefix Sums and Their Applicationsâ . In: (1990). [10] James Bradbury, Stephen Merity, Caiming Xiong, and Richard Socher. â Quasi-recurrent Neural Networksâ . In: arXiv preprint arXiv:1611.01576 (2016). 18 [11] Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Nee- lakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. â Language Models are Few-shot Learnersâ . In: Advances in Neural Information Processing Systems (NeurIPS) 33 (2020), pp. 1877â 1901. [12] Aydar Bulatov, Yuri Kuratov, and Mikhail S Burtsev. â', chunk_index=47, num_tokens=399, metadata={}), ResponseChunk(id='chunk_41edc83c-a2dc-4692-82e2-801cf69a4691', content='Scaling Transformer to 1M tokens and Beyond with RMTâ . In: arXiv preprint arXiv:2304.11062 (2023). 
[13] Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. â Generating Long Sequences with Sparse Trans- formersâ . In: arXiv preprint arXiv:1904.10509 (2019). [14] Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Pe- ter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, et al. â Rethinking Attention with Performersâ . In: The International Conference on Learning Representations (ICLR). 2021. [15] Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. â PaLM: Scaling Language Modeling with Pathwaysâ . In: Journal of Machine Learning Research 24.240 (2023), pp. 1â 113. url: http://jmlr.org/ papers/v24/22-1144.html. Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. â Empirical Evaluation of Gated Re- current Neural Networks on Sequence Modelingâ . In: arXiv preprint arXiv:1412.3555 (2014). [17] Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. â', chunk_index=48, num_tokens=395, metadata={}), ResponseChunk(id='chunk_5453af11-1810-436b-bd73-e23b3074a501', content='Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challengeâ . In: arXiv preprint arXiv:1803.05457 (2018). [18] Tri Dao. â FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioningâ . In: (2023). [19] Tri Dao, Daniel Y Fu, Stefano Ermon, Atri Rudra, and Christopher Ré. â FlashAttention: Fast and Memory- Efficient Exact Attention with IO-Awarenessâ . In: Advances in Neural Information Processing Systems (NeurIPS). 2022. [20] Tri Dao, Daniel Y Fu, Khaled K Saab, Armin W Thomas, Atri Rudra, and Christopher Ré. â Hungry Hungry Hippos: Towards Language Modeling with State Space Modelsâ . In: The International Conference on Learning Representations (ICLR). 2023. [21] Yann N Dauphin, Angela Fan, Michael Auli, and David Grangier. â Language Modeling with Gated Convolu- tional Networksâ . In: The International Conference on Machine Learning (ICML). PMLR. 2017, pp. 933â 941. # [22] DeepSound. SampleRNN. https://github.com/deepsound-project/samplernn-pytorch. 2017. [23] Jiayu Ding, Shuming Ma, Li Dong, Xingxing Zhang, Shaohan Huang, Wenhui Wang, and Furu Wei. â LongNet: Scaling Transformers to 1,000,000,000 Tokensâ . In: arXiv preprint arXiv:2307.02486 (2023). [24] Chris Donahue, Julian McAuley, and Miller Puckette. â Adversarial Audio Synthesisâ . In:', chunk_index=49, num_tokens=397, metadata={}), ResponseChunk(id='chunk_f3e0d29e-bf8e-45f0-b507-2d79e17c8e16', content='The International Conference on Learning Representations (ICLR). 2019. [25] Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. â An Image is Worth 16x16 Words: Transformers for Image Recognition at Scaleâ . In: The International Conference on Learning Representations (ICLR). 2020. [26] Nelson Elhage, Neel Nanda, Catherine Olsson, Tom Henighan, Nicholas Joseph, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Nova DasSarma, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, and Chris Olah. â A Mathematical Framework for Transformer Circuitsâ . In: Transformer Circuits Thread (2021). 
https://transformer-circuits.pub/2021/framework/index.html. [27] Mahan Fathi, Jonathan Pilault, Pierre-Luc Bacon, Christopher Pal, Orhan Firat, and Ross Goroshin. â Block- State Transformerâ . In: arXiv preprint arXiv:2306.09539 (2023). [28] Yassir Fathullah, Chunyang Wu, Yuan Shangguan, Junteng Jia, Wenhan Xiong, Jay Mahadeokar, Chunxi Liu, Yangyang Shi, Ozlem Kalinli, Mike Seltzer, et al. â Multi-Head State Space Model for Sequence Modelingâ . In: INTERSPEECH. 2023.', chunk_index=50, num_tokens=398, metadata={}), ResponseChunk(id='chunk_92ef8328-2fa8-42d5-82aa-d4499de12728', content='[29] Karl J Friston, Lee Harrison, and Will Penny. â Dynamic Causal Modellingâ . In: Neuroimage 19.4 (2003), pp. 1273â 1302. [30] Daniel Y Fu, Elliot L Epstein, Eric Nguyen, Armin W Thomas, Michael Zhang, Tri Dao, Atri Rudra, and Christo- pher Ré. â Simple Hardware-efficient Long Convolutions for Sequence Modelingâ . In: The International Confer- ence on Machine Learning (ICML) (2023). [31] Ken-ichi Funahashi and Yuichi Nakamura. â Approximation of Dynamical Systems by Continuous Time Recur- rent Neural Networksâ . In: Neural Networks 6.6 (1993), pp. 801â 806. 19 [32] Leo Gao, Stella Biderman, Sid Black, Laurence Golding, Travis Hoppe, Charles Foster, Jason Phang, Horace He, Anish Thite, Noa Nabeshima, Shawn Presser, and Connor Leahy. â The Pile: An 800GB Dataset of Diverse Text for Language Modelingâ . In: arXiv preprint arXiv:2101.00027 (2020). [33] Leo Gao, Jonathan Tow, Stella Biderman, Sid Black, Anthony DiPofi, Charles Foster, Laurence Golding, Jeffrey Hsu, Kyle McDonell, Niklas Muennighoff, Jason Phang, Laria Reynolds, Eric Tang, Anish Thite, Ben Wang, Kevin Wang, and Andy Zou. A Framework for Few-shot Language Model Evaluation. Version v0.0.1. Sept. 2021. doi: 10.5281/zenodo.5371628. url: https://doi.org/10.5281/zenodo.5371628.', chunk_index=51, num_tokens=398, metadata={}), ResponseChunk(id='chunk_69ead650-29c7-4b10-8649-06ae673f0cf5', content='[34] Karan Goel, Albert Gu, Chris Donahue, and Christopher Ré. â Itâ s Raw! Audio Generation with State-Space Modelsâ . In: The International Conference on Machine Learning (ICML). 2022. [35] Albert Gu, Tri Dao, Stefano Ermon, Atri Rudra, and Christopher Ré. â HIPPO: Recurrent Memory with Optimal Polynomial Projectionsâ . In: Advances in Neural Information Processing Systems (NeurIPS). 2020. [36] Albert Gu, Karan Goel, and Christopher Ré. â Efficiently Modeling Long Sequences with Structured State Spacesâ . In: The International Conference on Learning Representations (ICLR). 2022. [37] Albert Gu, Caglar Gulcehre, Tom Le Paine, Matt Hoffman, and Razvan Pascanu. â Improving the Gating Mech- anism of Recurrent Neural Networksâ . In: The International Conference on Machine Learning (ICML). 2020. [38] Albert Gu, Ankit Gupta, Karan Goel, and Christopher Ré. â On the Parameterization and Initialization of Diag- onal State Space Modelsâ . In: Advances in Neural Information Processing Systems (NeurIPS). 2022. [39] Albert Gu, Isys Johnson, Karan Goel, Khaled Saab, Tri Dao, Atri Rudra, and Christopher Ré. â Combining Recur- rent, Convolutional, and Continuous-time Models with the Linear State Space Layerâ . In: Advances in Neural Information Processing Systems (NeurIPS). 2021. [40] Albert Gu, Isys Johnson, Aman Timalsina, Atri Rudra, and Christopher Ré. 
â How to Train Your HIPPO: State Space Models with Generalized Basis Projectionsâ .', chunk_index=52, num_tokens=397, metadata={}), ResponseChunk(id='chunk_8fc78d45-43b1-4127-9614-69224359a4ec', content='In: The International Conference on Learning Representations (ICLR). 2023. [41] Ankit Gupta, Albert Gu, and Jonathan Berant. â Diagonal State Spaces are as Effective as Structured State Spacesâ . In: Advances in Neural Information Processing Systems 35 (2022), pp. 22982â 22994. [42] David Ha, Andrew Dai, and Quoc V. Le. â HyperNetworksâ . In: The International Conference on Learning Rep- resentations (ICLR). 2017. [43] Danijar Hafner, Timothy Lillicrap, Jimmy Ba, and Mohammad Norouzi. â Dream to Control: Learning Behav- iors by Latent Imaginationâ . In: The International Conference on Learning Representations (ICLR). 2020. [44] Ramin Hasani, Mathias Lechner, Tsun-Hsuan Wang, Makram Chahine, Alexander Amini, and Daniela Rus. â Liquid Structural State-Space Modelsâ . In: The International Conference on Learning Representations (ICLR). 2023. [45] Mikael Henaff, Arthur Szlam, and Yann LeCun. â Recurrent Orthogonal Networks and Long-Memory Tasksâ . In: The International Conference on Machine Learning (ICML). 2016. [46] Dan Hendrycks and Kevin Gimpel. â Gaussian Error Linear Units (GELUs)â . In: arXiv preprint arXiv:1606.08415 (2016). [47] Sepp Hochreiter and Jürgen Schmidhuber. â Long Short-Term Memoryâ . In: Neural Computation 9.8 (1997), pp. 1735â 1780.', chunk_index=53, num_tokens=375, metadata={}), ResponseChunk(id='chunk_ea6b2be3-bbb5-463a-89d4-48d821b19764', content='Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. â An Empirical Analysis of Compute- Optimal Large Language Model Trainingâ . In: Advances in Neural Information Processing Systems (NeurIPS) 35 (2022), pp. 30016â 30030. 48 [49] Weizhe Hua, Zihang Dai, Hanxiao Liu, and Quoc Le. â Transformer Quality in Linear Timeâ . In: The Interna- tional Conference on Machine Learning (ICML). PMLR. 2022, pp. 9099â 9117. [50] Hassan Ismail Fawaz, Germain Forestier, Jonathan Weber, Lhassane Idoumghar, and Pierre-Alain Muller. â Deep Learning for Time Series Classification: A Reviewâ . In: Data Mining and Knowledge Discovery 33.4 (2019), pp. 917â 963. [51] Andrei Ivanov, Nikoli Dryden, Tal Ben-Nun, Shigang Li, and Torsten Hoefler. â Data Movement is All You Need: A Case Study on Optimizing Transformersâ . In: Proceedings of Machine Learning and Systems 3 (2021), pp. 711â 732. [52] Li Jing, Caglar Gulcehre, John Peurifoy, Yichen Shen, Max Tegmark, Marin Soljacic, and Yoshua Bengio. â Gated Orthogonal Recurrent Units: On Learning to Forgetâ . In: Neural Computation 31.4 (2019), pp. 765â 783. [53] Rudolph Emil Kalman. â A New Approach to Linear Filtering and Prediction Problemsâ .', chunk_index=54, num_tokens=399, metadata={}), ResponseChunk(id='chunk_d2ef457e-a806-497b-9b3e-06d7f8056e29', content='In: (1960). 20 [54] Angelos Katharopoulos, Apoorv Vyas, Nikolaos Pappas, and François Fleuret. â Transformers are RNNs: Fast Autoregressive Transformers with Linear Attentionâ . In: International Conference on Machine Learning. PMLR. 2020, pp. 5156â 5165. [55] Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. â DiffWave: A Versatile Diffusion Model for Audio Synthesisâ . In: International Conference on Learning Representations. 2021. [56] Chrysoula Kosma, Giannis Nikolentzos, and Michalis Vazirgiannis. 
â Time-Parameterized Convolutional Neu- ral Networks for Irregularly Sampled Time Seriesâ . In: arXiv preprint arXiv:2308.03210 (2023). [57] Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. â ImageNet Classification with Deep Convolutional Neural Networksâ . In: Advances in Neural Information Processing Systems (NeurIPS) 25 (2012). [58] Tao Lei. â When Attention Meets Fast Recurrence: Training Language Models with Reduced Computeâ . In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing. 2021, pp. 7633â 7648. [59] Tao Lei, Yu Zhang, Sida I Wang, Hui Dai, and Yoav Artzi. â Simple Recurrent Units for Highly Parallelizable Recurrenceâ . In: arXiv preprint arXiv:1709.02755 (2017). [60] Mario Lezcano-Casado and David Martà nez-Rubio. â Cheap Orthogonal Constraints in Neural Networks:', chunk_index=55, num_tokens=400, metadata={}), ResponseChunk(id='chunk_ac9a9503-8eab-4c46-86fe-e79699fc8665', content='A Simple Parametrization of the Orthogonal and Unitary Groupâ . In: The International Conference on Machine Learning (ICML). 2019. [61] Yuhong Li, Tianle Cai, Yi Zhang, Deming Chen, and Debadeepta Dey. â What Makes Convolutional Models Great on Long Sequence Modeling?â In: The International Conference on Learning Representations (ICLR). 2023. [62] Vasileios Lioutas and Yuhong Guo. â Time-aware Large Kernel Convolutionsâ . In: The International Conference on Machine Learning (ICML). PMLR. 2020, pp. 6172â 6183. [63] Chris Lu, Yannick Schroecker, Albert Gu, Emilio Parisotto, Jakob Foerster, Satinder Singh, and Feryal Behba- hani. â Structured State Space Models for In-Context Reinforcement Learningâ . In: Advances in Neural Informa- tion Processing Systems (NeurIPS). 2023. [64] Shahar Lutati, Itamar Zimerman, and Lior Wolf. â Focus Your Attention (with Adaptive IIR Filters)â . In: arXiv preprint arXiv:2305.14952 (2023). [65] Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. â Mega: Moving Average Equipped Gated Attentionâ . In: The International Conference on Learning Representations (ICLR). 2023. [66] Eric Martin and Chris Cundy. â Parallelizing Linear Recurrent Neural Nets Over Sequence Lengthâ . In: The International Conference on Learning Representations (ICLR). 2018.', chunk_index=56, num_tokens=385, metadata={}), ResponseChunk(id='chunk_ba183b0e-47f0-4928-ba80-0fee24589297', content='[67] Soroush Mehri, Kundan Kumar, Ishaan Gulrajani, Rithesh Kumar, Shubham Jain, Jose Sotelo, Aaron Courville, and Yoshua Bengio. â SampleRNN: An Unconditional End-to-End Neural Audio Generation Modelâ . In: The International Conference on Learning Representations (ICLR). 2017. [68] Harsh Mehta, Ankit Gupta, Ashok Cutkosky, and Behnam Neyshabur. â Long Range Language Modeling via Gated State Spacesâ . In: The International Conference on Learning Representations (ICLR). 2023. [69] Zakaria Mhammedi, Andrew Hellicar, Ashfaqur Rahman, and James Bailey. â Efficient Orthogonal Parametri- sation of Recurrent Neural Networks using Householder Reflectionsâ . In: International Conference on Machine Learning. PMLR. 2017, pp. 2401â 2409. [70] Eric Nguyen, Karan Goel, Albert Gu, Gordon Downs, Preey Shah, Tri Dao, Stephen Baccus, and Christopher Ré. â S4ND: Modeling Images and Videos as Multidimensional Signals with State Spacesâ . In: Advances in Neural Information Processing Systems (NeurIPS). 2022. 
[71] Eric Nguyen, Michael Poli, Marjan Faizi, Armin Thomas, Callum Birch-Sykes, Michael Wornow, Aman Pa- tel, Clayton Rabideau, Stefano Massaroli, Yoshua Bengio, et al. â HyenaDNA: Long-range Genomic Sequence Modeling at Single Nucleotide Resolutionâ . In: Advances in Neural Information Processing Systems (NeurIPS). 2023.', chunk_index=57, num_tokens=369, metadata={}), ResponseChunk(id='chunk_1f1de5e6-a40a-477e-bc7f-13b94353d128', content='[72] Catherine Olsson, Nelson Elhage, Neel Nanda, Nicholas Joseph, Nova DasSarma, Tom Henighan, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Scott Johnston, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, and Chris Olah. â In-context Learning and Induction Headsâ . In: Transformer Circuits Thread (2022). https://transformer-circuits.pub/2022/in-context-learning-and-induction- heads/index.html. [73] Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalch- brenner, Andrew Senior, and Koray Kavukcuoglu. â WaveNet: A Generative Model for Raw Audioâ . In: arXiv preprint arXiv:1609.03499 (2016). 21 [74] Antonio Orvieto, Samuel L Smith, Albert Gu, Anushan Fernando, Caglar Gulcehre, Razvan Pascanu, and So- ham De. â Resurrecting Recurrent Neural Networks for Long Sequencesâ . In: The International Conference on Machine Learning (ICML). 2023. [75] Denis Paperno, Germán Kruszewski, Angeliki Lazaridou, Ngoc-Quan Pham, Raffaella Bernardi, Sandro Pezzelle, Marco Baroni, Gemma Boleda, and Raquel Fernández. â The LAMBADA Dataset: Word Prediction Requiring a Broad Discourse Contextâ . In:', chunk_index=58, num_tokens=397, metadata={}), ResponseChunk(id='chunk_739a06d2-04bd-4481-92e8-da178fa749cb', content='Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics. 2016, pp. 1525â 1534. [76] Razvan Pascanu, Tomas Mikolov, and Yoshua Bengio. â On the Difficulty of Training Recurrent Neural Net- worksâ . In: International Conference on Machine Learning. 2013, pp. 1310â 1318. [77] Bo Peng, Eric Alcaide, Quentin Anthony, Alon Albalak, Samuel Arcadinho, Huanqi Cao, Xin Cheng, Michael Chung, Matteo Grella, Kranthi Kiran GV, et al. â RWKV: Reinventing RNNs for the Transformer Eraâ . In: arXiv preprint arXiv:2305.13048 (2023). [78] Hao Peng, Nikolaos Pappas, Dani Yogatama, Roy Schwartz, Noah A Smith, and Lingpeng Kong. â Random Feature Attentionâ . In: The International Conference on Learning Representations (ICLR). 2021. [79] Michael Poli, Stefano Massaroli, Eric Nguyen, Daniel Y Fu, Tri Dao, Stephen Baccus, Yoshua Bengio, Stefano Ermon, and Christopher Ré. â Hyena Hierarchy: Towards Larger Convolutional Language Modelsâ . In: The International Conference on Machine Learning (ICML). 2023. [80] Zhen Qin, Xiaodong Han, Weixuan Sun, Bowen He, Dong Li, Dongxu Li, Yuchao Dai, Lingpeng Kong, and Yiran Zhong. â Toeplitz Neural Network for Sequence Modelingâ . In: The International Conference on Learning Representations (ICLR). 2023.', chunk_index=59, num_tokens=373, metadata={}), ResponseChunk(id='chunk_a26bc454-a0a5-48ba-b3d4-b6bacd711940', content='[81] Zhen Qin, Xiaodong Han, Weixuan Sun, Dongxu Li, Lingpeng Kong, Nick Barnes, and Yiran Zhong. â The devil in linear transformerâ . In: arXiv preprint arXiv:2210.10340 (2022). [82] Zhen Qin, Weixuan Sun, Hui Deng, Dongxu Li, Yunshen Wei, Baohong Lv, Junjie Yan, Lingpeng Kong, and Yiran Zhong. 
â CosFormer: Rethinking Softmax in Attentionâ . In: The International Conference on Learning Representations (ICLR). 2022. [83] Ali Rahimi and Benjamin Recht. â Random features for large-scale kernel machinesâ . In: Advances in neural information processing systems 20 (2007). [84] Prajit Ramachandran, Barret Zoph, and Quoc V Le. â Swish: A Self-gated Activation Functionâ . In: arXiv preprint arXiv:1710.05941 7.1 (2017), p. 5. [85] David W Romero, Anna Kuzina, Erik J Bekkers, Jakub M Tomczak, and Mark Hoogendoorn. â CKConv: Con- tinuous Kernel Convolution For Sequential Dataâ . In: arXiv preprint arXiv:2102.02611 (2021). [86] Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. â Winogrande: An Adversarial Wino- grad Schema Challenge at Scaleâ . In: Communications of the ACM 64.9 (2021), pp. 99â 106. [87] George Saon, Ankit Gupta, and Xiaodong Cui. â', chunk_index=60, num_tokens=391, metadata={}), ResponseChunk(id='chunk_0d1d2739-f939-4441-90e9-c10981e054d2', content='Diagonal State Space Augmented Transformers for Speech Recognitionâ . In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE. 2023, pp. 1â 5. Imanol Schlag, Kazuki Irie, and Jürgen Schmidhuber. â Linear Transformers are Secretly Fast Weight Program- mersâ . In: The International Conference on Machine Learning (ICML). PMLR. 2021, pp. 9355â 9366. [89] Noam Shazeer. â GLU Variants Improve Transformerâ . In: arXiv preprint arXiv:2002.05202 (2020). [90] Freda Shi, Xinyun Chen, Kanishka Misra, Nathan Scales, David Dohan, Ed H Chi, Nathanael Schärli, and Denny Zhou. â Large Language Models can be Easily Distracted by Irrelevant Contextâ . In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 31210â 31227. Jiaxin Shi, Ke Alexander Wang, and Emily Fox. â Sequence Modeling with Multiresolution Convolutional Mem- oryâ . In: The International Conference on Machine Learning (ICML). PMLR. 2023, pp. 31312â 31327. Jimmy TH Smith, Andrew Warrington, and Scott W Linderman. â Simplified State Space Layers for Sequence Modelingâ . In: The International Conference on Learning Representations (ICLR). 2023. Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu. â Roformer: Enhanced Trans- former with Rotary Position Embeddingâ .', chunk_index=61, num_tokens=393, metadata={}), ResponseChunk(id='chunk_be78544f-acac-479d-8108-654df546aa3b', content='In: arXiv preprint arXiv:2104.09864 (2021). [93] [94] Yutao Sun, Li Dong, Shaohan Huang, Shuming Ma, Yuqing Xia, Jilong Xue, Jianyong Wang, and Furu Wei. â Retentive network: A successor to transformer for large language modelsâ . In: arXiv preprint arXiv:2307.08621 (2023). Ilya Sutskever, Oriol Vinyals, and Quoc V Le. â Sequence to Sequence Learning with Neural Networksâ . In: Advances in Neural Information Processing Systems (NeurIPS) 27 (2014). 22 [96] Corentin Tallec and Yann Ollivier. â Can Recurrent Neural Networks Warp Time?â In: The International Con- ference on Learning Representations (ICLR). 2018. [97] Yi Tay, Mostafa Dehghani, Samira Abnar, Yikang Shen, Dara Bahri, Philip Pham, Jinfeng Rao, Liu Yang, Se- bastian Ruder, and Donald Metzler. â Long Range Arena: A Benchmark for Efficient Transformersâ . In: Inter- national Conference on Learning Representations (ICLR). 2021. [98] Yi Tay, Mostafa Dehghani, Dara Bahri, and Donald Metzler. â Efficient Transformers: A Surveyâ . In: ACM Com- puting Surveys 55.6 (2022), pp. 
1â 28.', chunk_index=62, num_tokens=335, metadata={}), ResponseChunk(id='chunk_f99d31ae-a4f9-4b06-a606-34b6737c6b6f', content='[99] Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Bap- tiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, et al. â Llama: Open and Efficient Foundation Language Modelsâ . In: arXiv preprint arXiv:2302.13971 (2023). [100] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. â Attention Is All You Needâ . In: Advances in Neural Information Processing Systems (NeurIPS). 2017. [101] Eugene Vorontsov, Chiheb Trabelsi, Samuel Kadoury, and Chris Pal. â On Orthogonality and Learning Recur- rent Networks with Long Term Dependenciesâ . In: International Conference on Machine Learning. PMLR. 2017, pp. 3570â 3578. Jue Wang, Wentao Zhu, Pichao Wang, Xiang Yu, Linda Liu, Mohamed Omar, and Raffay Hamid. â Selective Structured State-Spaces for Long-form Video Understandingâ . In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2023, pp. 6387â 6397. [102] [103] Pete Warden. â Speech Commands: A Dataset for Limited-Vocabulary Speech Recognitionâ . In: ArXiv abs/1804.03209 (2018). [104] Samuel Williams, Andrew Waterman, and David Patterson. â Roofline: An Insightful Visual Performance Model for Multicore Architecturesâ . In:', chunk_index=63, num_tokens=392, metadata={}), ResponseChunk(id='chunk_f31239ef-c839-406d-a057-1ce92f6abfa3', content='Communications of the ACM 52.4 (2009), pp. 65â 76. [105] Brandon Yang, Gabriel Bender, Quoc V Le, and Jiquan Ngiam. â CondConv: Conditionally Parameterized Con- volutions for Efficient Inferenceâ . In: Advances in Neural Information Processing Systems (NeurIPS) 32 (2019). [106] Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. â HellaSwag: Can a Machine Really Finish Your Sentence?â In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguis- tics. 2019. [107] Shuangfei Zhai, Walter Talbott, Nitish Srivastava, Chen Huang, Hanlin Goh, Ruixiang Zhang, and Josh Susskind. â An Attention Free Transformerâ . In: arXiv preprint arXiv:2105.14103 (2021). [108] Michael Zhang, Khaled K Saab, Michael Poli, Tri Dao, Karan Goel, and Christopher Ré. â Effectively Modeling Time Series with Simple Discrete State Spacesâ . In: The International Conference on Learning Representations (ICLR). 2023. [109] Lin Zheng, Chong Wang, and Lingpeng Kong. â Linear complexity randomized self-attention mechanismâ . In: International Conference on Machine Learning. PMLR. 2022, pp. 27011â 27041. [110] Simiao Zuo, Xiaodong Liu, Jian Jiao, Denis Charles, Eren Manavoglu, Tuo Zhao, and Jianfeng Gao. â Efficient Long Sequence Modeling via State Space Augmented Transformerâ . In: arXiv preprint arXiv:2212.08136 (2022). 23', chunk_index=64, num_tokens=400, metadata={}), ResponseChunk(id='chunk_a68ba7b6-b711-4e79-b060-db8657bdaf66', content='# A Discussion: Selection Mechanism Our selection mechanism is inspired by and related to concepts such as gating, hypernetworks, and data-dependence. It can also be viewed as related to â fast weightsâ (J. Ba et al. 2016), which connects classical RNNs with the mechanism of linear attention (Schlag, Irie, and Schmidhuber 2021). However, we believe that it is a distinct concept that is worth clarifying. Gating. Gating originally referred to the gating mechanisms of RNNs such as the LSTM (Hochreiter and Schmidhuber 1997) and GRU (J. 
Chung et al. 2014), or the gated equation (5) in Theorem 1. This was interpreted as a particular mechanism for controlling whether to let an input into the hidden state of an RNN. In particular, this affects the propagation of signal through time and causes inputs to interact along the sequence length dimension.

However, the concept of gating has since been relaxed in popular usage to simply mean any multiplicative interaction (often with an activation function). For example, elementwise multiplicative components of neural network architectures (that do not interact along sequence length) are now commonly referred to as gated architectures (Hua et al. 2022; Mehta et al. 2023), despite a very different meaning than the original RNN sense. Thus we believe the original concept of RNN gating versus the popular usage of multiplicative gating actually have a very different semantic meaning.

Hypernetworks. Hypernetworks refer to neural networks whose parameters are themselves generated by smaller neural networks. The original idea (Ha, Dai, and Quoc V. Le 2017) used it in a narrow sense to define a large RNN whose recurrent parameters are generated by a smaller RNN.

Data-dependence. Similar to hypernetworks, data-dependence can refer to any notion where some parameters of the model depend on the data (Poli et al. 2023).

Example: GLU Activation. To illustrate the issues with these concepts, consider a simple diagonal linear layer y = D x, where D is a diagonal weight parameter. Now suppose that D is itself generated from a linear transformation of x, with an optional nonlinearity: D = σ(W x). Since it is diagonal, the multiplication becomes an elementwise product: y = σ(W x) ∘ x. This is a rather trivial transformation, yet it technically satisfies the common meanings of gating (since it has a multiplicative "branch"), hypernetworks (since the parameter D is generated by another layer), and data-dependence (since D depends on the data x). However, this in fact simply defines a GLU function, which is so simple that it is often considered just an activation function (Dauphin et al. 2017; Shazeer 2020) instead of a meaningful layer (see the sketch below).

Selection. Thus, while selection mechanisms could be considered a special case of ideas such as architectural gating, hypernetworks, or data-dependence, so can an enormous range of other constructions, essentially anything with a multiplication, including standard attention mechanisms (Bahdanau, Cho, and Bengio 2015; Vaswani et al. 2017) as well, and we find it uninformative to think of them as such. Instead, we view it as most closely related to the gating mechanism of traditional RNNs, which is a special case (Theorem 1) and also has a deeper history of connections to SSMs through variable (input-dependent) discretization of Δ (Funahashi and Nakamura 1993; Gu, Dao, et al. 2020; Tallec and Ollivier 2018). We also eschew the term "gating" in favor of selection to clarify the overloaded use of the former. More narrowly, we use selection to refer to the mechanistic action of a model to select or ignore inputs and facilitate data interaction along the sequence length (Section 3.1).
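A minimal sketch of the GLU example above (our own illustration, not code from any referenced implementation): the "data-dependent diagonal layer" is literally a few lines, which is why it is more naturally viewed as a GLU-style activation than as a meaningful selection mechanism.

```python
# The "diagonal layer generated from the input" collapses to y = sigmoid(W x) * x.
import numpy as np

rng = np.random.default_rng(0)
d = 8
W = rng.normal(size=(d, d))          # the layer that "generates" the diagonal parameter D
x = rng.normal(size=d)

D = 1.0 / (1.0 + np.exp(-(W @ x)))   # D = sigma(W x), a diagonal matrix stored as a vector
y = D * x                            # y = D x reduces to an elementwise (gated) product
```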
Beyond selective SSMs and gated RNNs, other examples may include input-dependent convolutions (Kosma, Nikolentzos, and Vazirgiannis 2023; Lioutas and Guo 2020; Lutati, Zimerman, and Wolf 2023; Yang et al. 2019) and even attention. 24 # B Related Work We overview several prior works related to our methods. We mention that some of the most closely related models include recurrent layers such as S4, S5, and quasi-RNNs; as well as end-to-end architectures such as H3, RetNet, and RWKV. # B.1 S4 Variants and Derivatives We describe a brief overview of some structured SSMs from past work, particularly those that have a relation to our method. â ¢ S4 (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) introduced the ï¬ rst structured SSM, describing diagonal structure and diagonal plus low-rank (DPLR). It focused on eï¬ cient convolutional algorithms for DPLR SSMs due to a connection to continuous-time online memorization (HIPPO) (Gu, Dao, et al. 2020). â ¢ DSS (Gupta, Gu, and Berant 2022) ï¬ rst discovered the empirical eï¬', chunk_index=67, num_tokens=394, metadata={}), ResponseChunk(id='chunk_0bc77efd-212b-4529-90b4-d15dfa5a7e69', content='ectiveness of diagonal structured SSMs by approximating the HIPPO initialization. This was expanded on theoretically in S4D (Gu, Gupta, et al. 2022). â ¢ S5 (Smith, Warrington, and Linderman 2023) independently discovered the diagonal SSM approximation, and is the ï¬ rst S4 model to be computed recurrently with the parallel scan. However, this required lowering the eï¬ ective state dimension, which they accomplished by switching the SSM dimensions from a SISO (single-input single-output) to MIMO (multi-input multi-output) formulation. Our proposed S6 shares the scan, but diï¬ ers by (i) keeping the SISO dimensions, which provides a larger eï¬ ective recurrent state, (ii) using a hardware-aware algorithm to overcome the computation issue, (iii) adding the selection mechanism. Lu et al. (2023) applied S5 to meta-RL in order to handle resetting the SSM state between episode trajectories. Their mechanism can be viewed as a particular hard-coded instance of a selection mechanism, where A is manually set to 0, instead of our learnable mechanism that depends on the input. It would be interesting to apply selective SSMs generically to this setting and probe if the model has learned to automatically reset its state on episode boundaries. â ¢ Mega (Ma et al. 2023) introduced a simpliï¬ cation of S4 to be real- instead of complex- valued, giving it an interpretation of being an exponential moving average (EMA). They additionally make an interesting connection of the discretization step of SSMs to an EMA damping term. Contrary to ï¬ ndings in the original S4 papers, this was the ï¬ rst model to show that real-valued SSMs are empirically eï¬ ective in certain settings or when combined with diï¬ erent architectural components. â', chunk_index=68, num_tokens=406, metadata={}), ResponseChunk(id='chunk_1392e563-fc5e-4422-bf25-1b266f28ca40', content='¢ Liquid S4 (Hasani et al. 2023) is also motivated by augmenting S4 with an input-dependent state transition. From this perspective it shares similarity to selection mechanisms, although in a limited form which is still computed convolutionally and close to LTI. â ¢ SGConv (Y. Li et al. 2023), Hyena (Poli et al. 2023), LongConv (Fu et al. 2023), MultiresConv (J. Shi, K. A. Wang, and Fox 2023), and Toeplitz Neural Network (Qin, Han, W. Sun, He, et al. 
2023) all focus on the convolutional representation of S4 and create global or long convolution kernels with diï¬ erent parameterizations. However, these methods cannot do fast autoregressive inference directly. Notably, all of these methods, and all other structured SSMs that we are aware of, have been non-selective and usually strictly LTI (linear time invariant). # B.2 SSM Architectures We use SSM architectures or state space neural networks (SSNN) to refer to deep neural network architectures incorporating one of the previous SSMs as a black box layer. â ¢ GSS (Mehta et al. 2023) was the ï¬ rst gated neural network architecture incorporating SSMs. It is motivated by the gated attention unit (GAU) of Hua et al. (2022) and looks quite similar to our block, except with additional projections. Most importantly, its projection contracts the model dimension to reduce the state size of the SSM, while ours expands the model dimension in order to increase the state size, based on the motivation in Section 3.1. 25 â ¢ Mega (Ma et al. 2023) combined the EMA simpliï¬', chunk_index=69, num_tokens=390, metadata={}), ResponseChunk(id='chunk_c0fd6e28-5e9f-45b4-9538-6fb516b949a9', content='cation of S4 described above into a hybrid architecture using an eï¬ cient attention approximation. â ¢ H3 (Dao, Fu, Saab, et al. 2023) is motivated by combining S4 with linear attention (Katharopoulos et al. 2020). It is the ï¬ rst to generalize this formulation of linear attention to more general recurrences, which is also the basis of later architectures. â ¢ Selective S4 (J. Wang et al. 2023) incorporates S4 as a black box to generate a binary mask which is multiplied on the input. While sharing the â selectionâ name, we consider this an architectural modiï¬ cation that is closer to architectural gating than a selection mechanism (Appendix A). For example, we hypothesize that it would not solve the Selective Copying task because simply masking out the irrelevant inputs does not aï¬ ect the spacing between the relevant ones (indeed, the Selective Copying task can even be viewed as coming pre-masked if the noise tokens are embedded to 0). â ¢ RetNet (Y. Sun et al. 2023) is also based on Linear Attention and very similar to H3, but reduces the inner S4 layer to a special case where the state dimension is ð = 1. Although not framed as such, its recurrence can be viewed as a special case of a linear SSM. Its primary source of improvement is using a linear attention with large head dimension, which can be viewed as another method to perform input-dependent state expansion. Using a larger head dimension in the context of linear attention variants was ï¬ rst done by H3, but not extensively used since this requires a proportional amount of extra computation.', chunk_index=70, num_tokens=362, metadata={}), ResponseChunk(id='chunk_14f4b170-676b-44ea-8291-21ab110c9501', content='RetNet avoids this with an alternate way to parallelize the computation with a variant of standard multi-head attention instead of convolutions, made feasible by their particular special case of SSMs which acts as a simple EMA. â ¢ RWKV (B. Peng et al. 2023) is another recent RNN designed for language modeling. It is based on AFT (attention-free Transformer (S. Zhai et al. 2021)), another variant of linear attention. Its main â WKVâ mechanism involves LTI recurrences and can be seen as the ratio of two SSMs. We also highlight the gated attention unit (GAU) from Hua et al. 
(2022), which was motivated by combining the Transformerâ s MHA and MLP blocks together and was an inspiration for our architecture (Section 3.4) combining the H3 and MLP blocks. # B.3 Relationship to RNNs RNNs and SSMs are broadly related, as they both involve the concepts of recurrence on a latent state. Several older RNNs such as the strongly typed RNN (Balduzzi and Ghifary 2016), quasi-RNN (QRNN) (Bradbury et al. 2016), and simple recurrent unit (SRU) (Lei 2021; Lei et al. 2017) involve forms of gated RNNs without time-wise nonlinearities. Because of the connections of gating mechanisms and selection mechanisms, these can be viewed as cases of selective SSMs, and are thus more powerful in a sense than the family of LTI structured SSMs above. The main diï¬ erences are: â ¢ They do not use state expansion (ð = 1) or selective B, C parameters, both of which are important for performance (Section 4.6). â', chunk_index=71, num_tokens=382, metadata={}), ResponseChunk(id='chunk_48705743-5989-4b64-9609-ba1c756a70d8', content='¢ They use a heuristic gating mechanism, which we generalize as a consequence of the selection mechanism + discretization (Theorem 1). The connections to principled SSM theory provides better parameterizations and initializations (Section 3.6). Additionally, older RNNs famously suï¬ ered from eï¬ ciency issues and the vanishing gradients problem (Pascanu, Mikolov, and Bengio 2013), both caused by their sequential nature. The latter could be solved for some of the above RNNs by leveraging the parallel scan (Martin and Cundy 2018), but the former was diï¬ cult without theory later developed for SSMs. For example, modern structured SSMs diï¬ er in more careful parameterization of the recurrent dynamics inspired by classical SSM theory (e.g. through discretization (Gu, Johnson, Goel, et al. 2021; Gu, Johnson, Timalsina, et al. 2023)), or direct analysis (Orvieto et al. 2023)). We also note that there is a long line of work on orthogonal RNNs (Arjovsky, Shah, and Bengio 2016; Henaï¬ , Szlam, and LeCun 2016; Lezcano-Casado and Martà nez-Rubio 2019; Mhammedi et al. 2017; Vorontsov et al. 2017) 26 which are motivated by constraining the A transition matrix to be orthogonal or unitary, in order to control its eigenvalues and prevent the vanishing gradient problem. However, these had other limitations; we believe that these stem from the fact that orthogonal/unitary RNNs are also LTI.', chunk_index=72, num_tokens=365, metadata={}), ResponseChunk(id='chunk_43b8a6b2-4306-4aa7-8219-c630f4ab1781', content='For example, they are almost always evaluated on the Copying task which they can solve perfectly, but observed to struggle on the Selective Copying task (Jing et al. 2019). # B.4 Linear Attention The Linear Attention (LA) (Katharopoulos et al. 2020) framework is an important result popularizing kernel attention and showing how it relates to recurrent autoregressive models. Many variants have proposed alternative kernels and other modiï¬ cations. Random Feature Attention (RFA) (H. Peng et al. 2021) chooses the kernel feature map to approximate softmax attention (i.e. the exp feature map) using the random Fourier feature approximation of Gaussian kernels (Rahimi and Recht 2007). Performer (Choromanski et al. 2021) ï¬ nds an approximation to the exponential kernel involving only positive features, which also allows the softmax normalization term. TransNormer (Qin, Han, W. Sun, D. Li, et al. 
2022) showed that the LA denominator term can be unstable and proposed replacing it with a LayerNorm. cosFormer (Qin, W. Sun, et al. 2022) augments RFA with a cosine reweighting mechanism that incorporates positional information to emphasize locality. Linear Randomized Attention (Zheng, C. Wang, and L. Kong 2022) generalizes RFA from the perspective of importance sampling, and generalizes it to provide better estimates of the full softmax kernel (rather than just the exp-transformed numerator).

Aside from kernel attention, many other variants of efficient attention exist; the survey Tay, Dehghani, Bahri, et al. (2022) offers an extensive categorization of many of these.

# B.5 Long Context Models

Long context has become a popular subject, and several recent models have claimed to scale to longer and longer sequences. However, these claims are often made from a computational standpoint and have not been extensively validated. These include:

• Recurrent Memory Transformer (Bulatov, Kuratov, and Burtsev 2023), a lightweight wrapper around a Transformer backbone. It showed ability to generalize up to 1M sequences but only on synthetic memorization tasks; their main result is similar to our Induction Heads extrapolation experiment (Table 2).
• LongNet (Ding et al. 2023), which claimed to scale to 1B length but only evaluated on length < 100K for actual tasks.
• Hyena and HyenaDNA (Nguyen, Poli, et al. 2023; Poli et al. 2023), which claimed to leverage up to 1M context. However, their experiments trained on proportionally more data at longer contexts, making it hard to conclude if quality improvements at 1M context are due to context length or due to more data and computation.
• Sparse Transformer (Child et al. 2019) showed a proof-of-concept of using a strided sparse attention Transformer to model audio waveforms of length 2^20 = 1048576, although it did not discuss performance tradeoffs when controlling for computation and model size.

In contrast, we believe this work presents one of the first approaches to meaningfully demonstrate increasing performance with longer context.

# C Mechanics of Selective SSMs

Proof of Theorem 1. Consider a selective SSM (Algorithm 2) with $N = 1$, $A = -1$, $B = 1$, $s_\Delta = \mathsf{Linear}(x)$, and $\tau_\Delta = \mathsf{softplus}$. The corresponding continuous-time SSM (1) is

$$h'(t) = -h(t) + x(t),$$

which is also called a leaky integrator.

The discretization step size is

$$\Delta_t = \tau_\Delta(\mathsf{Parameter} + s_\Delta(x_t)) = \mathsf{softplus}(\mathsf{Parameter} + \mathsf{Linear}(x_t)) = \mathsf{softplus}(\mathsf{Linear}(x_t)),$$

where we observe that the parameter can be viewed as a learnable bias and folded into the linear projection.

Now applying the zero-order hold (ZOH) discretization formulas:

$$\bar{A}_t = \exp(\Delta_t A) = \frac{1}{1 + \exp(\mathsf{Linear}(x_t))} = \sigma(-\mathsf{Linear}(x_t)) = 1 - \sigma(\mathsf{Linear}(x_t)),$$
$$\bar{B}_t = (\Delta_t A)^{-1}(\exp(\Delta_t A) - I) \cdot \Delta_t B = -(\exp(\Delta_t A) - I) = 1 - \bar{A}_t.$$

Thus the final discrete recurrence (2a) is

$$g_t = \sigma(\mathsf{Linear}(x_t)), \qquad h_t = (1 - g_t)\, h_{t-1} + g_t\, x_t,$$

as desired.
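The reduction in this proof is easy to check numerically. The following sketch (our own illustration, with an arbitrary stand-in for the learned linear projection) runs the ZOH-discretized scalar SSM with N = 1, A = −1, B = 1 and compares it against the gated recurrence of Theorem 1.

```python
# Minimal numerical check of the Theorem 1 reduction above. Illustrative only.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def softplus(z):
    return np.log1p(np.exp(z))

rng = np.random.default_rng(0)
L = 16
x = rng.normal(size=L)          # scalar input sequence (D = 1)
z = 0.5 * x + 0.1               # stands in for Linear(x_t); the weights are arbitrary

# Path 1: selective SSM with N = 1, A = -1, B = 1 and ZOH discretization.
A, B = -1.0, 1.0
delta = softplus(z)                          # Δ_t = softplus(Linear(x_t))
Abar = np.exp(delta * A)                     # Ā_t = exp(Δ_t A)
Bbar = (np.exp(delta * A) - 1.0) / A * B     # B̄_t = (Δ_t A)^{-1}(exp(Δ_t A) - 1) Δ_t B
h_ssm, h = np.zeros(L), 0.0
for t in range(L):
    h = Abar[t] * h + Bbar[t] * x[t]
    h_ssm[t] = h

# Path 2: the equivalent gated recurrence h_t = (1 - g_t) h_{t-1} + g_t x_t.
g = sigmoid(z)
h_gate, h = np.zeros(L), 0.0
for t in range(L):
    h = (1.0 - g[t]) * h + g[t] * x[t]
    h_gate[t] = h

assert np.allclose(h_ssm, h_gate)            # the two recurrences coincide
```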
# D Hardware-aware Algorithm For Selective SSMs

Without input-dependent selectivity, SSMs can be efficiently implemented as a convolution (Dao, Fu, Saab, et al. 2023; Gu, Goel, and Ré 2022), which leverages the fast Fourier transform (FFT) as a primitive. With selectivity, SSMs are no longer equivalent to convolution, but we leverage the parallel associative scan. While SSM scans are theoretically efficient (O(BLDN) FLOPs, scaling linearly in L), training foundation models with selective SSMs requires them to be efficient on modern hardware (GPUs) as well. We describe how we use kernel fusion and recomputation to make the SSM scan fast and memory-efficient. We evaluate the speed of our scan implementation compared to convolution and attention in Section 4.5, showing that it is up to 7× faster than attention at sequence length 32K, and is as memory-efficient as the best attention implementation (FlashAttention).

Speed. On modern hardware accelerators (GPUs) most operations (except matrix multiply) are bounded by memory bandwidth (Dao, Fu, Ermon, et al. 2022; Ivanov et al. 2021; Williams, Waterman, and Patterson 2009). This is the case with our scan operation, and we use kernel fusion to reduce the amount of memory IOs, leading to a significant speedup compared to a standard implementation.

The standard way to implement the scan algorithm in Section 3.2 is to prepare the scan input Ā, B̄ of size (B, L, D, N) in GPU HBM (high-bandwidth memory, commonly referred to as GPU memory), call a parallel associative scan implementation to write the scan output of size (B, L, D, N) to GPU HBM, then multiply that scan output with C to produce an output of size (B, L, D). However, this requires a number of memory reads/writes on the order of O(BLDN). We can instead fuse the discretization step, the scan, and the multiplication with C into one kernel:

1. We read in O(BLD + DN) bytes of memory (Δ, A, B, C) from slow HBM to fast SRAM.
2. We discretize to produce Ā, B̄ of size (B, L, D, N) in SRAM.
3. We perform a parallel associative scan, yielding intermediate states of size (B, L, D, N) in SRAM.
4. We multiply and sum with C, producing outputs of size (B, L, D), and write them to HBM.

This way, we reduce IOs by a factor of O(N) (the state dimension), which in practice speeds up the operation by 20-40 times (Section 4.5).

Table 11: (Induction heads.) Models are trained on sequence length 2^8 = 256, and tested on various sequence lengths from 2^6 = 64 up to 2^20 = 1048576. ✓ denotes perfect generalization accuracy, while ✗ denotes out of memory. Test accuracy (%) at each sequence length:

| Model | Params | 2^6 | 2^7 | 2^8 | 2^9 | 2^10 | 2^11 | 2^12 | 2^13 | 2^14 | 2^15 | 2^16 | 2^17 | 2^18 | 2^19 | 2^20 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MHA-Abs | 137K | ✓ | 99.6 | 100.0 | 58.6 | 26.6 | 18.8 | 9.8 | 10.9 | 7.8 | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ |
| MHA-RoPE | 137K | ✓ | ✓ | 100.0 | 83.6 | 31.3 | 18.4 | 8.6 | 9.0 | 5.5 | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ |
| MHA-xPos | 137K | ✓ | ✓ | 100.0 | 99.6 | 67.6 | 25.4 | 7.0 | 9.0 | 7.8 | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ |
| H3 | 153K | ✓ | ✓ | 100.0 | 80.9 | 39.5 | 23.8 | 14.8 | 8.2 | 5.9 | 6.6 | 8.2 | 4.7 | 8.2 | 6.3 | 7.4 |
| Hyena | 69M* | 97.7 | ✓ | 100.0 | ✓ | 44.1 | 12.5 | 6.6 | 5.1 | 7.0 | 5.9 | 6.6 | 6.6 | 5.9 | 6.3 | 9.8 |
| Mamba | 74K | ✓ | ✓ | 100.0 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |

*Most of the parameters are in learnable positional encodings.
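To make steps 1-4 concrete, here is a reference (unfused) selective scan in plain NumPy. It is a sketch for exposition with our own shape conventions, not the fused CUDA kernel described above: the sequential loop stands in for the parallel associative scan, and the (B, L, D, N) intermediates are materialized in ordinary memory rather than being kept in SRAM.

```python
# Reference selective scan, illustrating the computation that the fused kernel performs.
import numpy as np

def selective_scan_reference(delta, A, B, C, x):
    """delta, x: (B, L, D);  A: (D, N);  B, C: (B, L, N).  Returns y: (B, L, D)."""
    Bb, L, D = x.shape
    N = A.shape[1]
    # Discretization (ZOH for A, simplified Euler for B), materializing (B, L, D, N):
    Abar = np.exp(np.einsum("bld,dn->bldn", delta, A))
    Bbar = np.einsum("bld,bln->bldn", delta, B)
    h = np.zeros((Bb, D, N))
    y = np.zeros((Bb, L, D))
    for t in range(L):                       # sequential stand-in for the parallel scan
        h = Abar[:, t] * h + Bbar[:, t] * x[:, t, :, None]
        y[:, t] = np.einsum("bdn,bn->bd", h, C[:, t])
    return y

# Example shapes: batch 2, length 64, 4 channels, state size 16.
rng = np.random.default_rng(0)
Bb, L, D, N = 2, 64, 4, 16
y = selective_scan_reference(
    delta=np.log1p(np.exp(rng.normal(size=(Bb, L, D)))),          # positive step sizes
    A=-np.arange(1, N + 1, dtype=float)[None, :].repeat(D, 0),    # S4D-Real-style A
    B=rng.normal(size=(Bb, L, N)),
    C=rng.normal(size=(Bb, L, N)),
    x=rng.normal(size=(Bb, L, D)),
)
print(y.shape)  # (2, 64, 4)
```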
For sequence length L too long where we cannot fit the sequence in SRAM (which is much smaller than HBM), we split the sequences into chunks and perform the fused scan on each chunk. As long as we have the intermediate scan states, we can continue the scan with the next chunk.

Memory. We describe how we use the classical technique of recomputation to reduce the total amount of memory required to train selective SSM layers. From the way we fuse the forward pass, we do not save the intermediate states of size (B, L, D, N) to avoid memory blowup. However, these intermediate states are necessary for the backward pass to compute gradients. We instead recompute those intermediate states in the backward pass. Since the inputs Δ, A, B, C and output gradient read from HBM to SRAM are of size O(BLN + DN), and the input gradients are also of size O(BLN + DN), recomputation avoids the cost of reading O(BLND) elements from HBM. This means that recomputation of the SSM states in the backward pass speeds up the computation compared to storing them and reading them from HBM.

Beyond optimizing for the memory requirement of just the scan operation, we also use recomputation to optimize the memory requirement of the entire selective SSM block (input projection, convolution, activation, scan, output projection). In particular, we do not save intermediate activations that take a lot of memory but are fast to recompute (e.g. output of activation function or short convolution). As a result, the selective SSM layer has the same memory requirement as an optimized Transformer implementation with FlashAttention. In particular, each attention layer (FlashAttention) stores around 12 bytes of activations per token, and each MLP layer stores around 20 bytes of activations per token, for a total of 32 bytes (assuming mixed-precision training in FP16 or BF16). Each selective SSM stores around 16 bytes of activations per token. Hence two layers of selective SSMs have around the same activation memory as an attention layer and an MLP layer.

# E Experimental Details and Additional Results

# E.1 Synthetic Tasks

Selective Copying. Our setting is on sequences of length 4096, with a vocab size of 16 possible tokens (including the white "noise" token from Figure 2) and requiring models to memorize 16 "data" tokens. We use 2-layer models with a model dimension of D = 64. Models are trained for 400K steps at a constant learning rate of 0.0001 with a batch size of 64.

Induction Heads. Training consists of randomly generating data every step, with a batch size of 8. We choose an "epoch" size of 8192 steps, and track the accuracy on fixed validation sets (also randomly generated) of each target sequence length. For the MHA-Abs and Mamba models, results are reported after the 25th epoch (8192 × 25 = 204800 steps). For the MHA-RoPE and MHA-xPos models, results are reported after the 50th epoch (8192 × 50 = 409600 steps). For the LTI H3 and Hyena models, results are reported after the 10th epoch (81920 steps) because they had converged by then and failed to improve further.

Table 12: (Scaling Law Model Sizes.) Our model sizes and hyperparameters for scaling experiments. (Model dimension and number of heads applies only to Transformer models.)
| Params | n_layers | d_model | n_heads / d_head | Training steps | Learning rate | Batch size | Tokens |
|---|---|---|---|---|---|---|---|
| 125M | 12 | 768 | 12 / 64 | 4800 | 6e-4 | 0.5M tokens | 2.5B |
| 350M | 24 | 1024 | 16 / 64 | 13500 | 3e-4 | 0.5M tokens | 7B |
| 760M | 24 | 1536 | 16 / 96 | 29000 | 2.5e-4 | 0.5M tokens | 15B |
| 1.3B | 24 | 2048 | 32 / 64 | 50000 | 2e-4 | 0.5M tokens | 26B |

We use the Adam optimizer with no weight decay. All models are trained at constant learning rates 2e-4 and 1e-3, and the better results are reported for each model (2e-4 for all models except Mamba). The attention and Hyena models did not learn at LR 1e-3. H3 learned at both LRs, but interestingly generalized better to shorter sequences at the smaller LR of 2e-4. Mamba learned at both LRs, but extrapolated better at the larger LR of 1e-3.

# E.2 Language Modeling

# E.2.1 Scaling Law Details

All models were trained on the Pile.

Model Sizes. Table 12 specifies the model sizes we use for scaling laws. This is taken directly from the GPT3 specifications (Brown et al. 2020), with very minor modifications. First, we changed the batch size of the 1.3B model from 1M tokens to 0.5M tokens, since we did not use enough parallelization to require the larger batch size. Second, we changed the number of training steps and total tokens to roughly match Chinchilla scaling laws (Hoffmann et al. 2022), which specify that training tokens should increase proportionally to model size.

Training Recipes. All models used the AdamW optimizer with:
• gradient clip value 1.0
• weight decay 0.1
• no dropout
• linear learning rate warmup with cosine decay

By default, the peak learning rate is the GPT3 specification.

We give several models an "improved recipe", inspired by changes adopted by popular large language models such as PaLM (Chowdhery et al. 2023) and LLaMa (Touvron et al. 2023). These include:
• linear learning rate warmup with cosine decay to 1e-5, with a peak value of 5× the GPT3 value
• no linear bias terms
• RMSNorm instead of LayerNorm
• AdamW hyperparameter β = (.9, .95) (the GPT3 value) instead of the PyTorch default of β = (.9, .999)

Architecture and Training Details. Our models are:
• Transformer: The standard Transformer based on GPT3 (Table 12).
• Transformer++: A Transformer with an improved architecture, namely rotary positional encodings (Su et al. 2021) and SwiGLU MLP (Shazeer 2020), and the improved training recipe above.
• Hyena: Interleaving a Hyena block (the H3 block with S4 replaced by a global convolution parameterized by an MLP) with standard MLP blocks. The MLP blocks have expansion factor 2 instead of 4 and the number of layers is correspondingly increased by 1.5× to preserve parameter count.
• H3++: The H3 architecture with a few modifications, including (i) using the same "thin" Hyena dimensions above, (ii) the improved training recipe above, (iii) a linear attention head dimension of 8.
• RWKV: The default RWKV model from B. Peng et al. (2023), including its modified MLP block. We also used as much of its specified training recipe as possible, such as increasing the learning rates by 2× or 3× on certain parameters.
• RetNet: The default RetNet model from Y. Sun et al. (2023).
We also gave it the improved training recipe above. â ¢ Mamba: The standard Mamba architecture, with the improved training recipe. # E.2.2 Additional Scaling Law Ablations We perform additional ablations on the architecture using the same protocol as the 2k context length scaling laws in Figure 4 (Left). Mamba Architecture: Interleaving Blocks. We test the eï¬ ect of diï¬ erent architectural blocks combined with the Mamba block. We focus on the viewpoint that the Mamba block is simply the standard SwiGLU block with an extra ð ¼ð ð ð â ð ²ð ²ð ¬ path added. This leads to two natural ablations: â ¢ What if the Mamba block is interleaved with a standard MLP block, instead of stacked homogenously? This can also be interpreted as taking Mamba and removing half of the SSMs. â ¢ What if the Mamba block is interleaved with MHA (multi-head attention) blocks? This can also be interpreted as taking a Transformer with SwiGLU MLPs (i.e. what we call Transformer++) and simply adding SSMs to the MLP blocks. Figure 9 (Right) shows these variants compared to the original (homogenous) Mamba architecture. Interestingly, neither change matters too much.', chunk_index=83, num_tokens=394, metadata={}), ResponseChunk(id='chunk_b0927bb4-8208-4a69-8540-b0adbda5b0b4', content='The Mamba-MLP architecture is only slightly worse, and still better than all models except Transformer++. The Mamba-MHA architecture is only slightly better, which is somewhat surprising in light of the fact that many recent works have found that combining (LTI) SSMs with Attention can lead to substantial improvements (Dao, Fu, Saab, et al. 2023; Fathi et al. 2023; Fathullah et al. 2023; Saon, Gupta, and Cui 2023; Zuo et al. 2022). H3 Architecture: Training Recipes. Next we ablate diï¬ erences between the Hyena and H3++ models, our weakest and strongest models outside of Transformer++ and Mamba, particularly to isolate the eï¬ ect of training recipes. â ¢ Hyena: The Hyena block with its original architecture and GPT3 training recipe (same as Figure 4). â ¢ Hyena+: The same architecture but with the improved training recipe described above. â ¢ H3+: The same architecture as Hyena+ but with the Hyena convolution kernel swapped out for S4D convolution kernel. â ¢ H3++: The same as H3+, but with a linear attention head dimension of 8. This increases computation inside the SSM recurrence but does not increase parameters. Our general convention is that â Model+â represents the base model with the improved training recipe, and â Model++â also allows for architectural changes. Figure 9 (Right) shows that A large improvement is achieved by the improved training recipe, which was used for many of the models in the main Figure 4 (RetNet, H3++, Transformer++, Mamba). The choice of the inner LTI SSM does not matter (e.g. Hyena vs. S4), consistent with ï¬ ndings throughout this paper.', chunk_index=84, num_tokens=390, metadata={}), ResponseChunk(id='chunk_45e94e1b-fd7f-4e12-ae91-147bcf691880', content='The head dimension expansion improves performance, consistent with one of our main themes that expanded state dimension improves performance for SSMs (Section 3). 31 Scaling Laws on The Pile (Sequence Length 2048) Scaling Laws on The Pile (Sequence Length 2048) â â Mamba Hyena Mamba-mLp | = â Hyenas â â Members |g â â He a â He 3 Sox! = 2104 ext? 5 2S 7x0 Ea 1 1 1 1 10 30 10° 10â FLOPS (log scale) FLOPs (log scale) s 5 2 3 2 = 3 8 Figure 9: (Scaling laws: extra ablations.) 
(Left) Instead of (Right) Instead of # E.2.3 Downstream Evaluation Details This pretraining procedure is the same as the scaling law protocol, but extended to 300B tokens. For the 1.3B model, we use a batch size of 1M tokens to be consistent with the GPT3 speciï¬ cations. We report the perplexity on the Pile validation set, and for this metric only compare to models trained on the same dataset and with the same tokenizer, in particular Pythia and RWKV. For downstream evaluation, we use the LM evaluation harness from EleutherAI (L. Gao, Tow, et al. 2021), as done by most work in this area. We evaluate on the following tasks/datasets that measure common sense reasoning: â ¢ LAMBADA (Paperno et al. 2016). â ¢ HellaSwag (Zellers et al. 2019). â ¢ PIQA (Bisk et al. 2020). â ¢ ARC-challenge (P. Clark et al. 2018). â', chunk_index=85, num_tokens=397, metadata={}), ResponseChunk(id='chunk_adccab88-8116-4222-a589-2316f3a4d275', content='¢ ARC-easy: an easy subset of ARC-challenge. â ¢ WinoGrande (Sakaguchi et al. 2021). We report accuracy for LAMBADA, WinoGrande, PIQA, and ARC-easy, and accuracy normalized by sequence length for HellaSwag and ARC-challenge (since normalized accuracy is higher for almost all models for these task). # E.3 DNA Modeling # E.3.1 Pretraining Details We describe the dataset and training procedure of the HG38 pretraining task in more detail. The dataset follows the splits from the prior Enformer work on genomics (Avsec et al. 2021); the training split contains a total of ð = 34021 segments of length 217 = 131072 that cover the genome, for a total of approximately 4.5 billion tokens (DNA base pairs). These segments are pairs of (chromosome number, starting index, ending index), and can be extended if necessary (e.g. to get longer segments). We deviate from HyenaDNA when the training sequence length is not 217. HyenaDNA always takes a ï¬ xed sub-segment (e.g. the beginning or middle of the prescribed segment), and thus for any training sequence length each epoch is ï¬ xed to 34021 samples and doesnâ t necessarily go through the whole genome. On the other hand, we use the entire training data: â ¢ When the context length ð ¿ is less than (or equal to) 217, we divide up each segment into non-overlapping sub-segments of length ð ¿, so that there are ð à 217 ð ¿ total samples and ð à 217 â 4.5ð µ tokens per epoch. â ¢ When the context length ð', chunk_index=86, num_tokens=379, metadata={}), ResponseChunk(id='chunk_c1fb92e1-86ae-4d99-a872-a76464a723b3', content='¿ is greater than 217, we turn each segment into two samples, one that begins with the prescribed segment and one that ends with the prescribed segment. Thus each epoch has 2ð items and 2ð ð ¿ 32 tokens per epoch. For example, at sequence length 218 = 262144 there are 4à as many tokens as the default, and at sequence length 220 there are 16à as many tokens. Other training details generally follow the same protocol as our language modeling experiments (Appendix E.2). For example, we use the AdamW with (ð ½1, ð ½2) = (0.9, 0.95), no dropout, weight decay 0.1. We use a cosine learning rate scheduler with linear warmup for 10% of total steps. # E.3.2 Scaling: Model Size Details Models. The models we consider are: â ¢ Transformer++: a Transformer with improved architecture, notably the usage of RoPE positional encodings (Su et al. 2021). Informally, we found these to be noticeably better than vanilla positional encodings from (Vaswani et al. 2017). â ¢ HyenaDNA: the Hyena model from Nguyen, Poli, et al. (2023) and Poli et al. 
(2023), which is roughly a Transformer with the MHA block replaced by an H3 block using a global convolution parameterized by an MLP. â ¢ Mamba: the standard Mamba architecture. Model Sizes. We use the following model sizes. Blocks Model Dimension Params (Approx.) 4 64 250K 700K 1.4M 3.5M 7.0M 19.3M 40.7M 5 96 6 128 7 192 8 256 10 384 12 512', chunk_index=87, num_tokens=397, metadata={}), ResponseChunk(id='chunk_073aa678-dc5c-46df-85eb-729754d3d529', content='Note that the number of blocks for Mamba is doubled, because one Transformer â layerâ includes both the MHA and MLP blocks (and similarly for Hyena), which requires two Mamba blocks to match parameters (Section 3.4). Training. For each model (Transformer++, HyenaDNA, Mamba), we swept the learning rate across {1ð â 3, 2ð â 3, 4ð â 3, 8ð â 3}. The optimal Transformer and HyenaDNA learning rates were 2e-3 across all sizes. The optimal Mamba learning rate was 8e-3; note that Mamba performed better than baselines with matched learning rates (2e-3), but was more stable and improved even more at higher learning rates. (Furthermore, as this LR is on the upper range of the sweep, it is possible that our results are still suboptimal.) Note that, in contrast to standard LM scaling laws (Table 12), our LR held constant across model sizes for simplicity. The optimal LR should go down for larger models, but we didnâ t ï¬ nd a noticeable eï¬ ect at the small model sizes (at most a few million parameters) we considered. E.3.3 Scaling: Context Length Details We use a total batch size of 224 â 16ð tokens per training step, for every sequence length (e.g. at length 220 there are 16 segments per batch and at length 210 there are 16384 segments per batch). This is a large batch size relative to the model size by usual LM standards, but note that a batch size of 223 is the minimum possible on a machine with 8 GPUs and sequence length of 220, and that HyenaDNA used much larger batches of 228.', chunk_index=88, num_tokens=372, metadata={}), ResponseChunk(id='chunk_11da5774-bc25-43ac-9797-be88490cfb1a', content='The learning rate used was 0.008 for Mamba and 0.001 for HyenaDNA; we initially attempted to use the same learning rate of 0.002 from the previous section for HyenaDNA, but found that it was unstable at the longest context length. Sequence Length Warmup. Following (Nguyen, Poli, et al. 2023), we use sequence length warmup (SLW) during pretraining. We choose a simple schedule of 2 epochs at each power-of-two sequence length starting from 210 = 1024. (Note that because of how data is curated, at the longest sequence lengths more steps and tokens are spent proportionally. In particular, each stage up to length 217 processes the same number of tokens, but 4à as many tokens are processed at length 218, 8à as many at length 219, and 16à as many at length 220.) Unlike HyenaDNA, we always control for the number of tokens per gradient update, so the batch size is successively halved as the sequence lengths are doubled in each stage. 33 Table 13: (Great Apes DNA Classification.) Accuracy after fine-tuning on sequences of length 210 = 1024 up to 220 = 1048576 using pretrained models of the same context length. Random guessing is 20%. Params Accuracy (%) at Sequence Length 210 212 214 216 218 220 28.04 31.47 28.43 27.50 41.17 27.66 42.22 40.72 31.10 42.41 7M 30.00 29.01 31.48 43.73 56.60 Remark E.1. 
We also note that the schedule was not tuned, and we never experimented with turning off sequence length warmup for these pretraining experiments.', chunk_index=89, num_tokens=393, metadata={}), ResponseChunk(id='chunk_64f1a882-0484-424c-8dcb-6659a395394f', content='We later found that SLW did not help noticeably for audio pretraining at similar lengths (Section 4.4), and it is possible that it is not necessary for DNA pretraining either. # E.3.4 Species (Great Apes) Classification Models are causal and therefore only the last element (across the sequence length) of the modelâ s output is used for the classiï¬ cation head. Note that we control for the total number of elements in the loss function per gradient step. The pretraining objective includes all positions across the sequence length, so that ð ð ð ð ð _ð ð ð £ð à ð ð ð ð ð ð ð ð _ð ð ð ð ð ð is held constant; in other words, the batch size decreases as the sequence length increases. However, for a classiï¬ cation task, since only the last position enters the loss, the batch size itself is held constant. Note that this also means that ï¬ ne-tuning models with longer sequence lengths is more computationally expensive. Training consists of 10 epochs, each of which has 1024 gradient steps. Each gradient step uses batch size 64, which are all independently randomly drawn by uniformly picking a species, uniformly picking a chromosome, and then uniformly picking a contiguous segment of DNA. Following (Nguyen, Poli, et al. 2023), models with a maximum context length greater than 214 = 16384 use sequence length warmup with 1 epoch at length 214 = 16384, 1 epoch at length 215 = 32768, 1 epoch at length 216 = 65536, and so on up to the maximum sequence length. For example, the model with 220 = 1048576 context undergoes 6 epochs of sequence length warmup before 4 more epochs at its maximum sequence length. The learning rate for all Hyena models is ð ºð â ð', chunk_index=90, num_tokens=421, metadata={}), ResponseChunk(id='chunk_93e65dec-30cb-4338-ade0-9bf86da7afc6', content='», while the learning rate for all Mamba models is ð ·ð â ð º. These were found by performing learning rate sweeps for each model among {1ð â 5, 2ð â 5, 4ð â 5, 1ð â 4, 2ð â 4} for the smaller sequence lengths (210, 212, 214, 216), and these values were consistently found to be the best for each model. An abridged learning rate sweep was done at length 218, which agreed with these values, and a single run at length 220 was performed (as described above, the computational cost of these experiments is proportional to the sequence length). The learning rate followed a cosine decay schedule with warmup with 5 epochs of linear warmup to the maximum learning rate, and 5 epochs of cosine decay down to 1ð â 6. The unusually long learning rate warmup schedule was chosen because the sequence length warmup was also long (e.g. comprising 6 out of 10 epochs for the model with context length 220); we did not experiment with this choice. Results for the Species classiï¬ cation task are in Table 13. # E.4 Audio Details # E.4.1 YouTubeMix Audio Pretraining Model. We use a model with 3 blocks per stage (3 à 5 = 15 total Mamba blocks), pooling factor ð = 16, and outer dimension ð · = 64, for about 3.5M parameters. Dataset. The data is mu-law encoded at 8 bits, so the model is modeling discrete tokens with a vocab size of 256. The dataset consists of clips of up to 1 minute long, or length 960000, which is subsampled and divided into segments of any desired sequence length. 
Since the architecture involves two stages of pooling by a factor of 16, 34 Table 14:', chunk_index=91, num_tokens=403, metadata={}), ResponseChunk(id='chunk_d1392820-0ac1-49ca-a435-0be0566d7f4e', content='YouTubeMix length scaling sequence lengths and batch sizes. 468 à 2048 = 958464 234 à 2048 = 479232 117 à 2048 = 239616 59 à 2048 = 120832 30 à 2048 = 61440 15 à 2048 = 30720 8 à 2048 = 16384 4 à 2048 = 8192 1 2 4 8 16 32 64 128 958464 958464 958464 966656 983040 983040 1048576 1048576 Audio Waveforms - SSM Parameterization aso â â samp â â Mamba (s6) = â sy = sSeaive B/C ° 1.40 4 â â -selective A s ras | __Mamba-$4) B 1204 124 108 108 Sequence Length Audio Waveforms - SSM Parameterization â â Mamba ($6) 4 â â +complex = Solestive a | (Mamba-S4) 1.35 1.304 1.254 108 108 Sequence Length 1.48 21404 . é ag Figure 10: (Audio Pretraining (YouTubeMix) Ablations.) As a uniformly-sampled â continuousâ signal modality, audio wave- forms actually benefit from LTI models which have matching inductive bias. (Left) Homogenous models (all blocks have the same parameterization) (Right) Only the center U-Net blocks are ablated; the outer blocks are Mamba-S4. Purple line is same as figure on left. and we want the resulting sequence length to be a a multiple of 8 for hardware eï¬ ciency, the longest possible sequence is 468 à 2048 = 958464. The rest of our sequence lengths are deï¬', chunk_index=92, num_tokens=400, metadata={}), ResponseChunk(id='chunk_c5886e2b-8387-48e0-b21e-5abbd46a0e00', content='ned by successively halving this and rounding up to the nearest multiple of 2048. Table 14 lists the speciï¬ cations used in Figure 7. Beyond the varying batch sizes, the number of valid segments in the training set varied between diï¬ erent sequence lengths (e.g. the number of training steps per epoch was not constant for diï¬ erent points in the graph), which may have contributed to kinks in the scaling curves. Training. Models were trained for 200ð ¾ training steps with a maximum learning rate of 0.002, 20ð ¾ (10%) warmup steps, and weight decay 0.1 (similar to our general pretraining recipe across domains). Additional Ablations: SSM Parameterizations. We investigate SSM parameterizations on long-form audio waveform pretraining in the setting of Figure 7. The setting is modiï¬ ed slightly to use larger models (8 layers and ð · = 64 for 6M params, the SaShiMi default), shorter sequences (211 = 2048 to 218 = 262144 instead of 213 to 220), lower LR (0.001 from 0.002), and shorter training cycles (100K instead of 200K steps). Figure 10 shows that the change from S4 â S6 (i.e. the selection mechanism) is not always beneï¬ cial. On long-form audio waveforms, it in fact signiï¬ cantly hampers performance, which may be intuitive from the point of view that audio is uniformly sampled and very smooth, and therefore beneï¬ ts from continuous linear time-invariant (LTI) methods. After ablating away the selection mechanism, note that the resulting model is the S4 layer inside the Mamba block. To disambiguate, we call this Mamba-S4 as opposed the default Mamba architecture Mamba-S6.', chunk_index=93, num_tokens=403, metadata={}), ResponseChunk(id='chunk_b1b61601-18c6-4c31-be9d-c3b9024e4c90', content='However, on the right side, we keep the outer layers of the U-Net Mamba-S4 and ablate only the inner layers. 
The performance diï¬ erences shrink dramatically; this reinforces the hypothesis that layers closer to the raw audio signal should be LTI, but once they are â tokenizedâ and compressed by the outer layers, the inner layers no longer need to be LTI. In this setting however, the real-valued SSM still underperforms the complex-valued one. 35 # E.4.2 SC09 Speech Generation Autoregressive training largely followed the autoregressive language modeling protocol, such as â ¢ Weight decay 0.1 â ¢ Learning rate warmup for 10% of total steps â ¢ AdamW optimizer with ð ½ = (0.9, 0.95) â ¢ Gradient clip value 0.1 We used a learning rate of 0.002 and 200000 training steps at a batch size of 16. The large Mamba model in Table 4 has 15 layers per stage with an outer dimension of ð · = 96 and pooling factor 4. We note that this dataset is small (training went through 100 epochs) and for this large model, there was signiï¬ cant overï¬ tting of the BPB or NLL. However, automated metrics of generated samples continually improving throughout training. The models in the architecture ablations in Table 5 all have 8 layers per stage with an outer dimension of ð ³ = 64 and pooling factor 4. The S4+MLP block has roughly 2ð ·2 + 4ð ·2 parameters (expansion factor 2 in the MLP). The Transformer block has 4ð ·2 + 2ð ·2 parameters (expansion factor 1 in the MLP). The Mamba block has the usual â 6ð ·2 parameters.', chunk_index=94, num_tokens=403, metadata={}), ResponseChunk(id='chunk_3ce240ee-4972-475e-b421-6105be4473af', content='All models have roughly 6M total parameters. # E.5 Efficiency Benchmark Scan Operation. We compare the core operation of selective SSMs, which is the parallel scan (Section 3.3), against convolution and attention, measured on an A100 80GB PCIe GPU. Note that these do not include the cost of other operations outside of this core operation, such as computing the convolutional kernel in global-convolution models, or computing the QKV projections in attention. As a baseline, we implement a standard parallel scan in PyTorch with no kernel fusion. This requires materializing the parameters A, B, C in HBM. Our scan implementation fuses the discretization step and the parallel scan, avoiding the cost of materializing all the large parameters in HBM. For convolution, we use the standard implementation in PyTorch, which separately performs FFTs on the inputs and the ï¬ lters, multiply them in frequency domain, then performs an inverse FFT to obtain the result. The theoretical complexity is ð (ð ¿ log(ð ¿)) for sequence length ð ¿. For attention, we compare against the fastest implementation that we are aware of (FlashAttention-2 (Dao 2023)), with causal mask. Note that FlashAttention-2 with causal mask is about 1.7à faster than without causal mask, since approximately only half of the attention entries are computed. We use batch size of 1 and increase the sequence length from 29 = 512, 210 â 1ð ¾, 211 â 2ð ¾, up to 219 â 500ð ¾ (some of the baselines run out of memory before reaching 500K). We use a model dimension of ð · = 1024 and state dimension ð = 16. We measure with BF16 inputs, which is the data type most commonly used for large scale training. End-to-end Inference.', chunk_index=95, num_tokens=399, metadata={}), ResponseChunk(id='chunk_bf38601c-55c9-4b34-8fc0-c4334255c0e4', content='We measure the inference throughput of a Mamba 1.4B model and an untrained Mamba 6.9B model, against a standard Transformer (GPT3 architecture) at 1.3B and 6.7B size. 
We use the standard Transformer implementation in the Huggingface transformers library. We set the prompt length to be 2048 and the generation length to be 128. We vary the batch size from 1, 2, 4, 8, 16, 32, 64, to 128, and measure time time taken to generate 128 tokens. We then calculate the throughput (tokens/s) as batch size à 128â time taken. We repeat the measurements 3 times and take the average. Measurements are done on an A100 80GB PCIe GPU. Memory Benchmark. The memory usage simply scales proportionally to the size of the activation tensors, as with most deep sequence models. We report measurements of the training memory requirements of 125M models 36 Table 15: (Memory benchmark.) Mambaâ s memory footprint is comparable to the most optimized Transformer. Results for 125M models. Batch size Transformer (w/ FlashAttention-2) Mamba 1 2 4 8 16 32 4.6GB 5.2GB 6.9GB 11.5GB 20.7GB 34.5GB 4.8GB 5.8GB 7.3GB 12.3GB 23.1GB 38.2GB on 1 A100 80GB GPU. Each batch consists of sequences of length 2048. We compare to the most memory-eï¬ cient Transformer implementation we are aware of (with kernel fusion from torch.compile and with FlashAttention-2). Table 15 shows that Mambaâ', chunk_index=96, num_tokens=384, metadata={}), ResponseChunk(id='chunk_0a47e8f9-a143-48af-afd1-d84ccb148358', content='s memory requirement is comparable to a similar-sized Transformer with an extremely optimized implementation, and we expect further improvement in Mambaâ s memory footprint in the future. 37', chunk_index=97, num_tokens=34, metadata={})])\n" - ] + "data": { + "text/plain": [ + "97" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "print(response_regex)" + "response_regex.document.num_chunks" ] }, { diff --git a/examples/03_extract_async.ipynb b/examples/03_extract_async.ipynb index ebe4324..672f9f7 100644 --- a/examples/03_extract_async.ipynb +++ b/examples/03_extract_async.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -45,6 +45,8 @@ "metadata": {}, "outputs": [], "source": [ + "# PDF\n", + "\n", "from aurelio_sdk import ExtractResponse\n", "\n", "file_path = \"data/pdf/adaptive_semantic_search.pdf\"\n", @@ -60,10 +62,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:22 - at client_async.py:181 in extract_file(): Uploading file from path, data/video/how_to_overcome_our_mistakes.mp4\u001b[0m\n", + "\u001b[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:22 - at client_async.py:488 in _file_stream_generator(): Reading chunk 1, chunk_size: 41943040, total bytes: 8258456\u001b[0m\n", + "\u001b[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:22 - at client_async.py:491 in _file_stream_generator(): Stream finished, total chunks: 1, file size: 7.88 MB\u001b[0m\n", + "\u001b[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:23 - at client_async.py:397 in wait_for(): Starting polling for document completion: doc_b864cb88-8095-4ce4-85b4-d6280ed800aa\u001b[0m\n", + "\u001b[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:28 - at client_async.py:417 in wait_for(): Polling document doc_b864cb88-8095-4ce4-85b4-d6280ed800aa: 
status=TaskStatus.pending\u001b[0m\n", + "\u001b[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:33 - at client_async.py:417 in wait_for(): Polling document doc_b864cb88-8095-4ce4-85b4-d6280ed800aa: status=TaskStatus.pending\u001b[0m\n", + "\u001b[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:38 - at client_async.py:417 in wait_for(): Polling document doc_b864cb88-8095-4ce4-85b4-d6280ed800aa: status=TaskStatus.pending\u001b[0m\n", + "\u001b[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:43 - at client_async.py:417 in wait_for(): Polling document doc_b864cb88-8095-4ce4-85b4-d6280ed800aa: status=TaskStatus.pending\u001b[0m\n", + "\u001b[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:48 - at client_async.py:417 in wait_for(): Polling document doc_b864cb88-8095-4ce4-85b4-d6280ed800aa: status=TaskStatus.completed\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "status= usage=Usage(tokens=838, pages=None, seconds=291) message=None processing_options=ExtractProcessingOptions(chunk=True, quality=) document=ResponseDocument(id='doc_b864cb88-8095-4ce4-85b4-d6280ed800aa', content=\" In a 2019 study, over 400 participants were enlisted to learn a mysterious, invented language. Individuals were asked about three pairs of runes. For example, which of these two characters represents an animal? Then, after a brief break, they were asked about the same roon pairs, with questions flipped. as in which of these two runes represents a non-living object. But this game had a secret. enrolled in introductory and advanced French courses. These students completed a questionnaire asking what kind of teacher they preferred, one who emphasized their strength and successes, or one who highlighted their mistakes and corrected their weaknesses. In general, responses showed that while beginner students sought positive reinforcement, advanced students were more eager for critical feedback. Researchers have theorized a handful of explanations for these results. Having just started out, beginners are still determining if they enjoy learning French, and if they want to continue studying. People often describe failure as a teachable moment, a necessary stumble on our way to improvement. But learning from our mistakes isn't always easy, especially when those failures are demoralizing, overwhelming, or just downright confusing. So what exactly prevents us from turning our mistakes into mastery? Perhaps the most obvious hurdle to learning from failure is how painful it can be. People generally want to think of themselves as capable and competent, and experiencing failure threatens that self-image. In a survey following a replication of... forget all your successes. And building on what you're doing right can be more effective than focusing on what you did wrong. One of the ways we can be more constructive with ourselves is by considering how we talk to ourselves. Self-talk can actually play a major role in performance. Learn the key to doing it right with this video. Or get actionable science-based advice on how to build character at ed.t.com slash build character. the Roon study, participants in the failure group indicated much lower levels of self-confidence after participating. It's tempting to dismiss this pain as a temporary setback, but some studies have found that when people feel demoralized or incompetent, their brains often stop processing new information. This suggests that if a threat to your self-esteem is large enough, it can undermine your ability to learn. 
However, your tolerance for failure also depends on. on your relationship with the task at hand. In a study from 2011, researchers surveyed a group of American students what and how much to study, and you can replicate those decisions for the next test. But if you failed, it could be for any number of reasons. Maybe you didn't study enough. Maybe you studied the wrong information. Or maybe you did everything right and the test covered things you shouldn't have been expected to know. In cases like this, it's unclear exactly what went wrong, making it difficult to learn how to improve. Wanting to learn from our failures is completely natural, and there's a lot of to gain by being resilient and cultivating a growth mindset. But fixating on your failures can make it easy to fail. so they might crave praise as a way to stay motivated. On the other hand, the advanced students are already invested, so they may want to improve their skills as efficiently as possible. The process of gaining expertise also comes with its fair share of failure, so the advanced students may have built a higher tolerance for making mistakes. But whether you're an expert or a novice, it's usually much more straightforward to learn from your successes than your failures. For example, imagine getting your grade back on an exam. If you aced it, you could reasonably assume you made good choices around when... The subject's answers in round one determined the rune's meanings in round two. In the first round, participants either had all their answers marked as correct, no matter what, or they were forced to fail every question. This meant that at the break, every participant had the same amount of information, and in round two, they were playing for real. But despite this even playing field, the successful participants from round one rose to the top of the ranks, while those cast as failures kept, well, failing. People\", source='', source_type=, num_chunks=3, metadata={}, chunks=[ResponseChunk(id='chunk_f06db1cb-fb94-4d98-a8d5-c319c88238f6', content=\"In a 2019 study, over 400 participants were enlisted to learn a mysterious, invented language. Individuals were asked about three pairs of runes. For example, which of these two characters represents an animal? Then, after a brief break, they were asked about the same roon pairs, with questions flipped. as in which of these two runes represents a non-living object. But this game had a secret. enrolled in introductory and advanced French courses. These students completed a questionnaire asking what kind of teacher they preferred, one who emphasized their strength and successes, or one who highlighted their mistakes and corrected their weaknesses. In general, responses showed that while beginner students sought positive reinforcement, advanced students were more eager for critical feedback. Researchers have theorized a handful of explanations for these results. Having just started out, beginners are still determining if they enjoy learning French, and if they want to continue studying. People often describe failure as a teachable moment, a necessary stumble on our way to improvement. But learning from our mistakes isn't always easy, especially when those failures are demoralizing, overwhelming, or just downright confusing. So what exactly prevents us from turning our mistakes into mastery? Perhaps the most obvious hurdle to learning from failure is how painful it can be. 
People generally want to think of themselves as capable and competent, and experiencing failure threatens that self-image. In a survey following a replication of. . . forget all your successes.\", chunk_index=1, num_tokens=290, metadata={'start_time': 0, 'end_time': 116}), ResponseChunk(id='chunk_c1629bc0-47d7-40eb-81cd-a63a3b664c96', content=\"And building on what you're doing right can be more effective than focusing on what you did wrong. One of the ways we can be more constructive with ourselves is by considering how we talk to ourselves. Self-talk can actually play a major role in performance. Learn the key to doing it right with this video. Or get actionable science-based advice on how to build character at ed.t.com slash build character. the Roon study, participants in the failure group indicated much lower levels of self-confidence after participating. It's tempting to dismiss this pain as a temporary setback, but some studies have found that when people feel demoralized or incompetent, their brains often stop processing new information. This suggests that if a threat to your self-esteem is large enough, it can undermine your ability to learn. However, your tolerance for failure also depends on. on your relationship with the task at hand. In a study from 2011, researchers surveyed a group of American students what and how much to study, and you can replicate those decisions for the next test. But if you failed, it could be for any number of reasons. Maybe you didn't study enough. Maybe you studied the wrong information. Or maybe you did everything right and the test covered things you shouldn't have been expected to know. In cases like this, it's unclear exactly what went wrong, making it difficult to learn how to improve.\", chunk_index=2, num_tokens=280, metadata={'start_time': 110, 'end_time': 204}), ResponseChunk(id='chunk_93f47369-6c09-47e5-96b2-a081097cd51d', content=\"Wanting to learn from our failures is completely natural, and there's a lot of to gain by being resilient and cultivating a growth mindset. But fixating on your failures can make it easy to fail. so they might crave praise as a way to stay motivated. On the other hand, the advanced students are already invested, so they may want to improve their skills as efficiently as possible. The process of gaining expertise also comes with its fair share of failure, so the advanced students may have built a higher tolerance for making mistakes. But whether you're an expert or a novice, it's usually much more straightforward to learn from your successes than your failures. For example, imagine getting your grade back on an exam. If you aced it, you could reasonably assume you made good choices around when. . . The subject's answers in round one determined the rune's meanings in round two. In the first round, participants either had all their answers marked as correct, no matter what, or they were forced to fail every question. This meant that at the break, every participant had the same amount of information, and in round two, they were playing for real. But despite this even playing field, the successful participants from round one rose to the top of the ranks, while those cast as failures kept, well, failing. 
People\", chunk_index=3, num_tokens=266, metadata={'start_time': 204, 'end_time': 289})])\n" + ] + } + ], "source": [ + "# Video\n", "from aurelio_sdk import ExtractResponse\n", "\n", "# From a local file\n", @@ -75,7 +101,7 @@ "\n", "response_video_file\n", "\n", - "print(response_video_file.document.content)" + "print(response_video_file)" ] }, { @@ -87,10 +113,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "ExtractResponse(status=, usage=Usage(tokens=11875, pages=8, seconds=None), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=), document=ResponseDocument(id='doc_b3c94774-dace-4ab3-8af1-b4a1c8acfb19', content='4 2 0 2\\ng u A 6 2\\nR S . h p - o r t s a [\\n1 v 1 9 2 5 1 . 8 0 4 2 : v i X r a\\nA temperature scale of 1 2 eV in the mass-radius relationship of white dwarfs of type DA\\nJin Lima, Ji-Yu Kima, Maurice H.P.M. van Puttena,1,\\naPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea\\nAbstract\\nThe mass-radius relationship of white dwarfs (WDs) is one of their defining characteristics, largely derived from electron degen- eracy pressure. We present a model-independent study of the observed mass-radius relationship in WD binaries of Parsons et al. (2017), listing data over a broad temperature range up to about 60,000 K (5 eV). The data show an appreciable temperature sen- sitivity with pronounced intrinsic scatter (beyond measurement uncertainty) for the canonical He-models with proton to neutron ratio 1:1. We characterize temperature sensitivity by a temperature scale T0 in model-agnostic power-law relations with tempera- ture normalized radius. For low-mass WDs, the results identify a remarkably modest T0 = 1 2 eV. We comment on a potential interpretation for atmospheres insulating super-Eddington temperature cores from the sub-Eddington photospheres of low-mass WDs.\\nKeywords: white dwarfs, mass-radius relation, Chandrasekhar\\n1. Introduction\\nWhite dwarfs (WDs) represent the final evolutionary stage of stars with masses in the range of 0.4M < M < 8M . As such, they are quite numerous. The Gaia Early Data Release 3 (EDR3) contains 359,073 WD candidates (Gentile Fusillo et al., 2021). EDR3 extends the survey of 29,294 WDs of the Sloan Digital Sky Survey (SDSS) Data Release 16 (DR 16) by over an order of magnitude with 25,176 WDs in both surveys. WDs are variously classified by their atmospheric composition (Gen- tile Fusillo et al. 2021). Most are of type DA characterized by a dominance of hydrogen lines in their spectra. It reveals an opaque hydrogen atmosphere, whose observed temperatures are well below the Eddington limit (Fig. 1).\\nWDs have a distinct mass-radius relationship with a lower bound defined by electron degeneracy pressure (Str omgren, 1939). It is described by the equation of state (EoS) of de- generacy pressure of a Fermi gas, the relativistic limit of which defines the Chandrasekhar mass-limit. To leading order, WDs are modeled by the ideal Fermi gas at zero temperature sur- rounded by a non-degenerate atmosphere (Koester and Chan- mugam, 1990). At zero temperature, the degenerate WD core is effectively parameterized only by a proton-to-neutron ratio. 
In this limit, the mass-radius relation satisfies (Str omgren, 1939; Hamada and Salpeter, 1961; Koester et al., 1979)\\nR = C0\\n(cid:32)\\nM M \\n(cid:33) 1/3\\nwhere\\nC0 =\\n(cid:32) 16G3m3 e 81 2 6\\n(cid:33) 1/3 (cid:32) Amp Z\\n(cid:33) 5/3\\nM 1/3 \\n 0.01 R ,\\nEmail address: mvp@sejong.ac.kr (Maurice H.P.M. van Putten) 1INAF-OAS Bologna via P. Gobetti 101 I-40129 Bologna Italy, Italy\\n(1)\\n(2)\\nwhere G is Newton s constant, is Planck s constant, me and mp are the masses of the electron and proton, A is the atomic number and Z denotes the number of protons in a nucleus - here, mostly alpha-nuclei. We have set the A/Z ratio to 2, since many WDs consist of a C/O or He core, the latter in particular expected for low-mass WDs (Iben and Tutukov, 1985; Marsh et al., 1995; Nelemans et al., 2001; Han et al., 2002; Istrate et al., 2016; Ren et al., 2018; Zenati et al., 2019). According to this calculation, R = C0M 1/3 holds.\\nFig. 2 shows the observed mass-radius relation for 26 WDs of type DA, all in eclipsing binaries, sampled by Parsons et al. (2017) by photometric and spectroscopic observations. Photo- metric observations use, for example, the Ultrafast Triple-beam CCD Camera (ULTRACAM, Dhillon et al. 2007) and its spec- troscopic version ULTRASPEC (Dhillon et al., 2014), currently in use as the high-speed imaging photometer on the Thai Na- tional Telescope (TNT). Spectroscopic observations have been performed by X-shooter (Vernet et al., 2011) in ESO Very Large Telescope (VLT).\\nFig. 2 shows the theoretical mass-radius relation of the de- generate core (1) to provide a lower bound. It effectively pro- vides a greatest lower bound only for M/M 0.5, even though it represents an rather elementary model of degeneracy pres- sure. The data clearly show a temperature sensitivity at rel- atively low mass M/M 0.5, see also Fig. 9 in Parsons et al. (2017). Crucially, the temperatures involved (Fig. 1) are far below the characteristic energy 0.1 1 Me V of the Fermi energies of the electrons in the degenerate core. For the present temperature range and low-mass WDs, any finite temperature corrections to the EoS (de Carvalho et al., 2014; Boshkayev, 2018; Boshkayev et al., 2021), including relativis- tic corrections, will be accordingly small for any generaliza- tions beyond (1), notably in Hamada and Salpeter (1961), Ro- tondo et al. (2011), de Carvalho et al. (2014), Boshkayev (2018)\\nPreprint submitted to New Astronomy\\nAugust 29, 2024\\nFigure 1: (Top panel.) Observed, Eddington and core temperatures of the 26 WDs of type DA in the sample of Parsons et al. (2017). Core temperatures are inferred from the Koester (1976) correlation for an optically thick atmosphere, insulating a C/O or He core at super-Eddington temperatures. (Lower panel.) Same data plotted as a function of mass with trends at slope 1.10 (Observed, blue), 1.18 (Eddington, red) and -0.11 (Core, brown). Eddington temperatures are roughly consistent with the geometric mean of observed and core tempera- tures.\\nand Baiko and Yakovlev (2019); see further Koester and Chan- mugam (1990); Koester (2002). Moreover, at sufficiently high density, the mass-radius relationship of the core becomes uni- versal, independent of the details of the EoS.\\nInstead, the origin of temperature sensitivity in the mass- radius relationship may be found in a non-degenerate atmo- sphere, if present. 
In particular, a finite temperature sensitivity is expected from an atmosphere about a core at super-Eddington temperatures - allowed for a sufficiently massive and optically thick atmospheres (Fontaine et al., 2001).\\nA variety of studies have been conducted to find solutions thereto and gain a deeper understanding of, e.g., an H enve- lope and/or evolution models depending on core composition (Hamada and Salpeter, 1961; Hearn and Mewe, 1976; Verbunt and Rappaport, 1988; Benvenuto and Althaus, 1999; Fontaine\\net al., 2001; Panei et al., 2007; Boshkayev et al., 2015; Par- sons et al., 2017; Kepler et al., 2019; Pei, 2022). Among these studies, various convection theories have been advanced, also to explain cooling times and evolutionary processes of WDs in- cluding the thermal insulation provided by their non-degenerate atmospheres.\\nFor instance, hot WDs in SDSS DR12 have been analyzed with models of cooling and atmospheres (B edard et al., 2020). Non-Local Thermal Equilibrium (non-LTE) atmospheres and synthetic WD spectra reveal a correlation between surface grav- ity and effective temperature.\\nThese modeled approaches, however, are intricate with po- tentially systematic uncertainties in the detailed structure of non-degenerate atmospheres, mediating heat transport by radi- ation and convection (B edard, 2024). For this reason, we set out the present model-agnostic study based on spectroscopic and photometric data, to further our understanding of the mass- radius relationship (Tremblay et al., 2017; Boshkayev et al., 2016).\\nFor the recent sample of 26 WDs of Parsons et al. (2017) (Fig. 2), we set out to derive a temperature scale T0, charac- terizing temperature sensitivity by exploring various power law relations for the mass-radius relations. The resulting T0 may serve as a novel observational constraint in future studies.\\nIn 2, we recall some preliminaries of the Eddington temper- ature and the Koester (1976) correlation of core and observed temperature. In 3, we introduce a temperature normalized ra- dius, to be used in power-law fits to the data based on two cost functions: 2 and residual Standard deviation (STD) defined by minimal least square errors. In 4, introduce three temperature- normalized power-laws and consider their fits to data in the log- log plane to effectively describe the expansion of apparent ra- dius with normalized temperature. In 5, two of the three re- lationships are ranked by probability of significance by Monte Carlo analysis. In 6, we interpret the results and summarize our findings with an outlook for future studies in 7.\\n2. The intermediate Eddington temperature\\nFig. 1 shows, as expected, the observed temperatures T to be strictly below the Eddington temperature, TEdd. After all, the modest observed temperatures on the order of a few eV im- ply the existence of an atmosphere. A reverse inequality would imply rapid evaporation by radiation pressure acting on the op- tically thin outer-most layers of any atmosphere.\\nThe Eddington temperature TEdd is defined by equating the Eddington luminosity LEdd = 3.2 104 (M/M ) L to the lumi- nosity from a sphere of radius R. That is, LEdd = 4 R2 T 4 Edd, where = 5.67 10 5g s 3K 4 is the Stefan-Boltzmann con- stant. 
This defines\\nTEdd = 39.5\\n(cid:32)\\ng 5000 g \\n(cid:33) 1 4\\nby surface gravity g = GM/R2 with a fiducial value for M = 0.5M , R = 2%R , scaled to the solar value g = GM /R2 .\\n(3)\\nFigure 2: Mass-radius plot with temperature (color) in the sample of 26 WDs of type DA (Parsons et al., 2017). NN Ser is the hottest and SDSS J0138- 0016 is the coldest at 63,000K and, respectively, 3,570 K. For reference, it includes the theoretical zero-temperature limit (1) (dashed black line). The vertical grey region highlights three WDs of essentially the same mass with pronounced expansion in apparent radius with temperature. A significant de- parture is seen between observed and the expected radius (1), especially at low mass M/M 0.5. Included are fits to the data by a model-agnostic power- law Relation-2 (dotted colored curves, 3) studied in the present work with isothermals covering 5,000-65,000 K (bottom to top in steps of 10,000 K) in observed temperature. Relation-2 identifies a characteristic temperature scale T0 = 1 2 eV in the observed mass-radius relationship.\\nFor the sample of Parsons et al. (2017), TEdd in (3) is on-average about 40 times the observed surface temperature T (Fig. 1).\\nThe Eddington temperatures shown are roughly consistent TcT of observed and core tempera- with the geometric mean tures. As such, TEdd provides a natural reference to their corre- lations.\\nFollowing a detailed revisit of WD envelopes for the ob- served temperature T at the surface and the central temperature Tc of the core of the WD, Koester (1976) derives a correlation T 4 = 2.05 10 10 (cid:16) c with index = 2.56, where T T is in K. Scaled to TEdd, it takes the form\\ng/cm s 2(cid:17)\\nTc TEdd\\nwith\\n0.02 M 1/4\\n = 35.7 R1/2\\n0.5\\n(cid:32)\\n40 T TEdd\\n(cid:33) \\n = 4/ 1.56. Here, we use the notation R = 2% R0.02 R and M = 0.5 M0.5 M for a fiducial value and taking into account aforementioned mean ratio of TEdd to T .\\nFig. 1 summarizes the distributions of Tc, TEdd and T for the sample of Parsons et al. (2017). These are well below the Fermi level EF of the degenerate electrons supporting the core with characteristic temperature kBTc = z mpc2 0.1 1 MeV, where z = Rg/R is the gravitational redshift of the WD surface according to its gravitational radius Rg = GM/c2, where mp is the proton mass, kB is the Boltzmann constant, and c is the velocity of light.\\n(4)\\n(5)\\nTable 1: Three WDs of essential the same mass showing a clear increase of observed radius with temperature in the sample of Parsons et al. (2017).\\nObject\\nM [M ] R [R ]\\nTeff [K]\\nkBTeff [eV]\\nCSS 0970\\n0.4146\\n0.025\\n30000\\n2.9\\nSDSS J1028+0931\\n0.4146\\n0.018\\n12000\\n1.1\\nSDSS J1210+3347\\n0.4150\\n0.016\\n6000\\n0.52\\n3. Temperature normalized radius\\nFig. 2 shows the data in a mass-radius plot alongside the theoretical zero-temperature limit (1). Highlighted by color is a general trend of increasing radius with temperature. This trend is particularly striking upon considering similar masses. Table 1 lists three WDs of mass M 0.415M clearly showing a pronounced correlation of apparent radius expanding by 50% with temperature increasing to 5 eV.\\nIn absolute terms, relative to the Fermi level of the electrons (Fig. 1), these temperatures are extremely modest leaving the star essentially unperturbed (Hamada and Salpeter, 1961). For the present sample of Parsons et al. 
(2017), this suggests, in- stead, a temperature sensitivity in the H atmosphere of WDs considered previously by modeled approaches in Parsons et al. (2017) more likely so than in the degenerate core. We return to this in 7.\\nHere, we circumvent model assumptions by using generic and model-agnostic power-laws for an effective description by a temperature-normalized radius. In doing so, we derive a char- acteristic temperature scale characterizing temperature sensitiv- ity, blind to the underlying physical origin. To be specific, based on Fig. 1 and Table 1, we consider (cid:33) \\n(cid:32)\\nM M \\nR = R0\\nf (T/T0)\\n(6)\\nwith free parameters ( , T0). Here, f (T/T0) is dimensionless and R0 is a constant fixing the dimension of length.\\nStarting point of our approach are effective mass-radius re- lationships of the form R M f (T/T0) for some power-law index and temperature scale T0. Equivalently, this considers a correlation of mass to the scaled radius\\nR =\\nR f (T/T0)\\n(7)\\nWe apply the temperature scaled radius (7) to fit the data in the form\\nR = R0 (M/M ) .\\n(8)\\nFor the sample of Parsons et al. (2017), we determine best-fit parameters ( , T0) to the mass-radius data of 26 WDs of type DA, all in eclipsing binaries.\\nThe function f (T/T0) in (6), to be discussed further below, will be a power-law comprising the free parameters ( , T0). In a fit to the mass-radius data, these parameters will be considered over a broad range of values\\n0 < T0 < 9 eV, 0 < < 1.\\n(9)\\nThe characteristic temperature scale T0 is limited to T0 < 9 eV. In this energy range, radii obtain accurately, while beyond, accuracy diminishes. We keep < 1, reflecting the assumption that temperature has a secondary impact on the radius. Our best-fit is defined by estimated using ODR (Orthogonal Dis- tance Regression) and, subsequently, the optimal value of and T0 at the minimum STD and 2 of residuals.\\nFollowing standard practice, our power-laws (6-8) are ana-\\nlyzed by fits to linear trends in the log-log plane to\\nlog R = log M + C.\\nIn fits to data by (10), 2 optimizes both the index in scaling by a power-law in mass and the constant C, while minimizing STD optimizes only . In the present analysis quantifying the goodness-of-fit according to residual scatter about a trend line (10), the level shift C along the ordinate - absorbing R0 in (6-8) - is safely ignored and it suffices to determine by minimization of residuals.\\n4. Temperature-normalized mass-radius relations\\nIn the present model-agnostic approach, we explore fits to the data using three temperature-normalized power-laws. To this end, optimize by 2 and STD in our parameter estimation from fits to the sample of Parsons et al. (2017).\\nThe first power-law Relation-1 is\\nI. f (x) = x ,\\nwhere x = T/T0. Here, T0 acts as a constant because of (9). For this reason, is not affected by a choice of T0. We find\\n = 0.954, = 0.195 2 : STD : = 0.951, = 0.190\\nwith the minimum 2 = 0.0183 and, respectively, with residual = 0.03664. The second power-law Relation-2 is\\nII. f (x) = (1 + x) .\\nIn contrast to Relation-1, Relation-2 includes the zero temper- ature limit of the Chandrasekhar mass-radius relation (1). Ac- cordingly, T0 is no longer ignorable and is determined in the optimization process in fitting (13) to the data. We find\\n = 0.969, = 0.389, T0 = 16226 2 : STD : = 0.965, = 0.356, T0 = 13896\\nwith the minimum 2 = 0.0159 and, respectively, with residual = 0.0344. The third power-law Relation-3 is\\nIII. 
f (x) = 1 + x ,\\nsimilar but not identical to Relation-2. Relation-3 also includes the Chandrasekhar limit of zero temperature and T0 is not ig- norable. We find\\n = 0.971, = 0.690, T0 = 66500, 2 : STD : = 0.965, = 0.647, T0 = 65914\\n(10)\\n(11)\\n(12)\\n(13)\\n(14)\\n(15)\\n(16)\\nwith the minimum 2 = 0.0161 and, respectively, a residual = 0.0346.\\nFig. 3 summarizes our three results, each indicated by color: blue, green and red for Relation-1, Relation-2 and Relation- 3, respectively. The curve represents the optimal with each T0 at the minimum STD ( 2). The junction point shows the common minimum STD ( 2). Both Relation-2 and Relation-3 leave rather similar residuals (in and 2) for each of Relation- 2 and Relation-3. To rank these two relations, we proceed as follows.\\n5. Ranking relations by Monte Carlo Analysis\\nIn this section, Relation-2 and Relation-3 are ranked for sig-\\nnificance by Monte Carlo (MC) analysis.\\nMC analysis is a useful method for robust parameter estima- tion and ranking relations in the face of measurement uncertain- ties. Though ODR methods can be used also to estimate param- eters, the results do not necessarily agree, making it difficult to rank Relation-2 and Relation-3 for their relative significance.\\nIn this light, we pursue MC analysis by creating synthetic data by randomly selecting samples of varying radius, mass, and temperature within the measurement confidence intervals,\\nXnew = X + N(0, ).\\nFollowing this procedure, we estimate , , and T0 without con- sidering errors in ODR. Results extended over 5M calculations are used to determine a ranking of Relation-2 and Relation-3 according to STD or 2) (Fig. 4) and infer a probability P of relative significance by counting the total number of times ei- ther one has preferred rank (Table 2). The MC simulation also provides accurate values in the mean of , , and T0 over the total number of iterations.\\nIn our MC analysis, we consider realizations of arrays of 26 3 = 78 entries, comprising mass, radius and tempera- ture of the 26 WDs. As observed quantities, mass, radius and temperature data are independent. An accordingly fair (unbi- ased) draw of realizations extends over random draws from 98 confidence intervals, unconstrained and independently, blind to physical meaning and pre-conceived notions of correlations. In our analysis, the range of allowed values is densely covering by using a very large number of 5M iterations.\\nTable 2 shows the output of our MC analysis. The results indicate that the WD radius is firstly determined by mass more so than temperature. Relation-2 and Relation-3 have similar values, but and T0 are notably distinct with otherwise similar residuals in 2 and STD (Figs. 2-5).Table 2 includes Relation-1, results for which are consistent with the above discussion.\\n6. Interpretation of Results\\nEclipsing binaries allow precise measurements of WD mass and radius (Bours et al., 2016; Parsons et al., 2017). How- ever, as in Fig. 2, the theoretical mass-radius relation (1) pro- vides a lower bound. It is generally not the greatest lower bound by significant departures for hot, low-mass white dwarfs\\n(17)\\nFigure 3: Tracks of STD residuals (left panel) and 2 (right panel) for the Relation-1 (blue), Relation-2 (green) and Relation-3 (red) as a function of the normalized temperature power-law index , following minimization over all T0 in (9). Note that Relation-1 has no explicit dependence on T0. 
Both cost functions produce very similar results for and .\\nTable 2: Monte Carlo analysis on Relations 1-3 with ranking by probability P of having the lowest residual in 2 (left) or STD (right) for the same data (Fig. 4, synthesized over 5M representations).\\nSTD\\nRelation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3\\nR0/R \\n7.6 10 3\\n6.2 10 3\\n5.8 10 3\\n8.4 10 4\\n6.2 10 3\\n5.8 10 3\\n0.951\\n 0.966\\n 0.967\\n0.951\\n 0.965\\n 0.966\\n0.194\\n0.409\\n0.690\\n0.193\\n0.375\\n0.685\\n T0\\n25000\\n19000\\n65000\\nconst\\n15000\\n64000\\n 1%\\n98.7%\\n 1%\\n 1%\\n98.6%\\n 1%\\nFigure 4: Probability of Relation-2 to have a smaller residual in 2 (continuous curve) and STD (dashed curve) than Relation-3 in a MC analysis extending over a large number of synthetic data sets. Both cost functions support the conclusion that Relation-2 is preferred over Relation-3 in providing a model- agnostic fit to the mass-radius data of WDs.\\n(M/M 0.5). The gray region in Fig. 2 (Table 1) is il- lustrative, highlighting a pronounced trend in observed WD radius with temperatures at otherwise very similar masses in M/M < 0.5. Evidently, this trend cannot be explained by the zero-temperature mass-radius relationship.\\nSeveral theoretical calculations (de Carvalho et al., 2014; Boshkayev, 2018, 2019) have been advanced to explain this de- parture. While rotation and density affects the radius, the ra- dius of relatively dense WDs is not significantly influenced by temperature even though such is more so for low-density or ro- tating WDs compared to their high-density counter parts. Our a model-agnostic study of T0 identify a characteristic temper- ature in the expansion of the radius of the photosphere. It re- veals a consistent trend wherein WDs with higher temperatures exhibit relatively larger radius, clearly apparent in the overall trend (Benvenuto and Althaus, 1999; Panei et al., 2007; Par- sons et al., 2017; Joyce et al., 2018; Zenati et al., 2019; Romero et al., 2019).\\nIn a novel model-agnostic revisit of the mass-radius relation- ship, we quantify this temperature dependence by a temperature scale T0 = (1 2) eV in Relation-2. Relation-2 is found to be statistically more significant than Relation-3 based on our MC analysis ( 4). Over 5M iterations, Fig.4 shows Relation-2 to have a lower 2 and lower than Relation-3 at a probability of 98.7%, respectively, 98.6% . In the present approach, we circumvent potential systematic uncertainties otherwise present in model-dependent approaches (Parsons et al. 2017 and refer- ences therein).\\nWe summarize our approach in Fig. 5 (A-C). Panel (A) shows the discrepancy between the theoretical mass-radius re- lation at zero temperature (1) and the observed radius. Panel (B) shows a fit to the mass to the un-normalized radius, re- vealing scatter than clearly exceeds that of measurement uncer- tainty. Panel (C) shows the result of a linear relation between the temperature-normalized radius R and (M/M ) . At rela- tively small residual scatter, this result identifies a temperature scale T0 and a radius primarily determined by mass. Accord- ing to Table 2 and in the notation of (1), we infer a mass-radius relation\\nR = C\\n(cid:32)\\nM M \\n(cid:33) 1/3\\n(18)\\nFigure 5: Mass-radius plots of the data (black dots). Panel (A) highlights the deviation from the theoretical zero-temperature limit (1). Panel (B) highlights the excess scatter in the data following a fit (red line) to the un-normalized radius R M . 
Figure 5: Mass-radius plots of the data (black dots). Panel (A) highlights the deviation from the theoretical zero-temperature limit (1). Panel (B) highlights the excess scatter in the data following a fit (red line) to the un-normalized radius R ∝ M^(-α). Panel (C) shows a fit (green line) to our temperature-normalized radius R̂ ∝ M^(-α) in Relation-2 (13). A small residual scatter (cf. Fig. 3) evidences the effectiveness of our normalization in Relation-2, except for three outliers with relatively large observational uncertainties. All three panels A-C include the theoretical relation (blue dotted line). Bottom panels show the adjusted radius in our normalization produced by minimization of χ² (left) and STD (right). The adjustment by our temperature normalization R̂ = R/f(x), x = T/T0, is about 23% (1.76) and 28% (2.05) in Relation-2 and Relation-3, respectively, in χ² optimization, and 24% (1.82) and 29% (2.07) in Relation-2 and Relation-3, respectively, in STD optimization. This adjustment is most relevant at high temperatures.

Our model-agnostic analysis hereby effectively reveals the presence of a non-degenerate atmosphere, sufficiently massive and opaque, to account for the observed temperature sensitivity in Fig. 2, parameterized by the above-mentioned T0. Indeed, further confirmation can be found in the consistency of the present Parsons et al. (2017) data with the detailed model for H atmospheres of Bédard et al. (2020).

7. Conclusions and Outlook

We summarize the apparent mass-radius relation (18) with temperature-dependent coefficient (19) due to this atmosphere by including Relation-2 as a factor modifying C0 in (19).

A principal outcome of our model-independent study is a temperature scale T0 = 1-2 eV in the temperature sensitivity of the photospheric radius of the WDs of type DA in Parsons et al. (2017), shown in Fig. 5 and summarized in (18-19).

T0 derives from 1.6 eV and 1.3 eV according to χ² and, respectively, STD in fits of Relation-2 to the data. T0 appears to be particularly relevant at low mass, M/M⊙ ≲ 0.5, in the present sample.

While the Parsons et al. (2017) sample of WDs covers a sizeable range in masses with distinct temperature sensitivity below and above M/M⊙ ≃ 0.5, these WDs are nevertheless of relatively high mass-density ρ ≳ ρ0 (Fig. 6). Crucially, densities for the entire sample at hand are above the threshold ρ0 = 10⁵ g cm⁻³ above which the mass-radius relation of the degenerate core is expected to be insensitive to temperature (Boshkayev, 2018). In this sense, the present sample is relatively homogeneous and does not test temperature sensitivity of the core beyond what is expected from (1).

For the above-mentioned low-mass WDs, a non-degenerate atmosphere effectively insulates a core at super-Eddington temperatures (4) from the observed sub-Eddington surface temperatures T, provided the atmosphere is sufficiently massive and opaque. In the following scaling, we shall ignore a potential role of a corona due to magnetic fields (Aznar Cuadrado et al., 2004; Jordan et al., 2007; Ferrario et al., 2020).

Following §2, the Eddington temperature introduces a scale height h_E by virtue of the thermal kinetic energy of the ions, distinct from escape by radiation pressure. For a hydrogen atmosphere,
we have

h_E ≃ k_B T_Edd / (m_p g) = 13 km R_0.02^2 M_0.5^(-1) ≈ 0.3% R_WD.  (20)

By high thermal conductivity, the core is believed to be at an essentially uniform temperature. As a result, the atmosphere assumes a mean temperature of order T_Edd. By (20), this implies a height, a multiple of h_E, of

H ≈ 10% R_WD.  (21)

This expected height (Fig. 6) introduces a radial expansion qualitatively consistent with the data (Fig. 5).

Figure 6: (Top panel.) For the WD sample of Parsons et al. (2017), shown is the expected relative expansion of a non-degenerate atmosphere, sufficiently massive and opaque, surrounding a degenerate core, based on the super-Eddington temperature T_c inferred from the Koester (1976) correlation to the observed temperature T (Fig. 1). The Parsons et al. (2017) sample satisfies T_Edd ≃ √(T_c T), reconciling T_c ≫ T_Edd with T ≪ T_Edd, leaving T on the order of a few eV. (Lower panel.) For the entire Parsons et al. (2017) sample, the mass-densities of the core according to the theoretical zero-temperature relationship (1) are above the threshold 10⁵ g cm⁻³ of Boshkayev (2018), where temperature sensitivity is negligible.

In this light, the relatively modest characteristic temperature T0 = 1-2 eV in (18-19) can be identified with a core temperature higher by a factor of T_c/T ≃ (T_Edd/T)^2 = O(10³), based on Figs. 1-2 and §2.

The heat transport from core to surface in such atmospheres gives rise to a complex mass-radius relationship, here described by (18-19). This appears to be particularly relevant to low-mass WDs, essentially below the mean of the WD mass distribution. Above it, the relatively high-mass WDs appear to follow the theoretical mass-radius relation (1), evidencing the absence of an atmosphere and/or a relatively low-temperature core. The origin of this discrepancy appears to be beyond the present considerations; it might involve a distinct composition and associated formation history. Derived for WDs of type DA, our results may serve as a reference for similar model-independent analyses of WDs of different types.
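As an order-of-magnitude check of the scale-height argument in (20), the sketch below evaluates T_Edd from L_Edd = 4πR²σT_Edd⁴ with L_Edd ≃ 3.2 × 10⁴ (M/M⊙) L⊙ (§2) and h_E = k_B T_Edd/(m_p g) for the fiducial M = 0.5 M⊙, R = 2% R⊙. The script and its CGS constants are illustrative only; it recovers the order of magnitude quoted in (20), with the exact coefficients depending on the adopted fiducials.

```python
import math

# CGS constants
G     = 6.674e-8      # gravitational constant [cm^3 g^-1 s^-2]
k_B   = 1.381e-16     # Boltzmann constant [erg K^-1]
m_p   = 1.673e-24     # proton mass [g]
sigma = 5.67e-5       # Stefan-Boltzmann constant [g s^-3 K^-4]
M_sun = 1.989e33      # [g]
R_sun = 6.957e10      # [cm]
L_sun = 3.828e33      # [erg s^-1]
eV    = 1.1605e4      # 1 eV in K

# Fiducial white dwarf: M = 0.5 M_sun, R = 2% R_sun
M = 0.5 * M_sun
R = 0.02 * R_sun

g = G * M / R**2                                         # surface gravity [cm s^-2]
L_edd = 3.2e4 * (M / M_sun) * L_sun                      # Eddington luminosity (Section 2)
T_edd = (L_edd / (4 * math.pi * R**2 * sigma)) ** 0.25   # Eddington temperature [K]
h_E = k_B * T_edd / (m_p * g)                            # scale height at T_Edd [cm]

print(f"T_Edd ~ {T_edd / eV:.0f} eV, h_E ~ {h_E / 1e5:.1f} km "
      f"({100 * h_E / R:.2f}% of the fiducial radius)")
# -> a few tens of eV and h_E of order 10 km, i.e. a small fraction of a
#    percent of the fiducial R_WD, the order of magnitude appearing in (20).
```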
Acknowledgements

The authors thank the anonymous reviewer for constructive comments which greatly contributed to the clarity of the presentation, and thank M.A. Abchouyeh for stimulating discussions. This work is supported, in part, by NRF No. RS-2024-00334550.

References

Aznar Cuadrado, R., Jordan, S., Napiwotzki, R., Schmid, H.M., Solanki, S.K., Mathys, G., 2004. Discovery of kilogauss magnetic fields in three DA white dwarfs. A&A 423, 1081-1094. doi:10.1051/0004-6361:20040355, arXiv:astro-ph/0405308.

Baiko, D.A., Yakovlev, D.G., 2019. Quantum ion thermodynamics in liquid interiors of white dwarfs. MNRAS 490, 5839-5847. doi:10.1093/mnras/stz3029, arXiv:1910.06771.

Bédard, A., 2024. The spectral evolution of white dwarfs: where do we stand? Ap&SS 369, 43. doi:10.1007/s10509-024-04307-5, arXiv:2405.01268.

Bédard, A., Bergeron, P., Brassard, P., Fontaine, G., 2020. On the Spectral Evolution of Hot White Dwarf Stars. I. A Detailed Model Atmosphere Analysis of Hot White Dwarfs from SDSS DR12. ApJ 901, 93. doi:10.3847/1538-4357/abafbe, arXiv:2008.07469.

Benvenuto, O.G., Althaus, L.G., 1999. Grids of white dwarf evolutionary models with masses from M=0.1 to 1.2 M solar. MNRAS 303, 30-38. doi:10.1046/j.1365-8711.1999.02215.x, arXiv:astro-ph/9811414.

Boshkayev, K., 2018. Equilibrium Configurations of Rotating White Dwarfs at Finite Temperatures. Astronomy Reports 62, 847-852. doi:10.1134/S106377291812017X, arXiv:1807.00332.

Boshkayev, K., 2019. Static and rotating white dwarfs at finite temperatures. arXiv e-prints, arXiv:1909.10899. doi:10.48550/arXiv.1909.10899.

Boshkayev, K., et al., 2021. Static and rotating white dwarfs at finite temperatures. IJMPh, 61. doi:10.26577/ijmph.2021.v12.i2.07, arXiv:1909.10899.

Boshkayev, K., Rueda, J.A., Ruffini, R., Siutsou, I., 2015. General Relativistic and Newtonian White Dwarfs, in: Thirteenth Marcel Grossmann Meeting: On Recent Developments in Theoretical and Experimental General Relativity, Astrophysics and Relativistic Field Theories, pp. 2468-2474. doi:10.1142/9789814623995_0472, arXiv:1503.04171.

Boshkayev, K.A., Rueda, J.A., Zhami, B.A., Kalymova, Z.A., Balgymbekov, G.S., 2016. Equilibrium structure of white dwarfs at finite temperatures, in: International Journal of Modern Physics Conference Series, p. 1660129. doi:10.1142/S2010194516601290, arXiv:1510.02024.

Bours, M.C.P., Marsh, T.R., Parsons, S.G., Dhillon, V.S., Ashley, R.P., Bento, J.P., Breedt, E., Butterley, T., Caceres, C., Chote, P., Copperwheat, C.M., Hardy, L.K., Hermes, J.J., Irawati, P., Kerry, P., Kilkenny, D., Littlefair, S.P., McAllister, M.J., Rattanasoon, S., Sahman, D.I., Vučković, M., Wilson, R.W., 2016. Long-term eclipse timing of white dwarf binaries: an observational hint of a magnetic mechanism at work. MNRAS 460, 3873-3887. doi:10.1093/mnras/stw1203, arXiv:1606.00780.

de Carvalho, S.M., Rotondo, M., Rueda, J.A., Ruffini, R., 2014. Relativistic Feynman-Metropolis-Teller treatment at finite temperatures. Phys. Rev. C 89, 015801. doi:10.1103/PhysRevC.89.015801.

Dhillon, V.S., Marsh, T.R., Atkinson, D.C., Bezawada, N., Bours, M.C.P., Copperwheat, C.M., Gamble, T., Hardy, L.K., Hickman, R.D.H., Irawati, P., Ives, D.J., Kerry, P., Leckngam, A., Littlefair, S.P., McLay, S.A., O'Brien, K., Peacocke, P.T., Poshyachinda, S., Richichi, A., Soonthornthum, B., Vick, A., 2014. ULTRASPEC: a high-speed imaging photometer on the 2.4-m Thai National Telescope. MNRAS 444, 4009-4021. doi:10.1093/mnras/stu1660, arXiv:1408.2733.

Dhillon, V.S., Marsh, T.R., Stevenson, M.J., Atkinson, D.C., Kerry, P., Peacocke, P.T., Vick, A.J.A., Beard, S.M., Ives, D.J., Lunney, D.W., McLay, S.A., Tierney, C.J., Kelly, J., Littlefair, S.P., Nicholson, R., Pashley, R., Harlaftis, E.T., O'Brien, K., 2007. ULTRACAM: an ultrafast, triple-beam CCD camera for high-speed astrophysics. MNRAS 378, 825-840. doi:10.1111/j.1365-2966.2007.11881.x, arXiv:0704.2557.

Ferrario, L., Wickramasinghe, D., Kawka, A., 2020. Magnetic fields in isolated and interacting white dwarfs. Advances in Space Research 66, 1025-1056. doi:10.1016/j.asr.2019.11.012, arXiv:2001.10147.

Fontaine, G., Brassard, P., Bergeron, P., 2001. The Potential of White Dwarf Cosmochronology. PASP 113, 409-435. doi:10.1086/319535.

Gentile Fusillo, N.P., Tremblay, P.E., Cukanovaite, E., Vorontseva, A., Lallement, R., Hollands, M., Gänsicke, B.T., Burdge, K.B., McCleery, J., Jordan, S., 2021. A catalogue of white dwarfs in Gaia EDR3. MNRAS 508, 3877-3896. doi:10.1093/mnras/stab2672, arXiv:2106.07669.

Hamada, T., Salpeter, E.E., 1961. Models for Zero-Temperature Stars. ApJ 134, 683. doi:10.1086/147195.
Han, Z., Podsiadlowski, P., Maxted, P.F.L., Marsh, T.R., Ivanova, N., 2002. The origin of subdwarf B stars - I. The formation channels. MNRAS 336, 449-466. doi:10.1046/j.1365-8711.2002.05752.x, arXiv:astro-ph/0206130.

Hearn, A.G., Mewe, R., 1976. The corona around the white dwarf Sirius B determined from X-ray measurements. A&A 50, 319-321.

Iben, I., Jr., Tutukov, A.V., 1985. On the evolution of close binaries with components of initial mass between 3 M⊙ and 12 M⊙. ApJS 58, 661-710. doi:10.1086/191054.

Istrate, A.G., Marchant, P., Tauris, T.M., Langer, N., Stancliffe, R.J., Grassitelli, L., 2016. Models of low-mass helium white dwarfs including gravitational settling, thermal and chemical diffusion, and rotational mixing. A&A 595, A35. doi:10.1051/0004-6361/201628874, arXiv:1606.04947.

Jordan, S., Aznar Cuadrado, R., Napiwotzki, R., Schmid, H.M., Solanki, S.K., 2007. The fraction of DA white dwarfs with kilo-Gauss magnetic fields. A&A 462, 1097-1101. doi:10.1051/0004-6361:20066163, arXiv:astro-ph/0610875.

Joyce, S.R.G., Barstow, M.A., Casewell, S.L., Burleigh, M.R., Holberg, J.B., Bond, H.E., 2018. Testing the white dwarf mass-radius relation and comparing optical and far-UV spectroscopic results with Gaia DR2, HST, and FUSE. MNRAS 479, 1612-1626. doi:10.1093/mnras/sty1425, arXiv:1806.00061.

Kepler, S.O., Pelisoli, I., Koester, D., Reindl, N., Geier, S., Romero, A.D., Ourique, G., Oliveira, C.d.P., Amaral, L.A., 2019. White dwarf and subdwarf stars in the Sloan Digital Sky Survey Data Release 14. MNRAS 486, 2169-2183. doi:10.1093/mnras/stz960, arXiv:1904.01626.

Koester, D., 1976. Convective Mixing and Accretion in White Dwarfs. A&A 52, 415.

Koester, D., 2002. White dwarfs: Recent developments. A&A Rev. 11, 33-66. doi:10.1007/s001590100015.

Koester, D., Chanmugam, G., 1990. REVIEW: Physics of white dwarf stars. Reports on Progress in Physics 53, 837-915. doi:10.1088/0034-4885/53/7/001.

Koester, D., Schulz, H., Weidemann, V., 1979. Atmospheric parameters and mass distribution of DA white dwarfs. A&A 76, 262-275.

Marsh, T.R., Dhillon, V.S., Duck, S.R., 1995. Low-Mass White Dwarfs Need Friends - Five New Double-Degenerate Close Binary Stars. MNRAS 275, 828. doi:10.1093/mnras/275.3.828.

Nelemans, G., Zwart, S., Verbunt, F., Yungelson, L., 2001. Population synthesis for double white dwarfs. II. Semi-detached systems: AM CVn stars. arXiv preprint astro-ph/0101123.

Panei, J.A., Althaus, L.G., Chen, X., Han, Z., 2007. Full evolution of low-mass white dwarfs with helium and oxygen cores. MNRAS 382, 779-792. doi:10.1111/j.1365-2966.2007.12400.x.

Parsons, S.G., Gänsicke, B.T., Marsh, T.R., Ashley, R.P., Bours, M.C.P., Breedt, E., Burleigh, M.R., Copperwheat, C.M., Dhillon, V.S., Green, M., Hardy, L.K., Hermes, J.J., Irawati, P., Kerry, P., Littlefair, S.P., McAllister, M.J., Rattanasoon, S., Rebassa-Mansergas, A., Sahman, D.I., Schreiber, M.R., 2017. Testing the white dwarf mass-radius relationship with eclipsing binaries. MNRAS 470, 4473-4492. doi:10.1093/mnras/stx1522, arXiv:1706.05016.

Pei, T.H., 2022. The Highly Accurate Relation between the Radius and Mass of the White Dwarf Star from Zero to Finite Temperature. Frontiers in Astronomy and Space Sciences 8, 243. doi:10.3389/fspas.2021.799210.

Ren, J.J., Rebassa-Mansergas, A., Parsons, S.G., Liu, X.W., Luo, A.L., Kong, X., Zhang, H.T., 2018. White dwarf-main sequence binaries from LAMOST: the DR5 catalogue. MNRAS 477, 4641-4654. doi:10.1093/mnras/sty805, arXiv:1803.09523.
Romero, A.D., Kepler, S.O., Joyce, S.R.G., Lauffer, G.R., Córsico, A.H., 2019. The white dwarf mass-radius relation and its dependence on the hydrogen envelope. MNRAS 484, 2711-2724. doi:10.1093/mnras/stz160, arXiv:1901.04644.

Rotondo, M., Rueda, J.A., Ruffini, R., Xue, S.S., 2011. Relativistic Feynman-Metropolis-Teller theory for white dwarfs in general relativity. Phys. Rev. D 84, 084007. doi:10.1103/PhysRevD.84.084007, arXiv:1012.0154.

Strömgren, B., 1939. Book Review: An Introduction to the Study of Stellar Structure, by S. Chandrasekhar. Popular Astronomy 47, 287.

Tremblay, P.E., Gentile-Fusillo, N., Raddi, R., Jordan, S., Besson, C., Gänsicke, B.T., Parsons, S.G., Koester, D., Marsh, T., Bohlin, R., Kalirai, J., Deustua, S., 2017. The Gaia DR1 mass-radius relation for white dwarfs. MNRAS 465, 2849-2861. doi:10.1093/mnras/stw2854, arXiv:1611.00629.

Verbunt, F., Rappaport, S., 1988. Mass Transfer Instabilities Due to Angular Momentum Flows in Close Binaries. ApJ 332, 193. doi:10.1086/166645.

Vernet, J., Dekker, H., D'Odorico, S., Kaper, L., Kjaergaard, P., Hammer, F., Randich, S., Zerbi, F., Groot, P.J., Hjorth, J., Guinouard, I., Navarro, R., Adolfse, T., Albers, P.W., Amans, J.P., Andersen, J.J., Andersen, M.I., Binetruy, P., Bristow, P., Castillo, R., Chemla, F., Christensen, L., Conconi, P., Conzelmann, R., Dam, J., de Caprio, V., de Ugarte Postigo, A., Delabre, B., di Marcantonio, P., Downing, M., Elswijk, E., Finger, G., Fischer, G., Flores, H., François, P., Goldoni, P., Guglielmi, L., Haigron, R., Hanenburg, H., Hendriks, I., Horrobin, M., Horville, D., Jessen, N.C., Kerber, F., Kern, L., Kiekebusch, M., Kleszcz, P., Klougart, J., Kragt, J., Larsen, H.H., Lizon, J.L., Lucuix, C., Mainieri, V., Manuputy, R., Martayan, C., Mason, E., Mazzoleni, R., Michaelsen, N., Modigliani, A., Moehler, S., Møller, P., Norup Sørensen, A., Nørregaard, P., Péroux, C., Patat, F., Pena, E., Pragt, J., Reinero, C., Rigal, F., Riva, M., Roelfsema, R., Royer, F., Sacco, G., Santin, P., Schoenmaker, T., Spano, P., Sweers, E., Ter Horst, R., Tintori, M., Tromp, N., van Dael, P., van der Vliet, H., Venema, L., Vidali, M., Vinther, J., Vola, P., Winters, R., Wistisen, D., Wulterkens, G., Zacchei, A., 2011. X-shooter, the new wide band intermediate resolution spectrograph at the ESO Very Large Telescope. A&A 536, A105. doi:10.1051/0004-6361/201117752, arXiv:1110.1944.

Zenati, Y., Toonen, S., Perets, H.B., 2019. Formation and evolution of hybrid He-CO white dwarfs and their properties. MNRAS 482, 1135-1142. doi:10.1093/mnras/sty2723, arXiv:1803.04444.

[Source: https://arxiv.org/pdf/2408.15291]
C 89, 015801.', chunk_index=32, num_tokens=288, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_e11d9540-0a8d-4079-867e-9637626e3cf8', content='URL: https://link.aps.org/doi/10.1103/PhysRevC.89. 015801, doi:10.1103/PhysRevC.89.015801. de Carvalho, S.M., Rotondo, M., Rueda, J.A., Ruffini, R., 2014. Relativistic Feynman-Metropolis-Teller treatment at finite temperatures. Phys. Rev. C 89, 015801. doi:10.1103/PhysRevC.89.015801. Dhillon, V.S., Marsh, T.R., Atkinson, D.C., Bezawada, N., Bours, M.C.P., Copperwheat, C.M., Gamble, T., Hardy, L.K., Hickman, R.D.H., Irawati, P., Ives, D.J., Kerry, P., Leckngam, A., Littlefair, S.P., McLay, S.A., O Brien, K., Peacocke, P.T., Poshyachinda, S., Richichi, A., Soonthorn- thum, B., Vick, A., 2014. ULTRASPEC: a high-speed imaging photome- ter on the 2.4-m Thai National Telescope.', chunk_index=33, num_tokens=272, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_fe15f42d-f950-47d9-8de6-e952b45f7a8e', content='MNRAS 444, 4009 4021. doi:10.1093/mnras/stu1660, arXiv:1408.2733. Dhillon, V.S., Marsh, T.R., Stevenson, M.J., Atkinson, D.C., Kerry, P., Pea- cocke, P.T., Vick, A.J.A., Beard, S.M., Ives, D.J., Lunney, D.W., McLay, S.A., Tierney, C.J., Kelly, J., Littlefair, S.P., Nicholson, R., Pashley, R., Harlaftis, E.T., O Brien, K., 2007. ULTRACAM: an ultrafast, triple- beam CCD camera for high-speed astrophysics. MNRAS 378, 825 840. doi:10.1111/j.1365-2966.2007.11881.x, arXiv:0704.2557. Ferrario, L., Wickramasinghe, D., Kawka, A., 2020. Magnetic fields in isolated and interacting white dwarfs. Advances in Space Research 66, 1025 1056. 
doi:10.1016/j.asr.2019.11.012, arXiv:2001.10147.', chunk_index=34, num_tokens=281, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_bb06a1e1-83fa-483c-a84e-90076e714124', content='Fontaine, G., Brassard, P., Bergeron, P., 2001. The Potential of White Dwarf Cosmochronology. PASP 113, 409 435. doi:10.1086/319535. Gentile Fusillo, N.P., Tremblay, P.E., Cukanovaite, E., Vorontseva, A., Lalle- ment, R., Hollands, M., G ansicke, B.T., Burdge, K.B., McCleery, J., Jordan, S., 2021. A catalogue of white dwarfs in Gaia EDR3. MNRAS 508, 3877 3896. doi:10.1093/mnras/stab2672, arXiv:2106.07669. Hamada, T., Salpeter, E.E., 1961. Models for Zero-Temperature Stars. ApJ 134, 683. doi:10.1086/147195. Han, Z., Podsiadlowski, P., Maxted, P.F.L., Marsh, T.R., Ivanova, N., The origin of subdwarf B stars - I. The formation channels. doi:10.1046/j.1365-8711.2002.05752.x, 2002.', chunk_index=35, num_tokens=280, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_2ef64b95-a13e-41ed-8e8d-58da7fe990c0', content='MNRAS 336, 449 466. arXiv:astro-ph/0206130. Hearn, A.G., Mewe, R., 1976. The corona around the white dwarf Sirius B determined from X-ray measurements. A&A 50, 319 321. Iben, I., J., Tutukov, A.V., 1985. On the evolution of close binaries with components of initial mass between 3 M and 12 M. ApJS 58, 661 710. doi:10.1086/191054. Istrate, A.G., Marchant, P., Tauris, T.M., Langer, N., Stancliffe, R.J., Grassitelli, L., 2016. Models of low-mass helium white dwarfs including gravitational settling, thermal and chemical diffusion, and rotational mixing. A&A 595, A35. doi:10.1051/0004-6361/201628874, arXiv:1606.04947. Jordan, S., Aznar Cuadrado, R., Napiwotzki, R., Schmid, H.M., Solanki, S.K., 2007. 
The fraction of DA white dwarfs with kilo-Gauss magnetic fields.', chunk_index=36, num_tokens=269, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_f436e108-3525-4f92-aa9d-a66c96c2f70a', content='A&A 462, 1097 1101. doi:10.1051/0004-6361:20066163, arXiv:astro-ph/0610875. Joyce, S.R.G., Barstow, M.A., Casewell, S.L., Burleigh, M.R., Holberg, J.B., Bond, H.E., 2018. Testing the white dwarf mass-radius relation and com- paring optical and far-UV spectroscopic results with Gaia DR2, HST, and FUSE. MNRAS 479, 1612 1626. doi:10.1093/mnras/sty1425, arXiv:1806.00061. Kepler, S.O., Pelisoli, I., Koester, D., Reindl, N., Geier, S., Romero, A.D., Ourique, G., Oliveira, C.d.P., Amaral, L.A., 2019. White dwarf and subd- warf stars in the Sloan Digital Sky Survey Data Release 14. MNRAS 486, 2169 2183. doi:10.1093/mnras/stz960, arXiv:1904.01626. Koester, D., 1976. Convective Mixing and Accretion in White Dwarfs. A&A 52, 415.', chunk_index=37, num_tokens=290, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_b08985d8-d024-41cc-bc5e-e8d9aca53521', content='Koester, D., 2002. White dwarfs: Recent developments. A&A Rev. 11, 33 66. doi:10.1007/s001590100015. Koester, D., Chanmugam, G., 1990. Physics of white dwarf stars. Reports on Progress in Physics 53, 837. Koester, D., Chanmugam, G., 1990. REVIEW: Physics of white dwarf stars. Reports on Progress in Physics 53, 837 915. doi:10.1088/0034-4885/ 53/7/001. Koester, D., Schulz, H., Weidemann, V., 1979. Atmospheric parameters and mass distribution of DA white dwarfs. A&A 76, 262 275. Marsh, T.R., Dhillon, V.S., Duck, S.R., 1995. Low-Mass White Dwarfs Need Friends - Five New Double-Degenerate Close Binary Stars. MNRAS 275, 828. doi:10.1093/mnras/275.3.828. Nelemans, G., Zwart, S., Verbunt, F., Yungelson, L., 2001. Population synthesis for double white dwarfs. ii. 
semi-detached systems:', chunk_index=38, num_tokens=279, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_edcda445-9dae-4aa9-996e-7435b2a5d78c', content='Am cvn stars. arXiv preprint astro-ph/0101123 . Panei, J.A., Althaus, L.G., Chen, X., Han, Z., 2007. Full evolution of low- mass white dwarfs with helium and oxygen cores. MNRAS 382, 779 792. doi:10.1111/j.1365-2966.2007.12400.x. Parsons, S.G., G ansicke, B.T., Marsh, T.R., Ashley, R.P., Bours, M.C.P., Breedt, E., Burleigh, M.R., Copperwheat, C.M., Dhillon, V.S., Green, M., Hardy, L.K., Hermes, J.J., Irawati, P., Kerry, P., Littlefair, S.P., McAllis- ter, M.J., Rattanasoon, S., Rebassa-Mansergas, A., Sahman, D.I., Schreiber, M.R., 2017. Testing the white dwarf mass-radius relationship with eclips- ing binaries. MNRAS 470, 4473 4492. doi:10.1093/mnras/stx1522, arXiv:1706.05016. Pei, T.H., 2022. The Highly Accurate Relation between the Radius and Mass of the White Dwarf Star from Zero to Finite Temperature.', chunk_index=39, num_tokens=297, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_09a865a7-de42-4935-802a-6565cd6539c9', content='Frontiers in As- tronomy and Space Sciences 8, 243. doi:10.3389/fspas.2021.799210. Ren, J.J., Rebassa-Mansergas, A., Parsons, S.G., Liu, X.W., Luo, A.L., Kong, X., Zhang, H.T., 2018. White dwarf-main sequence binaries from LAM- OST: the DR5 catalogue. MNRAS 477, 4641 4654. doi:10.1093/mnras/ sty805, arXiv:1803.09523. Romero, A.D., Kepler, S.O., Joyce, S.R.G., Lauffer, G.R., C orsico, A.H., 2019. The white dwarf mass-radius relation and its dependence on the hydro- gen envelope. MNRAS 484, 2711 2724. doi:10.1093/mnras/stz160, arXiv:1901.04644. Rotondo, M., Rueda, J.A., Ruffini, R., Xue, S.S., 2011. Relativistic Feynman- Metropolis-Teller theory for white dwarfs in general relativity. Phys. Rev. D 84, 084007. 
doi:10.1103/PhysRevD.84.084007, arXiv:1012.0154.', chunk_index=40, num_tokens=299, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_6f5ab1e1-7152-46b1-887b-5324964369e5', content='Str omgren, B., 1939. Book Review: An Introduction to the Study of Stellar Structure, by S. Chandrasekhar. Popular Astronomy 47, 287. Tremblay, P.E., Gentile-Fusillo, N., Raddi, R., Jordan, S., Besson, C., G ansicke, B.T., Parsons, S.G., Koester, D., Marsh, T., Bohlin, R., Kali- rai, J., Deustua, S., 2017. The Gaia DR1 mass-radius relation for white dwarfs. MNRAS 465, 2849 2861. doi:10.1093/mnras/stw2854, arXiv:1611.00629. Verbunt, F., Rappaport, S., 1988. Mass Transfer Instabilities Due to Angular Momentum Flows in Close Binaries. ApJ 332, 193. doi:10.1086/166645.', chunk_index=41, num_tokens=212, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_71f74699-72b3-4d28-a090-ee3f44c11870', content='Vernet, J., Dekker, H., D Odorico, S., Kaper, L., Kjaergaard, P., Hammer, F., Randich, S., Zerbi, F., Groot, P.J., Hjorth, J., Guinouard, I., Navarro, R., Adolfse, T., Albers, P.W., Amans, J.P., Andersen, J.J., Andersen, M.I., Bi- netruy, P., Bristow, P., Castillo, R., Chemla, F., Christensen, L., Conconi, P., Conzelmann, R., Dam, J., de Caprio, V., de Ugarte Postigo, A., Delabre, B., di Marcantonio, P., Downing, M., Elswijk, E., Finger, G., Fischer, G., Flores, H., Franc ois, P., Goldoni, P., Guglielmi, L., Haigron, R., Hanen- burg, H., Hendriks, I., Horrobin, M., Horville, D., Jessen, N.C., Kerber, F., Kern, L., Kiekebusch, M., Kleszcz, P., Klougart, J., Kragt, J., Larsen, H.H., Lizon, J.L., Lucuix, C., Mainieri, V., Manuputy, R., Martayan, C., Mason, E., Mazzoleni, R., Michaelsen, N., Modigliani, A., Moehler, S., M ller, P., Norup S rensen, A., N rregaard, P., P eroux, C., Patat, F., Pena, E., Pragt, J., Reinero, C., Rigal, F., Riva, M., Roelfsema, R., Royer, F., Sacco, G., Santin, P., Schoenmaker, T., Spano, P., Sweers, E., Ter Horst, R., Tintori, M., Tromp, N., van Dael, P., van der Vliet, H., Venema, L., Vidali, M., Vinther, J., Vola, P., Winters, R., Wistisen, D., Wulterkens, G., Zacchei, A., 2011.', chunk_index=42, num_tokens=516, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, 
Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]}), ResponseChunk(id='chunk_62de0c83-51bc-47df-805b-0ca388ecf512', content='X-shooter, the new wide band intermediate reso- lution spectrograph at the ESO Very Large Telescope. A&A 536, A105. doi:10.1051/0004-6361/201117752, arXiv:1110.1944. Zenati, Y., Toonen, S., Perets, H.B., 2019. Formation and evolution of hy- brid He-CO white dwarfs and their properties. MNRAS 482, 1135 1142. doi:10.1093/mnras/sty2723, arXiv:1803.04444.', chunk_index=43, num_tokens=133, metadata={'section_titles': ['Tc TEdd', 'Acknowledgements', 'nificance by Monte Carlo (MC) analysis.', '40 T TEdd', 'with', 'R = C', 'Keywords: white dwarfs, mass-radius relation, Chandrasekhar', 'R f (T/T0)', 'Abstract', 'Preprint submitted to New Astronomy', 'C C0', 'Teff [K]', 'STD', 'R = R0', 'const', 'Object', 'References', 'where', 'aPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea', 'Progress in Physics 53, 837.', 'kBTEdd mpg', 'Boshkayev, K., 2019.', 'Relation-1 Relation-2 Relation-3 Relation-1 Relation-2 Relation-3', 'Xnew = X + N(0, ).', 'R = C0', 'log R = log M + C.'], 'pages': [1]})]))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "# From URL PDF\n", "from aurelio_sdk import ExtractResponse\n", "\n", "# From URL\n", @@ -98,16 +136,48 @@ "response_pdf_url: ExtractResponse = await client.extract_url(\n", " url=url, quality=\"low\", chunk=True, wait=-1, polling_interval=5\n", ")\n", - "\n", "response_pdf_url" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "283" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "response_pdf_url.document.chunks[0].num_tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ExtractResponse(status=, usage=Usage(tokens=10, pages=None, seconds=15), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=), document=ResponseDocument(id='doc_ead3c92e-efa7-4355-bdb6-3fc3a4c56ba5', content=\" I'm a monster! I'm a monster!\", source='https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4', source_type=, num_chunks=1, metadata={}, chunks=[ResponseChunk(id='chunk_c69eea8d-fdbe-41f9-8218-250ec9ee63fe', content=\"I'm a monster! 
I'm a monster!\", chunk_index=1, num_tokens=10, metadata={'start_time': 0, 'end_time': 7})]))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "# From URL Video\n", "from aurelio_sdk import ExtractResponse\n", "\n", "# From URL\n", diff --git a/tests/client_async/test_async_chunk.py b/tests/client_async/test_async_chunk.py new file mode 100644 index 0000000..acfaf2e --- /dev/null +++ b/tests/client_async/test_async_chunk.py @@ -0,0 +1,109 @@ +# Tests for AsyncAurelioClient +import os +from pathlib import Path + +import pydantic_core +import pytest +from dotenv import load_dotenv + +from aurelio_sdk.client_async import AsyncAurelioClient +from aurelio_sdk.exceptions import ApiError +from aurelio_sdk.schema import ChunkingOptions, ChunkResponse + +load_dotenv() + + +# setup async client +@pytest.fixture +def client() -> AsyncAurelioClient: + client = AsyncAurelioClient( + api_key=os.environ["AURELIO_API_KEY"], base_url=os.environ["BASE_URL"] + ) + return client + + +@pytest.fixture +def content(): + file_path = Path(__file__).parent.parent / "data" / "content.txt" + try: + return file_path.read_text(encoding="utf-8") + except FileNotFoundError: + pytest.skip(f"Content file not found at {file_path}") + +@pytest.mark.asyncio +async def test_chunk_invalid(client: AsyncAurelioClient, content: str): + with pytest.raises(pydantic_core._pydantic_core.ValidationError): + chunking_options = ChunkingOptions( + chunker_type="invalid", delimiters=[], max_chunk_length=400 + ) + await client.chunk(content=content, processing_options=chunking_options) + +@pytest.mark.asyncio +async def test_chunk_regex(client: AsyncAurelioClient, content: str): + chunking_options = ChunkingOptions( + chunker_type="regex", delimiters=[], max_chunk_length=400 + ) + + response: ChunkResponse = await client.chunk( + content=content, processing_options=chunking_options + ) + dict_response = response.model_dump() + + assert dict_response["status"] == "completed" + assert 42930 < dict_response["usage"]["tokens"] < 42940 + assert dict_response["usage"]["pages"] is None + assert dict_response["usage"]["seconds"] is None + assert dict_response["processing_options"]["max_chunk_length"] == 400 + assert dict_response["processing_options"]["chunker_type"] == "regex" + assert dict_response["processing_options"]["delimiters"] == [] + assert len(dict_response["document"]["content"]) > 40000 + assert dict_response["document"]["num_chunks"] == 97 + assert dict_response["document"]["chunks"][0]["id"].startswith("chunk_") + assert dict_response["document"]["chunks"][0]["chunk_index"] == 1 + assert dict_response["document"]["chunks"][0]["num_tokens"] > 170 + # max_num_tokens = max(chunk["num_tokens"] for chunk in dict_response["document"]["chunks"]) + # # assert max_num_tokens <= 400 # TODO: This is failing + + +@pytest.mark.asyncio +async def test_chunk_regex_delimiters(client: AsyncAurelioClient): + chunking_options = ChunkingOptions( + chunker_type="regex", delimiters=["|", "/", "@"], max_chunk_length=4 + ) + content = "This is a test | This is a test / This is a @ test." 
+ + response: ChunkResponse = await client.chunk( + content=content, processing_options=chunking_options + ) + dict_response = response.model_dump() + + assert dict_response["processing_options"]["chunker_type"] == "regex" + assert dict_response["processing_options"]["delimiters"] == ["|", "/", "@"] + assert len(dict_response["document"]["content"]) == 52 + assert dict_response["document"]["num_chunks"] == 4 + +@pytest.mark.asyncio +async def test_chunk_semantic(client: AsyncAurelioClient, content: str): + chunking_options = ChunkingOptions( + chunker_type="semantic", window_size=5, max_chunk_length=400 + ) + + response: ChunkResponse = await client.chunk( + content=content[:10000], processing_options=chunking_options + ) + dict_response = response.model_dump() + + assert dict_response["status"] == "completed" + assert 2500 < dict_response["usage"]["tokens"] < 10000 + assert dict_response["usage"]["pages"] is None + assert dict_response["usage"]["seconds"] is None + assert dict_response["processing_options"]["max_chunk_length"] == 400 + assert dict_response["processing_options"]["chunker_type"] == "semantic" + assert dict_response["processing_options"]["window_size"] == 5 + assert len(dict_response["document"]["content"]) == 10000 + assert dict_response["document"]["num_chunks"] == 18 + assert dict_response["document"]["chunks"][0]["id"].startswith("chunk_") + assert dict_response["document"]["chunks"][0]["chunk_index"] == 1 + assert dict_response["document"]["chunks"][0]["num_tokens"] > 170 + max_num_tokens = max(chunk["num_tokens"] for chunk in dict_response["document"]["chunks"]) + assert max_num_tokens <= 400 diff --git a/tests/client_async/test_async_client.py b/tests/client_async/test_async_client.py new file mode 100644 index 0000000..f5890df --- /dev/null +++ b/tests/client_async/test_async_client.py @@ -0,0 +1,53 @@ +# Tests for AsyncAurelioClient +import os + +import pytest +from dotenv import load_dotenv + +from aurelio_sdk.client_async import AsyncAurelioClient +from aurelio_sdk.exceptions import ApiError +from aurelio_sdk.schema import ChunkingOptions + +load_dotenv() + + +# setup async client +@pytest.fixture +def client() -> AsyncAurelioClient: + client = AsyncAurelioClient( + api_key=os.environ["AURELIO_API_KEY"], base_url=os.environ["BASE_URL"] + ) + return client + + +@pytest.fixture +def no_api_key_env(): + """Temporarily remove API key from environment""" + original_key = os.environ.get("AURELIO_API_KEY") + os.environ["AURELIO_API_KEY"] = "" + yield + if original_key is not None: + os.environ["AURELIO_API_KEY"] = original_key + else: + del os.environ["AURELIO_API_KEY"] + + +@pytest.mark.asyncio +async def test_async_client_initialization(): + client = AsyncAurelioClient(api_key="test_api_key") + assert client.api_key == "test_api_key" + assert client.base_url == "https://api.aurelio.ai" + + +def test_async_client_no_api_key(no_api_key_env): + with pytest.raises(ValueError): + AsyncAurelioClient(api_key="", base_url="https://api.aurelio.ai") + + +@pytest.mark.asyncio +async def test_async_client_unauthorized(): + client = AsyncAurelioClient(api_key="test_api_key") + assert client.api_key == "test_api_key" + assert client.base_url == "https://api.aurelio.ai" + with pytest.raises(ApiError): + await client.chunk(content="test", processing_options=ChunkingOptions()) diff --git a/tests/client_async/test_async_extract.py b/tests/client_async/test_async_extract.py new file mode 100644 index 0000000..d0a315c --- /dev/null +++ b/tests/client_async/test_async_extract.py @@ -0,0 
+1,260 @@ +# Tests for AsyncAurelioClient +import os +from pathlib import Path + +import pytest +from dotenv import load_dotenv + +from aurelio_sdk.client_async import AsyncAurelioClient +from aurelio_sdk.schema import ExtractResponse + +load_dotenv() + + +# setup async client +@pytest.fixture +def client() -> AsyncAurelioClient: + client = AsyncAurelioClient( + api_key=os.environ["AURELIO_API_KEY"], base_url=os.environ["BASE_URL"] + ) + return client + + +# @pytest.mark.asyncio +# async def test_extract_pdf_file_wait_5_seconds(client: AsyncAurelioClient): +# file_path = Path(__file__).parent.parent / "data" / "test_pdf.pdf" + +# response: ExtractResponse = await client.extract_file( +# file_path=file_path, quality="low", chunk=True, wait=5, polling_interval=1 +# ) + +# dict_response = response.model_dump() + +# # Status +# assert dict_response["status"] == "pending" + +# # Processing options +# assert dict_response["processing_options"]["quality"] == "low" +# assert dict_response["processing_options"]["chunk"] is True + +# # Document +# assert dict_response["document"]["id"].startswith("doc_") + +# @pytest.mark.asyncio +# async def test_extract_pdf_file_from_file_path(client: AsyncAurelioClient): +# file_path = Path(__file__).parent.parent / "data" / "test_pdf.pdf" + +# response: ExtractResponse = await client.extract_file( +# file_path=file_path, quality="low", chunk=True, wait=-1, polling_interval=2 +# ) + +# dict_response = response.model_dump() + +# # Status +# assert dict_response["status"] == "completed" + +# # Usage +# assert 11690 < dict_response["usage"]["tokens"] < 11700 +# assert dict_response["usage"]["pages"] == 7 +# assert dict_response["usage"]["seconds"] is None + +# # Processing options +# assert dict_response["processing_options"]["quality"] == "low" +# assert dict_response["processing_options"]["chunk"] is True + +# # Document +# assert dict_response["document"]["id"].startswith("doc_") +# assert len(dict_response["document"]["content"]) > 11000 +# assert dict_response["document"]["num_chunks"] == 43 +# assert dict_response["document"]["chunks"][0]["id"].startswith("chunk_") +# assert dict_response["document"]["chunks"][0]["chunk_index"] == 1 +# assert dict_response["document"]["chunks"][0]["num_tokens"] > 170 +# max_num_tokens = max(chunk["num_tokens"] for chunk in dict_response["document"]["chunks"]) +# assert max_num_tokens <= 500 + + +# @pytest.mark.asyncio +# async def test_extract_pdf_file_no_chunks(client: AsyncAurelioClient): +# file_path = Path(__file__).parent.parent / "data" / "test_pdf.pdf" + +# response: ExtractResponse = await client.extract_file( +# file_path=file_path, quality="low", chunk=False, wait=-1, polling_interval=2 +# ) + +# dict_response = response.model_dump() + +# # Status +# assert dict_response["status"] == "completed" + +# # Usage +# # assert dict_response["usage"]["tokens"] is None #TODO: this should be None, if chunk is False +# assert dict_response["usage"]["pages"] == 7 +# assert dict_response["usage"]["seconds"] is None + +# # Processing options +# assert dict_response["processing_options"]["quality"] == "low" +# assert dict_response["processing_options"]["chunk"] is False + +# # Document +# assert dict_response["document"]["num_chunks"] == 0 +# assert dict_response["document"]["chunks"] == [] + + +# @pytest.mark.asyncio +# async def test_extract_pdf_file_from_bytes(client: AsyncAurelioClient): +# file_path = Path(__file__).parent.parent / "data" / "test_pdf.pdf" +# with open(file_path, "rb") as f: +# file_bytes = f.read() + +# 
response: ExtractResponse = await client.extract_file( +# file=file_bytes, quality="low", chunk=False, wait=-1, polling_interval=2 +# ) + +# dict_response = response.model_dump() + +# # Status +# assert dict_response["status"] == "completed" + +# # Usage +# assert dict_response["usage"]["pages"] == 7 +# assert dict_response["usage"]["seconds"] is None + +# # Processing options +# assert dict_response["processing_options"]["quality"] == "low" +# assert dict_response["processing_options"]["chunk"] is False + +# # Document +# assert dict_response["document"]["num_chunks"] == 0 +# assert dict_response["document"]["chunks"] == [] + + +@pytest.mark.asyncio +async def test_extract_video_file_from_file_path(client: AsyncAurelioClient): + file_path = Path(__file__).parent.parent / "data" / "test_video.mp4" + + response: ExtractResponse = await client.extract_file( + file_path=file_path, quality="low", chunk=True, wait=-1, polling_interval=10 + ) + + dict_response = response.model_dump() + + # Status + assert dict_response["status"] == "completed" + + # Usage + assert 830 < dict_response["usage"]["tokens"] < 840 + assert dict_response["usage"]["pages"] is None + assert dict_response["usage"]["seconds"] == 291 + + # Processing options + assert dict_response["processing_options"]["quality"] == "low" + assert dict_response["processing_options"]["chunk"] is True + + # Document + assert dict_response["document"]["id"].startswith("doc_") + assert len(dict_response["document"]["content"]) > 840 + assert dict_response["document"]["num_chunks"] == 3 + assert dict_response["document"]["chunks"][0]["id"].startswith("chunk_") + assert dict_response["document"]["chunks"][0]["chunk_index"] == 1 + assert dict_response["document"]["chunks"][0]["num_tokens"] > 200 + max_num_tokens = max( + chunk["num_tokens"] for chunk in dict_response["document"]["chunks"] + ) + assert max_num_tokens <= 500 + print(dict_response["document"]["metadata"]) + assert dict_response["document"]["metadata"]["start_time"] == 0 + assert dict_response["document"]["metadata"]["end_time"] == 99 + + +@pytest.mark.asyncio +async def test_extract_pdf_file_from_url(client: AsyncAurelioClient): + url = "https://arxiv.org/pdf/2408.15291" + + response: ExtractResponse = await client.extract_url( + url=url, quality="low", chunk=True, wait=-1, polling_interval=5 + ) + + dict_response = response.model_dump() + + # Status + assert dict_response["status"] == "completed" + + # Usage + assert 11870 < dict_response["usage"]["tokens"] < 11890 + assert dict_response["usage"]["pages"] == 8 + assert dict_response["usage"]["seconds"] is None + + # Processing options + assert dict_response["processing_options"]["quality"] == "low" + assert dict_response["processing_options"]["chunk"] is True + + # Document + assert dict_response["document"]["id"].startswith("doc_") + assert len(dict_response["document"]["content"]) > 11000 + assert dict_response["document"]["num_chunks"] == 43 + assert dict_response["document"]["chunks"][0]["id"].startswith("chunk_") + assert dict_response["document"]["chunks"][0]["chunk_index"] == 1 + assert dict_response["document"]["chunks"][0]["num_tokens"] > 270 + max_num_tokens = max( + chunk["num_tokens"] for chunk in dict_response["document"]["chunks"] + ) + assert max_num_tokens <= 500 + + +@pytest.mark.asyncio +async def test_extract_pdf_file_from_url_chunk_false(client: AsyncAurelioClient): + url = "https://arxiv.org/pdf/2408.15291" + + response: ExtractResponse = await client.extract_url( + url=url, quality="low", chunk=False, wait=-1, 
polling_interval=5 + ) + + dict_response = response.model_dump() + + # Status + assert dict_response["status"] == "completed" + + # Usage + assert dict_response["usage"]["tokens"] is None + assert dict_response["usage"]["pages"] == 8 + assert dict_response["usage"]["seconds"] is None + + # Processing options + assert dict_response["processing_options"]["quality"] == "low" + assert dict_response["processing_options"]["chunk"] is False + + # Document + assert dict_response["document"]["id"].startswith("doc_") + assert len(dict_response["document"]["content"]) > 11000 + assert dict_response["document"]["num_chunks"] == 0 + assert dict_response["document"]["chunks"] == [] + + + +@pytest.mark.asyncio +async def test_extract_pdf_file_from_url_wait_5_seconds(client: AsyncAurelioClient): + url = "https://arxiv.org/pdf/2408.15291" + + response: ExtractResponse = await client.extract_url( + url=url, quality="low", chunk=True, wait=5, polling_interval=1 + ) + + dict_response = response.model_dump() + + # Status + assert dict_response["status"] == "pending" + + # Processing options + assert dict_response["processing_options"]["quality"] == "low" + assert dict_response["processing_options"]["chunk"] is True + + # Document + assert dict_response["document"]["id"].startswith("doc_") + + +@pytest.mark.asyncio +async def test_extract_pdf_file_from_bad_url(client: AsyncAurelioClient): + with pytest.raises(ValueError): + await client.extract_url( + url="https://123.com", quality="low", chunk=True, wait=-1 + ) diff --git a/tests/client_async/test_chunk_async.py b/tests/client_async/test_chunk_async.py deleted file mode 100644 index 47fc045..0000000 --- a/tests/client_async/test_chunk_async.py +++ /dev/null @@ -1,328 +0,0 @@ -# Tests for AsyncAurelioClient -import os -from pathlib import Path - -import pytest -from dotenv import load_dotenv - -from aurelio_sdk.client_async import AsyncAurelioClient -from aurelio_sdk.schema import ChunkingOptions, ChunkResponse - -load_dotenv() - - -# setup async client -@pytest.fixture -def client() -> AsyncAurelioClient: - client = AsyncAurelioClient( - api_key=os.environ["AURELIO_API_KEY"], base_url=os.environ["BASE_URL"] - ) - return client - - -@pytest.fixture -def content(): - file_path = Path(__file__).parent.parent / "data" / "content.txt" - try: - return file_path.read_text(encoding="utf-8") - except FileNotFoundError: - pytest.skip(f"Content file not found at {file_path}") - - -# Test successful initialization -@pytest.mark.asyncio -async def test_async_client_initialization(): - client = AsyncAurelioClient(api_key="test_api_key") - assert client.api_key == "test_api_key" - assert client.base_url == "https://api.aurelio.ai" - - -# Test initialization without API key -def test_async_client_no_api_key(): - with pytest.raises(ValueError): - AsyncAurelioClient(api_key="", base_url="https://api.aurelio.ai") - - -# Test chunk method success -@pytest.mark.asyncio -async def test_chunk_method_success(client: AsyncAurelioClient, content: str): - chunking_options = ChunkingOptions( - chunker_type="regex", delimiters=[], max_chunk_length=400 - ) - - response_regex: ChunkResponse = await client.chunk( - content=content, processing_options=chunking_options - ) - - assert response_regex.status == "completed" - - -# # Test chunk method API error -# @pytest.mark.asyncio -# async def test_chunk_method_api_error(): -# async def mock_json(): -# return {"error": "Invalid request"} - -# mock_response = MagicMock() -# mock_response.status = 400 -# mock_response.json = mock_json - -# with 
patch( -# "aiohttp.ClientSession.post", -# return_value=asyncio.coroutine(lambda: mock_response), -# ) as mock_post: -# client = AsyncAurelioClient(api_key="test_api_key") -# with pytest.raises(APIError): -# await client.chunk(content="Test content") - - -# # Test chunk method timeout -# @pytest.mark.asyncio -# async def test_chunk_method_timeout(): -# with patch("aiohttp.ClientSession.post", side_effect=asyncio.TimeoutError): -# client = AsyncAurelioClient(api_key="test_api_key") -# with pytest.raises(APITimeoutError): -# await client.chunk(content="Test content", timeout=1) - - -# # Test chunk method with processing options -# @pytest.mark.asyncio -# async def test_chunk_method_with_options(): -# mock_response_data = { -# "status": "completed", -# "usage": {"tokens": 150}, -# "processing_options": { -# "chunker_type": "semantic", -# "max_chunk_length": 500, -# "window_size": 5, -# "delimiters": [], -# }, -# "document": { -# "id": "doc_id", -# "content": "Test content with options", -# "source": "test_source", -# "source_type": "text_plain", -# "num_chunks": 2, -# "metadata": {}, -# "chunks": [ -# { -# "id": "chunk_id_1", -# "content": "Test chunk 1", -# "chunk_index": 0, -# "num_tokens": 75, -# "metadata": {}, -# }, -# { -# "id": "chunk_id_2", -# "content": "Test chunk 2", -# "chunk_index": 1, -# "num_tokens": 75, -# "metadata": {}, -# }, -# ], -# }, -# } - -# async def mock_json(): -# return mock_response_data - -# mock_response = MagicMock() -# mock_response.status = 200 -# mock_response.json = mock_json - -# with patch( -# "aiohttp.ClientSession.post", -# return_value=asyncio.coroutine(lambda: mock_response), -# ) as mock_post: -# client = AsyncAurelioClient(api_key="test_api_key") -# options = ChunkingOptions( -# chunker_type="semantic", max_chunk_length=500, window_size=5 -# ) -# response = await client.chunk( -# content="Test content with options", processing_options=options -# ) -# assert response.processing_options.chunker_type == "semantic" -# assert response.document.num_chunks == 2 - - -# # Test chunk method handles invalid response -# @pytest.mark.asyncio -# async def test_chunk_method_invalid_response(): -# async def mock_json(): -# return {"invalid": "data"} - -# mock_response = MagicMock() -# mock_response.status = 200 -# mock_response.json = mock_json - -# with patch( -# "aiohttp.ClientSession.post", -# return_value=asyncio.coroutine(lambda: mock_response), -# ) as mock_post: -# client = AsyncAurelioClient(api_key="test_api_key") -# with pytest.raises(KeyError): -# await client.chunk(content="Test content") - - -# # Test chunk method handles network error -# @pytest.mark.asyncio -# async def test_chunk_method_network_error(): -# with patch("aiohttp.ClientSession.post", side_effect=aiohttp.ClientError): -# client = AsyncAurelioClient(api_key="test_api_key") -# with pytest.raises(APIError): -# await client.chunk(content="Test content") - - -# # Test chunk method with large content -# @pytest.mark.asyncio -# async def test_chunk_method_large_content(): -# large_content = "A" * 10000 # Large content string -# mock_response_data = { -# "status": "completed", -# "usage": {"tokens": 5000}, -# "processing_options": { -# "chunker_type": "regex", -# "max_chunk_length": 400, -# "window_size": 1, -# "delimiters": [], -# }, -# "document": { -# "id": "doc_id", -# "content": large_content, -# "source": "test_source", -# "source_type": "text_plain", -# "num_chunks": 25, -# "metadata": {}, -# "chunks": [], -# }, -# } - -# async def mock_json(): -# return mock_response_data - -# 
mock_response = MagicMock() -# mock_response.status = 200 -# mock_response.json = mock_json - -# with patch( -# "aiohttp.ClientSession.post", -# return_value=asyncio.coroutine(lambda: mock_response), -# ) as mock_post: -# client = AsyncAurelioClient(api_key="test_api_key") -# response = await client.chunk(content=large_content) -# assert response.document.num_chunks == 25 - - -# # Test chunk method with invalid processing options -# @pytest.mark.asyncio -# async def test_chunk_method_invalid_options(): -# client = AsyncAurelioClient(api_key="test_api_key") -# options = ChunkingOptions(chunker_type="invalid_type") -# with pytest.raises(APIError): -# await client.chunk(content="Test content", processing_options=options) - - -# # Test chunk method without content -# @pytest.mark.asyncio -# async def test_chunk_method_no_content(): -# client = AsyncAurelioClient(api_key="test_api_key") -# with pytest.raises(TypeError): -# await client.chunk() # Missing required positional argument 'content' - - -# # Test chunk method with null content -# @pytest.mark.asyncio -# async def test_chunk_method_null_content(): -# client = AsyncAurelioClient(api_key="test_api_key") -# with pytest.raises(TypeError): -# await client.chunk(content=None) - - -# # Test chunk method with invalid timeout -# @pytest.mark.asyncio -# async def test_chunk_method_invalid_timeout(): -# client = AsyncAurelioClient(api_key="test_api_key") -# with pytest.raises(ValueError): -# await client.chunk(content="Test content", timeout=-1) - - -# # Test chunk method with extra headers -# @pytest.mark.asyncio -# async def test_chunk_method_extra_headers(): -# mock_response_data = { -# "status": "completed", -# "usage": {}, -# "processing_options": {}, -# "document": { -# "id": "doc_id", -# "content": "Test content", -# "source": "test_source", -# "source_type": "text_plain", -# "num_chunks": 1, -# "metadata": {}, -# "chunks": [], -# }, -# } - -# async def mock_json(): -# return mock_response_data - -# def side_effect(*args, **kwargs): -# headers = kwargs.get("headers") -# assert "Authorization" in headers -# assert headers["Authorization"] == "Bearer test_api_key" -# mock_response = MagicMock() -# mock_response.status = 200 -# mock_response.json = mock_json -# return asyncio.coroutine(lambda: mock_response) - -# with patch("aiohttp.ClientSession.post", side_effect=side_effect) as mock_post: -# client = AsyncAurelioClient(api_key="test_api_key") -# response = await client.chunk(content="Test content") -# assert response.status == "completed" - - -# # Test chunk method with custom base URL -# @pytest.mark.asyncio -# async def test_chunk_method_custom_base_url(): -# custom_url = "https://custom.api.aurelio.ai" -# client = AsyncAurelioClient(api_key="test_api_key", base_url=custom_url) -# assert client.base_url == custom_url - - -# # Test chunk method with empty response -# @pytest.mark.asyncio -# async def test_chunk_method_empty_response(): -# async def mock_json(): -# return {} - -# mock_response = MagicMock() -# mock_response.status = 200 -# mock_response.json = mock_json - -# with patch( -# "aiohttp.ClientSession.post", -# return_value=asyncio.coroutine(lambda: mock_response), -# ) as mock_post: -# client = AsyncAurelioClient(api_key="test_api_key") -# with pytest.raises(KeyError): -# await client.chunk(content="Test content") - - -# # Test chunk method with unauthorized access -# @pytest.mark.asyncio -# async def test_chunk_method_unauthorized(): -# async def mock_json(): -# return {"error": "Unauthorized"} - -# mock_response = MagicMock() 
-# mock_response.status = 401 -# mock_response.json = mock_json - -# with patch( -# "aiohttp.ClientSession.post", -# return_value=asyncio.coroutine(lambda: mock_response), -# ) as mock_post: -# client = AsyncAurelioClient(api_key="invalid_api_key") -# with pytest.raises(APIError): -# await client.chunk(content="Test content") diff --git a/tests/data/test_empty b/tests/data/test_empty new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/test_md.md b/tests/data/test_md.md new file mode 100644 index 0000000..06e2f73 --- /dev/null +++ b/tests/data/test_md.md @@ -0,0 +1 @@ +this is a test MD file \ No newline at end of file diff --git a/tests/data/test_pdf.pdf b/tests/data/test_pdf.pdf new file mode 100644 index 0000000..e799411 Binary files /dev/null and b/tests/data/test_pdf.pdf differ diff --git a/tests/data/test_video.mp4 b/tests/data/test_video.mp4 new file mode 100644 index 0000000..6cbd7d0 Binary files /dev/null and b/tests/data/test_video.mp4 differ
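
For orientation, here is a minimal standalone sketch (not part of the patch) of the async chunking flow that the new tests exercise: constructing AsyncAurelioClient from the same environment variables as the fixtures, passing ChunkingOptions, and catching the renamed ApiError on failure. It assumes AURELIO_API_KEY and BASE_URL are set, exactly as in tests/client_async; everything else mirrors calls that appear in the tests above.

# Usage sketch mirroring the new async tests; assumes AURELIO_API_KEY and
# BASE_URL are set in the environment (as in the test fixtures). Illustrative
# only -- not part of the diff.
import asyncio
import os

from aurelio_sdk.client_async import AsyncAurelioClient
from aurelio_sdk.exceptions import ApiError
from aurelio_sdk.schema import ChunkingOptions, ChunkResponse


async def main() -> None:
    client = AsyncAurelioClient(
        api_key=os.environ["AURELIO_API_KEY"], base_url=os.environ["BASE_URL"]
    )
    # Same options as test_chunk_regex: regex chunker, default delimiters,
    # 400-token chunk cap.
    options = ChunkingOptions(
        chunker_type="regex", delimiters=[], max_chunk_length=400
    )
    try:
        response: ChunkResponse = await client.chunk(
            content="This is a test | This is a test / This is a @ test.",
            processing_options=options,
        )
        # A completed response carries the chunked document, as the tests assert.
        print(response.status, response.document.num_chunks)
    except ApiError as exc:
        # Raised for non-2xx responses, e.g. an invalid API key
        # (see test_async_client_unauthorized).
        print(f"Request failed: {exc}")


if __name__ == "__main__":
    asyncio.run(main())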