# Sample configuration file for Aphrodite Engine
# You can launch the engine using a provided config file by running
# `aphrodite yaml config.yaml` in the CLI
# You can run `aphrodite run -h` to see the full list of options
# that you can pass to the engine.
# Uncomment and modify the following lines to configure the engine
# The basic options. You will usually need to specify these
basic_args:
# Your model name. Can be a local path or huggingface model ID
- model:
# If you want a custom model name for the API, specify it here
- served_model_name:
# Whether or not to launch the Kobold API server. Used for hosting
# on Kobold Horde. Takes a boolean value (true/false)
- launch_kobold_api:
# The maximum sequence length/context window for the model
# You can leave this blank to use the default value (recommended)
- max_model_len:
# The tensor parallelism degree. Set this to the number of GPUs you have.
# Keep in mind that for **quantized** models, this will typically only work
# with values of 1, 2, 4, or 8.
- tensor_parallel_size:
# The pipeline parallelism degree. This is similar to tensor parallel,
# but splits the layers across GPUs rather than the tensors. Only use this
# if you're doing multi-node, or need 3, 5, 6, 7 GPUs for quantized models.
- pipeline_parallel_size:
# The data type to use for KV cache. You can set it to 'fp8' to reduce
# memory usage for large contexts.
- kv_cache_dtype:
# Enable chunking the prefill tokens. This greatly reduces memory usage
# at high contexts, but it is mutually exclusive with kv_cache_dtype=fp8.
# Takes a boolean value (true/false)
- enable_chunked_prefill:
# By default, Aphrodite Engine reserves 90% of VRAM for every GPU it's using.
# Pass a value between 0 and 1 (e.g. 0.95 for 95%) to increase or decrease this.
- gpu_memory_utilization:
# If your model doesn't fit on the GPU, use this. It takes values in GiB.
# e.g., if you pass `10`, it'll virtually add 10 GiB of VRAM to your GPU.
# Not recommended because CPU offloading is generally slow.
- cpu_offload_gb:
# This is essentially the maximum batch size. It's set to `256` by default.
# You can lower this to use less memory, though it doesn't change memory
# usage much unless `enforce_eager` is enabled.
- max_num_seqs:
# Whether to enforce eager execution (i.e. disable CUDA graphs). By default,
# CUDA graphs are disabled. Pass `false` here to enable them; leave blank or
# pass `true` to keep them disabled.
- enforce_eager:
# The load format to use. You can usually leave this blank.
# If you want to use bitsandbytes on-the-fly quantization,
# pass `bitsandbytes`, along with `quantization=bitsandbytes`
# in the category below.
- load_format:
# Whether or not to enable prefix caching. This will cache
# previous prompts so that they're not recomputed. Helps
# with large prompts.
- enable_prefix_caching:
# Whether or not to trust remote code in the repository. Needed
# for some models that have custom code.
- trust_remote_code:
# The download directory if the `model` is a Hugging Face ID.
- download_dir:
# The data type to use for the model. Can be `auto`, `float16`, `bfloat16`,
# `float32`. Defaults to `auto`, which will use fp16 for fp32 and fp16 models,
# and bf16 for bf16 models.
- dtype:
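# Example: a minimal sketch of this section with hypothetical values.
# The model ID, GPU count, and memory fraction below are placeholders,
# not recommendations; fill in the entries above instead of copying
# this block verbatim.
# basic_args:
#   - model: mistralai/Mistral-7B-Instruct-v0.2  # hypothetical HF model ID
#   - tensor_parallel_size: 2                    # e.g. two GPUs
#   - gpu_memory_utilization: 0.95               # reserve 95% of VRAM
#   - max_model_len: 8192                        # cap the context window
#   - enable_prefix_caching: true                # reuse KV cache for repeated prompts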
# Quantization options.
quantization_args:
# The quantization type to use. You don't usually need to pass this,
# as the engine will figure out the quant from the model itself.
# You may need to use this if you want to perform online quantization,
# i.e., quantizing a 16-bit model on-the-fly.
# To use FP8 (only supported by Ampere and newer GPUs), pass `fp8`.
# To use bitsandbytes, pass `bitsandbytes`.
- quantization:
# Path to the JSON file containing the KV cache scaling factors.
# This should generally be supplied when KV cache dtype is FP8.
# Otherwise, KV cache scaling factors default to 1.0, which
# may cause accuracy issues. FP8_E5M2 (without scaling) is
# only supported on CUDA versions greater than 11.8. On ROCm,
# FP8_E4M3 is used instead.
# For most use cases, you can leave this blank. If you want to
# generate scales for your model, look at the examples/fp8 directory.
- quantization_param_path:
# The number of floating point bits to use for deepspeed_fp
# on-the-fly quantization. Only pass this if you've set
# quantization to `deepspeedfp`. Takes 4, 6, 8, or 12.
- deepspeed_fp_bits:
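# Example: a sketch of on-the-fly bitsandbytes quantization, which pairs
# `load_format: bitsandbytes` in basic_args with `quantization: bitsandbytes`
# here (per the notes above). Online FP8 would instead set `quantization: fp8`.
# basic_args:
#   - load_format: bitsandbytes
# quantization_args:
#   - quantization: bitsandbytes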
# The API-specific options. These are decoupled from the engine.
api_args:
# The API key to use for the server. Leave blank to disable API key.
- api_keys:
# The local path or http address to the chat template to use.
# This will override the model's existing chat template, if
# it has one.
- chat_template:
# When max_logprobs is specified, represents single tokens as
# strings of the form `token_ids:{token_id}` so that tokens
# that are not JSON-encodable can be identified.
- return_tokens_as_token_ids:
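# Example: a sketch of the API options with placeholder values; the key
# and template path below are hypothetical.
# api_args:
#   - api_keys: sk-example-key               # placeholder secret, choose your own
#   - chat_template: /path/to/chatml.jinja   # hypothetical local template file
#   - return_tokens_as_token_ids: true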
# These are the options for speculative decoding. Speculative
# decoding speeds up inference by loading a smaller draft model
# to propose tokens, which your main model then only verifies.
# The outputs will match 1:1 with your main model.
# We currently support the following speculative decoding algorithms:
# Draft Model, Ngram Prompt Lookup, MLPSpeculator, and Medusa.
speculative_args:
# Use the V2 block manager. Mandatory for speculative decoding.
# Takes a boolean value (true/false)
- use_v2_block_manager:
# The speculative model to use. Can take either a Hugging Face ID
# or a local path. You can also pass "[ngram]" to use ngram prompt
# lookup decoding without needing a draft model.
- speculative_model:
# The number of tokens for the speculative model to predict.
# Spec decoding can generate multiple tokens in a single forward
# pass to speed up inference. Don't set this too high; a good
# value is between 3 and 10, depending on model size.
- num_speculative_tokens:
# The tensor parallel size to use for the speculative model.
# Usually, you want this set to 1.
- speculative_draft_tensor_parallel_size:
# The maximum window size for ngram prompt lookup
# This needs to be set if you're using ngram prompt lookup
- ngram_prompt_lookup_max:
# The minimum window size for ngram prompt lookup
- ngram_prompt_lookup_min:
# Disable speculative decoding if the number of queued
# requests is larger than this value. This is useful
# to prevent speculative decoding from using too much
# compute.
- speculative_disable_by_batch_size:
# The acceptance method to use for speculative decoding.
# Can be either `rejection_sampler` or `typical_acceptance_sampler`.
# The default is `rejection_sampler`.
# Rejection sampler does not allow changing the acceptance rate
# of draft tokens. More accurate but slower.
# Typical acceptance sampler allows changing the acceptance rate
# of draft tokens. Less accurate but faster.
- spec_decoding_acceptance_method:
# The lower bound threshold for the posterior probability
# of a token to be accepted. Only set this if you're using
# the typical acceptance sampler. Defaults to 0.09.
- typical_acceptance_sampler_posterior_threshold:
# A scaling factor for the entropy-based threshold for token
# acceptance in the typical acceptance sampler. Only set this
# if you're using the typical acceptance sampler. Defaults to
# sqrt of typical_acceptance_sampler_posterior_threshold, i.e. 0.3.
- typical_acceptance_sampler_posterior_alpha:
# Whether to disable logprobs during speculative decoding.
# If True, token log probabilities are not returned. If False,
# log probabilities are returned according to the settings
# in SamplingParams. Defaults to True.
# Setting this to True (i.e. disabling logprobs) speeds up
# inference during speculative decoding by skipping log
# probability calculation in proposal and target sampling.
- disable_logprobs_during_spec_decoding:
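# Example: a sketch of draft-model speculative decoding; the draft model
# name is a hypothetical placeholder and the token count follows the
# 3-10 guidance above.
# speculative_args:
#   - use_v2_block_manager: true     # required for speculative decoding
#   - speculative_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0  # hypothetical draft
#   - num_speculative_tokens: 5
#   - speculative_draft_tensor_parallel_size: 1
# Or, with ngram prompt lookup instead of a draft model:
# speculative_args:
#   - use_v2_block_manager: true
#   - speculative_model: "[ngram]"
#   - num_speculative_tokens: 5
#   - ngram_prompt_lookup_max: 4     # illustrative window size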
# The config options for LoRA adapters.
# Each adapter is treated as a separate model in the API server,
# and your requests will need to be sent to the specific model.
lora_args:
# Whether or not to enable handling LoRA adapters.
# Takes a boolean value (true/false)
- enable_lora:
# The LoRA adapters to use for the API server.
# You can specify multiple adapters here.
- lora_modules:
# Change the name of the adapter to something more descriptive
# e.g. ` - my_sql_lora: /path/to/my_sql_lora`
- lora1:
- lora2:
# The maximum number of LoRA adapters in a single batch.
- max_loras:
# The maximum rank of the LoRA adapters. We currently support
# up to 64.
- max_lora_rank:
# The maximum size of extra vocabulary that can be present
# in a LoRA adapter (added to the base model vocab)
- lora_extra_vocab_size:
# The data type for the LoRA adapter.
# Can take "auto", "float16", "bfloat16", and "float32"
- lora_dtype:
# The maximum number of LoRA adapters to store in CPU memory.
# This number must be greater than or equal to max_num_seqs.
# Defaults to max_num_seqs.
- max_cpu_loras:
# Specify multiple scaling factors (which can be different from base
# model scaling factor) to allow for multiple LoRA adapters trained
# with those scaling factors to be used at the same time.
# If not specified, only adapters trained with the base model scaling
# factor are allowed.
- long_lora_scaling_factors:
# By default, only half of the LoRA computation is sharded with tensor
# parallelism. Enabling this will use the fully sharded layers. At high
# sequence length, max rank, or tensor parallel size, this is likely faster.
- fully_sharded_loras:
# The name or path of the QLoRA adapter to use.
- qlora_adapter_name_or_path:
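# Example: a sketch of serving two hypothetical LoRA adapters; the names
# and paths are placeholders. Each adapter is exposed as its own model
# name in the API.
# lora_args:
#   - enable_lora: true
#   - lora_modules:
#     - my_sql_lora: /path/to/my_sql_lora    # hypothetical adapter
#     - my_chat_lora: /path/to/my_chat_lora  # hypothetical adapter
#   - max_loras: 2
#   - max_lora_rank: 64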
# The config options for the Soft Prompt adapters.
# Soft prompts are a way to tune prompts for a specific task
# and load them at a request-level.
soft_prompt_args:
# Whether or not to enable handling Soft Prompt adapters.
# Takes a boolean value (true/false)
- enable_prompt_adapter:
# The Soft Prompt adapters to use for the API server.
# You can specify multiple adapters here.
- prompt_adapters:
# Change the name of the adapter to something more descriptive
# e.g. ` - my_sql_prompt: /path/to/my_sql_prompt`
- prompt1:
- prompt2:
# The maximum number of Soft Prompt adapters in a single batch.
- max_prompt_adapters:
# The maximum number of PromptAdapter tokens.
- max_prompt_adapter_token:
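# Example: a sketch of loading one hypothetical soft prompt adapter;
# the name, path, and token budget are placeholders.
# soft_prompt_args:
#   - enable_prompt_adapter: true
#   - prompt_adapters:
#     - my_sql_prompt: /path/to/my_sql_prompt  # hypothetical adapter
#   - max_prompt_adapters: 1
#   - max_prompt_adapter_token: 512            # illustrative token budget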
# These are advanced options. You usually don't need to modify these.
advanced_args:
# The backend to use for distributed inference. Can be either `ray`
# or `mp` (multiprocessing). Defaults to `mp` for single-node,
# `ray` for multi-node.
# Note that specifying a custom backend by passing a custom class
# is intended for expert use only. The API may change without notice.
- distributed_executor_backend:
# The tokenizer to use. Defaults to the model's tokenizer.
- tokenizer:
# The model revision to use if pulling from HF. Defaults to main.
- revision:
# The revision for the remote code in the model repository.
- code_revision:
# The revision for the tokenizer.
- tokenizer_revision:
# The maximum number of tokens to be captured by CUDA graphs.
# This is set to 8192 by default. If your prompt exceeds this
# threshold, it'll fall back to eager execution.
- max_seq_len_to_capture:
# RoPE scaling config in JSON format.
# For example, `{"type": "dynamic", "factor": 2.0}`
- rope_scaling:
# The RoPE theta value. Use with `rope_scaling`. In some cases,
# changing the RoPE theta improves performance of the scaled
# model.
- rope_theta:
# Extra config for the model loader.
# This will be passed to the model loader corresponding
# to the chosen load_format. This should be a JSON string that
# will be parsed into a dictionary.
- model_loader_extra_config:
# Whether to skip tokenizer and detokenizer initialization.
- skip_tokenizer_init:
# The size of the tokenizer pool to use for asynchronous tokenization.
# If 0, synchronous tokenization will be used.
- tokenizer_pool_size:
# The type of tokenizer pool to use for asynchronous tokenization.
# Ignored if tokenizer_pool_size is 0.
# Note that specifying a tokenizer pool by passing a custom class
# is intended for expert use only. The API may change without notice.
- tokenizer_pool_type:
# The extra config for tokenizer pool. This should be a JSON string
# that will be parsed into a dictionary. Ignored if tokenizer_pool_size
# is 0.
- tokenizer_pool_extra_config:
# The maximum number of log probabilities to return in the API. Defaults to 10.
- max_logprobs:
# The device to use for model execution. You usually don't
# need to modify this.
# We support `auto`, `cuda`, `neuron`, `cpu`, `openvino`, `tpu`, and `xpu`.
- device:
# The pattern(s) to ignore when loading the model.
# Defaults to `original/**/*` to avoid repeated loading
# of llama's checkpoints.
- ignore_patterns:
# If specified, use nsight to profile ray workers.
- ray_workers_use_nsight:
# If specified, disable the custom all-reduce kernels.
# They're enabled by default for GPUs with P2P support.
- disable_custom_all_reduce:
# The preemption mode to use for the scheduler. If `recompute`,
# the engine performs preemption by block recomputation. If `swap`,
# the engine performs preemption by block swapping.
- preemption_mode:
# If specified, ignore GPU profiling result and use this
# number of GPU blocks. Only used for testing.
- num_gpu_blocks_override:
# The CPU swap space size (GiB) per GPU. Not related to CPU offloading.
- swap_space:
# Whether to disable sliding window.
- disable_sliding_window:
# The token block size. Takes values of 8, 16, or 32.
- block_size:
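# Example: a sketch of a few advanced options with illustrative values;
# the RoPE settings reuse the JSON example above and are placeholders,
# not tuned recommendations.
# advanced_args:
#   - rope_scaling: {"type": "dynamic", "factor": 2.0}
#   - max_seq_len_to_capture: 8192   # the documented default
#   - swap_space: 4                  # 4 GiB of CPU swap per GPU
#   - block_size: 16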