-
-
Notifications
You must be signed in to change notification settings - Fork 5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Bugfix] Add custom Triton cache manager to resolve MoE MP issue #6140
Changes from 9 commits
6a15062
0d54387
b803165
d3ef0d8
eb5c892
81eef8a
b040645
4dd9367
889d6dd
3307522
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from vllm.triton_utils.custom_cache_manager import ( | ||
maybe_set_triton_cache_manager) | ||
|
||
# Public API of the package: only the cache-manager setup hook is exported.
__all__ = ["maybe_set_triton_cache_manager"]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import os | ||
|
||
from triton.runtime.cache import (FileCacheManager, default_cache_dir, | ||
default_dump_dir, default_override_dir) | ||
|
||
from vllm.logger import init_logger | ||
|
||
logger = init_logger(__name__) | ||
|
||
|
||
def maybe_set_triton_cache_manager() -> None:
    """Select the custom Triton cache manager unless one is already chosen.

    If ``TRITON_CACHE_MANAGER`` is present in the environment it is left
    untouched. Otherwise it is set to the ``module:Class`` path of
    ``CustomCacheManager`` and the choice is logged.
    """
    # Respect any cache manager the user has configured explicitly.
    if "TRITON_CACHE_MANAGER" in os.environ:
        return

    manager = "vllm.triton_utils.custom_cache_manager:CustomCacheManager"
    logger.info("Setting Triton cache manager to: %s", manager)
    os.environ["TRITON_CACHE_MANAGER"] = manager
|
||
|
||
class CustomCacheManager(FileCacheManager): | ||
"""Re-implements Triton's cache manager, ensuring that a | ||
unique cache directory is created for each process. This is | ||
needed to avoid collisions when running with tp>1 and | ||
using multi-processing as the distributed backend. | ||
Comment on lines
+22
to
+25
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If Triton 3.0.0 could solve this problem, it would be better to note here that this custom cache manager can be removed when we upgrade Triton. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The fix for the issue is not yet in v3.0.0, but I guess it will be in whatever version comes after that (see my summary here). I will add a comment to that end. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. |
||
""" | ||
|
||
def __init__(self, key, override=False, dump=False):
    """Pick the cache directory for ``key`` based on the requested mode.

    The parent initializer is intentionally not invoked; every attribute
    is assigned here.

    * ``dump``: uses ``default_dump_dir()/<key>``, creates it and sets
      ``lock_path``.
    * ``override`` (only when ``dump`` is false): uses
      ``default_override_dir()/<key>``; nothing is created and
      ``lock_path`` stays ``None``.
    * otherwise: takes the stripped ``TRITON_CACHE_DIR`` value, or
      ``default_cache_dir()`` when that is empty, appends ``_<pid>`` so
      each process gets its own directory, then uses ``<dir>/<key>``,
      creates it and sets ``lock_path``.

    Raises:
        RuntimeError: if no base cache directory could be determined.
    """
    self.key = key
    self.lock_path = None

    if dump:
        self.cache_dir = os.path.join(default_dump_dir(), self.key)
    elif override:
        # Override directories are neither created nor locked here.
        self.cache_dir = os.path.join(default_override_dir(), self.key)
        return
    else:
        self.cache_dir = (os.getenv("TRITON_CACHE_DIR", "").strip()
                          or default_cache_dir())
        if not self.cache_dir:
            raise RuntimeError("Could not create or locate cache dir")
        # The PID suffix keeps concurrent worker processes from sharing
        # (and colliding on) one cache directory.
        per_process_dir = f"{self.cache_dir}_{os.getpid()}"
        self.cache_dir = os.path.join(per_process_dir, self.key)

    # Shared tail for the dump and default modes.
    self.lock_path = os.path.join(self.cache_dir, "lock")
    os.makedirs(self.cache_dir, exist_ok=True)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you document why we need this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added some docstrings