未验证 提交 171316fc 编写于 作者: J Jeff Rasley 提交者: GitHub

launcher save pid + require manual triton install for sparse-attn (#1727)

上级 df724e71
......@@ -29,6 +29,7 @@ EXPORT_ENVS = ["NCCL", "PYTHON", "MV2", "UCX"]
DEEPSPEED_ENVIRONMENT_NAME = ".deepspeed_env"
DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
PDSH_MAX_FAN_OUT = 1024
PID_FILE_BASEPATH = "/tmp"
def parse_args(args=None):
......@@ -126,6 +127,13 @@ def parse_args(args=None):
help="Force multi-node launcher mode, helps in cases where user "
"wants to launch on single remote node.")
parser.add_argument(
"--save_pid",
action="store_true",
help="Save file containing launcher process id (pid) at /tmp/<main-pid>.deepspeed, "
"where <main-pid> is the pid of the first process that invoked `deepspeed`. "
"Useful when launching deepspeed processes programmatically.")
parser.add_argument(
"--autotuning",
default="",
......@@ -428,8 +436,22 @@ def main(args=None):
logger.info(f"cmd = {' '.join(cmd)}")
result = subprocess.Popen(cmd, env=env)
pid_file = None
if args.save_pid:
main_pid = os.getpid()
launcher_pid = result.pid
pid_file = os.path.join(PID_FILE_BASEPATH, f"{main_pid}.deepspeed")
with open(pid_file, 'w') as fd:
fd.write(f"{launcher_pid}")
result.wait()
if args.save_pid and pid_file is not None:
# clean-up saved pid file
if os.path.isfile(pid_file):
os.remove(pid_file)
# In case of failure must propagate the error-condition back to the caller (usually shell). The
# actual error and traceback should have been printed in the subprocess, so in order to avoid
# unnecessary noise we just quietly exit here with the same code as the subprocess
......
......@@ -3,6 +3,7 @@ Copyright 2020 The Microsoft DeepSpeed Team
"""
import warnings
from .builder import OpBuilder
from packaging import version as pkg_version
class SparseAttnBuilder(OpBuilder):
......@@ -52,4 +53,20 @@ class SparseAttnBuilder(OpBuilder):
f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}'
)
try:
import triton
except ImportError:
# auto-install of triton is broken on some systems, reverting to manual install for now
# see this issue: https://github.com/microsoft/DeepSpeed/issues/1710
self.warning(
f"please install triton==1.0.0 if you want to use sparse attention")
return False
installed_triton = pkg_version.parse(triton.__version__)
if installed_triton != pkg_version.parse("1.0.0"):
self.warning(
f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible"
)
return False
return super().is_compatible(verbose) and torch_compatible and cuda_compatible
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册