diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py
index 56068ce638e912d8f347a74bc18a48b9d1d71f6d..d55e204aeb460944ee946a01cecc7ecb5fac524c 100644
--- a/deepspeed/module_inject/auto_tp.py
+++ b/deepspeed/module_inject/auto_tp.py
@@ -27,13 +27,25 @@ class AutoTP():
         return mlist
 
     def supported(model):
-        unsupported = ['bloom', 'codegen', 'flaubert', 'xlm']
+        unsupported = [
+            'bloom',
+            'codegen',
+            'deberta',
+            'flaubert',
+            'fsmt',
+            'gpt2',
+            'led',
+            'longformer',
+            'xlm',
+            'xlnet'
+        ]
         model = str(model)
         key = re.search(r": (.*?)Model", model)
         if key is None:
             key = re.search(r": (.*?)Stack", model)
         if key is None:
             key = re.match(r"(.*?)Model", model)
+        assert key is not None, "Not able to determine model policy automatically. Please provide policy."
         if key.group(1).lower() in unsupported:
             return False
         return True
@@ -91,4 +103,5 @@ class AutoTP():
                 gem_list = list(set(gem_list))
                 policy_list = AutoTP.update_policy_list(policy_list, module, gem_list)
                 gem_list = []
+        assert len(policy_list), "Not able to determine model policy automatically. Please provide policy."
         return policy_list
diff --git a/docs/_tutorials/automatic-tensor-parallelism.md b/docs/_tutorials/automatic-tensor-parallelism.md
index 89bddb7b96865bb94bc63ed08bcf4ed2600a0a9b..6991d5caf92574c621c2f20758764f76b98df88a 100644
--- a/docs/_tutorials/automatic-tensor-parallelism.md
+++ b/docs/_tutorials/automatic-tensor-parallelism.md
@@ -88,6 +88,7 @@ deepspeed --num_gpus DeepSpeedExamples/inference/huggingface/text-gen
 The following results were collected using V100 SXM2 32GB GPUs.
 
 ### Max New Tokens = 50
+
 | Test | Memory Allocated per GPU | Max Batch Size | Max Throughput per GPU |
 | ---------- | -------------------------- | ---------------- | ------------------------ |
 | No TP | 23.94 GB | 64 | 18.84 TFlops |
@@ -95,6 +96,7 @@ The following results were collected using V100 SXM2 32GB GPUs.
 | 4 GPU TP | 6.36 GB | 664 | 27.63 TFlops |
 
 ### Max New Tokens = 1024
+
 | Test | Memory Allocated per GPU | Max Batch Size | Max Throughput per GPU |
 | ---------- | -------------------------- | ---------------- | ------------------------ |
 | No TP | 23.94 GB | 2 | 1.65 TFlops |
@@ -113,7 +115,6 @@ The following model families have been successfully tested with automatic tensor
 - electra
 - ernie
 - esm
-- gpt2
 - gpt-j
 - gpt-neo
 - gpt-neox
@@ -146,6 +147,7 @@ The following models are not currently supported with automatic tensor paralleli
 - deberta
 - flaubert
 - fsmt
+- gpt2
 - led
 - longformer
 - xlm