未验证 提交 8552bd6a 编写于 作者: L Logan Adams 提交者: GitHub

Merge branch 'master' into loadams/low-cpu-mem-ut

......@@ -22,9 +22,6 @@ import pytest
from _pytest.outcomes import Skipped
from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker
# Worker timeout *after* the first worker has completed.
DEEPSPEED_UNIT_WORKER_TIMEOUT = 120
# Worker timeout for tests that hang
DEEPSPEED_TEST_TIMEOUT = 600
......@@ -114,6 +111,7 @@ class DistributedExec(ABC):
requires_cuda_env = True
reuse_dist_env = False
_pool_cache = {}
exec_timeout = DEEPSPEED_TEST_TIMEOUT
@abstractmethod
def run(self):
......@@ -170,7 +168,7 @@ class DistributedExec(ABC):
skip_msgs_async = pool.starmap_async(self._dist_run, args)
try:
skip_msgs = skip_msgs_async.get(DEEPSPEED_TEST_TIMEOUT)
skip_msgs = skip_msgs_async.get(self.exec_timeout)
except mp.TimeoutError:
# Shortcut to exit pytest in the case of a hanged test. This
# usually means an environment error and the rest of tests will
......
......@@ -550,13 +550,14 @@ class TestAutoTensorParallelism(DistributedTest):
"model_family, model_name",
(
["gpt2", "EleutherAI/gpt-neo-2.7B"],
["gpt2", "EleutherAI/gpt-j-6b"],
#["gpt2", "EleutherAI/gpt-j-6b"], # Causing OOM for this test
["gpt2", "gpt2-xl"],
),
)
@pytest.mark.parametrize("task", ["lambada_standard"])
class TestLMCorrectness(DistributedTest):
world_size = 1
exec_timeout = 1200 # Give these tests longer to complete
def test(self, model_family, model_name, task):
# imports here to avoid import errors when pytest collects tests
......@@ -565,6 +566,21 @@ class TestLMCorrectness(DistributedTest):
import lm_eval.tasks
import lm_eval.evaluator
# The bootstrap_stderr function in lm_eval.metrics uses a
# multiprocessing Pool to increase performance. Since we use a Pool for
# our distributed tests and cannot nest Pools, we must redefine and
# patch this function with a version that does not use Pool.
def no_pool_bootstrap_stderr(f, xs, iters):
    """Serial drop-in replacement for ``lm_eval.metrics.bootstrap_stderr``.

    The upstream implementation fans the bootstrap resampling out over a
    ``multiprocessing.Pool``; since the distributed tests already run inside
    a Pool and Pools cannot be nested, this version performs the same
    chunked resampling in-process and returns the sample standard deviation
    of the bootstrap estimates.
    """
    from lm_eval.metrics import _bootstrap_internal, sample_stddev

    # Resample in chunks of at most 1000 iterations, exactly as the
    # Pool-based original partitions its work (any iters % chunk remainder
    # is intentionally dropped to match upstream behavior).
    per_chunk = min(1000, iters)
    samples = []
    for chunk_idx in range(iters // per_chunk):
        samples.extend(_bootstrap_internal(f, per_chunk)((chunk_idx, xs)))
    return sample_stddev(samples)
lm_eval.metrics.bootstrap_stderr = no_pool_bootstrap_stderr
local_rank = os.getenv("LOCAL_RANK", "0")
device = torch.device(get_accelerator().device_name(local_rank))
dtype = torch.float
......@@ -586,6 +602,7 @@ class TestLMCorrectness(DistributedTest):
get_accelerator().synchronize()
bs_time = time.time() - start
getattr(lm, model_family).to("cpu")
ds_model = deepspeed.init_inference(
getattr(lm, model_family),
mp_size=1,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册