Greenplum / DeepSpeed, commit ad168a69 (unverified)
Authored on Apr 20, 2023 by Michael Wyatt; committed via GitHub on Apr 20, 2023.
Fix for dist not being initialized when constructing main config (#3324)
* move dist init out of Engine
Parent: dd8df20f
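To make the effect of the change concrete, here is a minimal usage sketch (not part of the commit; the model and config values are illustrative, and this would normally be run under the deepspeed launcher). With this fix, a caller no longer has to bring up torch.distributed or call deepspeed.init_distributed() before deepspeed.initialize(), because initialize() now sets up the communication backend itself before it builds the main DeepSpeed config:

import torch
import deepspeed

model = torch.nn.Linear(8, 2)

# deepspeed.initialize() now initializes deepspeed.comm (and thereby
# torch.distributed) before constructing the main config and the engine,
# so no explicit deepspeed.init_distributed() call is needed here.
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config={
        "train_batch_size": 8,
        "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    },
)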
Showing 5 changed files with 11 additions and 81 deletions (+11 -81):
deepspeed/__init__.py          +9   -0
deepspeed/comm/__init__.py     +1   -44
deepspeed/comm/torch.py        +0   -1
deepspeed/comm/utils.py        +0   -15
deepspeed/runtime/engine.py    +1   -21
deepspeed/__init__.py

@@ -15,6 +15,7 @@ from packaging import version as pkg_version
 from . import ops
 from . import module_inject
+from .accelerator import get_accelerator
 from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpeedSchedulerCallable
 from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER
 from .runtime.hybrid_engine import DeepSpeedHybridEngine
@@ -50,6 +51,9 @@ __version_major__, __version_minor__, __version_patch__ = _parse_version(__versi
 __git_hash__ = git_hash
 __git_branch__ = git_branch

+# Set to torch's distributed package or deepspeed.comm based inside DeepSpeedEngine init
+dist = None
+

 def initialize(args=None,
                model: torch.nn.Module = None,
@@ -119,6 +123,11 @@ def initialize(args=None,

     assert model is not None, "deepspeed.initialize requires a model"

+    global dist
+    from deepspeed import comm as dist
+    dist_backend = get_accelerator().communication_backend_name()
+    dist.init_distributed(dist_backend=dist_backend, dist_init_required=dist_init_required)
+
     # Set config using config_params for backwards compat
     if config is None and config_params is not None:
         config = config_params
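The sequence that initialize() now runs before any config or engine object is created can be sketched as follows (simplified from the hunk above, not the verbatim implementation, and with error handling omitted):

from deepspeed.accelerator import get_accelerator
from deepspeed import comm as dist

# Pick the accelerator's preferred communication backend (e.g. "nccl" on
# CUDA devices) and initialize deepspeed.comm up front, so that the main
# DeepSpeedConfig built later can already query rank and world size.
dist_backend = get_accelerator().communication_backend_name()
dist.init_distributed(dist_backend=dist_backend, dist_init_required=None)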
deepspeed/comm/__init__.py

@@ -3,48 +3,5 @@

 # DeepSpeed Team

-import torch
 from .utils import *
-from deepspeed import utils
-
-supported_torch_version = False
-
-# See more details at: https://github.com/pytorch/pytorch/pull/48767
-# The PG API in torch versions lesser than 1.8 are different so it is
-# non-trivial to support both in the same API. We will just use the
-# DS comm. backend in deepspeed/comm/comm.py if torch version if 1.8+.
-if older_torch():
-    # Add custom deepspeed torch comm functions here since we can't import deepspeed.comm
-    # NOTE: We can't call torch.distributed directly here. Current hack is to import functions before calling them.
-    supported_torch_version = False
-    from torch.distributed import *
-
-    def get_world_group():
-        return group.WORLD
-
-    def get_global_rank(group, group_rank):
-        if hasattr(torch.distributed.distributed_c10d, "get_global_rank"):
-            from torch.distributed.distributed_c10d import get_global_rank as _get_global_rank
-        else:
-            from torch.distributed.distributed_c10d import _get_global_rank
-        return _get_global_rank(group, group_rank)
-
-    def allgather_fn(output_tensor, input_tensor, group=None, async_op=False):
-        from torch.distributed import all_gather, get_world_size
-        from torch import chunk
-        output_tensors = list(chunk(output_tensor, get_world_size(group)))
-        return all_gather(output_tensors, input_tensor, group=group, async_op=async_op)
-
-    def reduce_scatter_fn(output_tensor, input_tensor, group=None, async_op=False):
-        from torch.distributed import reduce_scatter, get_world_size
-        from torch import chunk
-        input_tensor_lst = list(chunk(input_tensor, get_world_size(group)))
-        return reduce_scatter(output_tensor, input_tensor_lst, group=group)
-
-    def configure(deepspeed_config=None, enabled=None, prof_all=None, prof_ops=None, verbose=None):
-        utils.logger.warn("Communication logging is not supported in torch versions older than 1.8")
-else:
-    supported_torch_version = True
-    from .comm import *
+from .comm import *
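The removed allgather_fn fallback relied on torch.chunk returning views into a flat output buffer, so that torch.distributed.all_gather writes land directly in it. A standalone sketch of that pattern (the function name is ours; it assumes the default process group is already initialized and that output_tensor is world_size times the size of input_tensor):

import torch
import torch.distributed as dist

def allgather_into_flat(output_tensor, input_tensor, group=None, async_op=False):
    # torch.chunk returns world_size views of the flat output buffer;
    # all_gather then fills each view with the corresponding rank's tensor.
    output_views = list(torch.chunk(output_tensor, dist.get_world_size(group=group)))
    return dist.all_gather(output_views, input_tensor, group=group, async_op=async_op)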
deepspeed/comm/torch.py

@@ -21,7 +21,6 @@ class TorchBackend(Backend):

     def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='torch'):
         super(TorchBackend, self).__init__()
-        self.torch_version_before_18 = older_torch()
         self.all_gather_function = self.get_all_gather_function()
         self.reduce_scatter_function = self.get_reduce_scatter_function()
         self.initialized = True
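With the pre-1.8 flag gone, TorchBackend only has to choose among current torch collectives. A hedged sketch of the kind of capability probing that get_all_gather_function() performs (illustrative only, not DeepSpeed's exact code):

import torch.distributed as dist

def pick_all_gather_function():
    # Prefer the flat-tensor collective when this torch build provides it,
    # otherwise fall back to the older private name or the list-based API.
    if hasattr(dist, "all_gather_into_tensor"):
        return dist.all_gather_into_tensor
    if hasattr(dist, "_all_gather_base"):
        return dist._all_gather_base
    return dist.all_gather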
deepspeed/comm/utils.py

@@ -4,25 +4,10 @@

 # DeepSpeed Team

 import os
-import torch
 import inspect
 from deepspeed.utils import get_caller_func

-
-def older_torch():
-    '''
-        Helper to lookup torch version. For versions less than 1.8, torch.dist
-        used torch.distributed.group.WORLD as the default group argument instead of None.
-        See more details at: https://github.com/pytorch/pytorch/pull/48767
-    '''
-    TORCH_MAJOR = int(torch.__version__.split('.')[0])
-    TORCH_MINOR = int(torch.__version__.split('.')[1])
-    if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
-        return True
-    else:
-        return False
-

 def get_local_rank_from_launcher():

     # DeepSpeed launcher will set it so get from there
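If a minimum-torch-version check is still needed elsewhere, a more robust variant than the removed string splitting is to compare parsed versions with packaging, which deepspeed/__init__.py already imports as pkg_version (the helper name below is hypothetical):

import torch
from packaging import version as pkg_version

def torch_at_least(minimum: str) -> bool:
    # pkg_version.parse gives a full PEP 440 comparison, including pre-release
    # and local suffixes such as "2.0.1+cu118", instead of comparing only the
    # first two integer components of torch.__version__.
    return pkg_version.parse(torch.__version__) >= pkg_version.parse(minimum)

# Example: the compatibility shim removed in this commit targeted torch < 1.8.
print(torch_at_least("1.8"))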
deepspeed/runtime/engine.py

@@ -21,6 +21,7 @@ from typing import Callable, Dict, Union, Iterable

 import deepspeed

+from deepspeed import comm as dist
 from deepspeed.runtime.utils import see_memory_usage, DummyOptim
 from .zero.offload_config import OffloadDeviceEnum
 from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer
@@ -96,9 +97,6 @@ from deepspeed.ops.op_builder import UtilsBuilder
 from deepspeed.runtime.config import DtypeEnum

-# Set to torch's distributed package or deepspeed.comm based inside DeepSpeedEngine init
-dist = None
-
 MEMORY_OPT_ALLREDUCE_SIZE = 500000000

 DeepSpeedOptimizerCallable = \
@@ -232,8 +230,6 @@ class DeepSpeedEngine(Module):
         self.checkpoint_engine = None

-        global dist
-        from deepspeed import comm as dist
         self._is_gradient_accumulation_boundary = None
         self.scale_wrt_gas = None
@@ -243,22 +239,6 @@ class DeepSpeedEngine(Module):
         # needed for zero_to_fp32 weights reconstruction to remap nameless data to state_dict
         self.param_names = {param: name for name, param in model.named_parameters()}

-        from deepspeed.comm import supported_torch_version
-
-        # This supported_torch_version check is for torch1.2 compatibility only
-        if supported_torch_version:
-            dist.init_distributed(dist_backend=self.dist_backend, dist_init_required=dist_init_required)
-        else:
-            if dist_init_required is None:
-                dist_init_required = not dist.is_initialized()
-
-            if dist_init_required is False:
-                assert (
-                    dist.is_initialized() is True
-                ), "Torch distributed not initialized. Please set dist_init_required to True or initialize before calling deepspeed.initialize()"
-            else:
-                if not dist.is_initialized():
-                    dist.init_process_group(backend=self.dist_backend)
-
         self._do_args_sanity_check(args)
         self._configure_with_arguments(args, mpu)
         self._do_sanity_check()
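For context on why the engine-side init was too late: the main DeepSpeed config derives values from the data-parallel world size, so it needs an initialized communication backend before it is constructed. A hedged illustration of that dependency (simplified; the real DeepSpeedConfig logic is more involved):

from deepspeed import comm as dist

def resolve_train_batch_size(micro_batch_per_gpu: int, gradient_accumulation_steps: int) -> int:
    # train_batch_size = micro_batch_per_gpu * gradient_accumulation_steps * world_size.
    # dist.get_world_size() only works once dist has been initialized, which is
    # why this commit moves init_distributed() ahead of config construction.
    return micro_batch_per_gpu * gradient_accumulation_steps * dist.get_world_size()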