Unverified commit f51c43e2 authored by XIE Xuan, committed by GitHub

Merge pull request #218 from Oneflow-Inc/dev_gpt_modify_parameter

change parallel_distribution to nd_sbp
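For reference, a minimal sketch of the keyword rename this PR applies across the GPT example, assuming the OneFlow 0.x lazy-mode API these files target; the job, variable name, shape, and initializer below are invented for illustration.

import oneflow as flow
import oneflow.typing as tp

@flow.global_function()
def make_variable_job() -> tp.Numpy:
    w = flow.get_variable(
        "w",
        shape=(4, 4),
        initializer=flow.random_normal_initializer(),
        # parallel_distribution=["B"],  # old keyword, replaced throughout this PR
        nd_sbp=["B"],                   # new keyword used after this PR
    )
    return w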
......@@ -34,7 +34,7 @@ class GPTDataLoader(object):
random_seed=self.seed,
split_sizes=self.split,
split_index=0,
- parallel_distribution=distribute.get_data_parallel_dist(),
+ nd_sbp=distribute.get_data_parallel_dist(),
name=self.name,
)
......
......@@ -235,9 +235,9 @@ def forward_p2b_parallel_cast(x):
# backward: B -> B, identity
x = flow.hierarchical_parallel_cast(
x,
- parallel_distribution=parallel_dist,
+ nd_sbp=parallel_dist,
grad_mode="manual",
- grad_parallel_distribution=parallel_dist,
+ grad_nd_sbp=parallel_dist,
)
elif dist_util.is_data_parallel():
# parallel cast: S(0) -> S(0), identity
......@@ -265,9 +265,9 @@ def backward_p2b_parallel_cast(x):
# backward: [S(0), P] cast to [S(0), B], for layernorm grad not supporting P, cast from P to B
x = flow.hierarchical_parallel_cast(
x,
- parallel_distribution=parallel_dist,
+ nd_sbp=parallel_dist,
grad_mode="manual",
- grad_parallel_distribution=parallel_dist,
+ grad_nd_sbp=parallel_dist,
)
elif dist_util.is_data_parallel():
# parallel cast: S(0) -> S(0), identity
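A sketch of the cast pattern used by the two functions above, keeping the flow.hierarchical_parallel_cast signature exactly as it appears in this diff; the wrapper name is invented. With grad_mode="manual" the forward cast and the gradient cast are specified independently, which is how, per the comments above, one direction can be an identity while the other casts a partial-sum (P) layout to broadcast (B), e.g. because the layernorm gradient cannot consume P.

import oneflow as flow

def pinned_layout_cast(x, parallel_dist):
    # nd_sbp pins the layout of x after the cast (forward pass); grad_nd_sbp
    # pins the layout of the incoming gradient (backward pass). Whether each
    # direction is an identity or a P -> B cast depends on the layout the
    # tensor or its gradient arrives with.
    return flow.hierarchical_parallel_cast(
        x,
        nd_sbp=parallel_dist,
        grad_mode="manual",
        grad_nd_sbp=parallel_dist,
    )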
......@@ -288,7 +288,7 @@ def output_parallel_cast(x, device="gpu"):
dist_util = get_dist_util()
if dist_util.is_hybrid_parallel():
with flow.scope.placement(device, dist_util.get_layer_placement(-1)):
- x = flow.hierarchical_parallel_cast(x, parallel_distribution=["B"])
+ x = flow.hierarchical_parallel_cast(x, nd_sbp=["B"])
return x
......@@ -297,7 +297,7 @@ def input_data_parallel_cast(x):
dist_util = get_dist_util()
if dist_util.is_hybrid_parallel():
x = flow.hierarchical_parallel_cast(
- x, parallel_distribution=get_data_parallel_dist(),
+ x, nd_sbp=get_data_parallel_dist(),
)
return x
......@@ -112,13 +112,13 @@ class Embedding(object):
"wpe",
shape=(self.seq_length, self.hidden_size),
initializer=self.wpe_initializer,
- parallel_distribution=distribute.get_wpe_parallel_dist(),
+ nd_sbp=distribute.get_wpe_parallel_dist(),
)
wte = flow.get_variable(
"wte",
shape=(self.vocab_size, self.hidden_size),
initializer=self.wte_initializer,
- parallel_distribution=distribute.get_wte_parallel_dist(),
+ nd_sbp=distribute.get_wte_parallel_dist(),
)
# 2d sbp sig: [B, S(0)] x [S(0), B] -> [S(0), P] -> [S(0), B]
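The 2-D SBP signature in the comment above means the model-parallel axis of the embedding gather produces a partial-sum (P) result that is then cast to broadcast (B). A toy NumPy sketch (not OneFlow code; sizes and names invented) of why a vocab-split (S(0)) wte yields per-rank partials whose elementwise sum is the full lookup:

import numpy as np

vocab_size, hidden_size, num_ranks = 8, 4, 2
wte = np.random.randn(vocab_size, hidden_size)
ids = np.array([1, 5, 6, 2])

full = wte[ids]                               # reference: full-table lookup

slices = np.split(wte, num_ranks, axis=0)     # S(0): split wte by vocab rows
partials = []
for rank, local in enumerate(slices):
    lo = rank * (vocab_size // num_ranks)
    hi = lo + vocab_size // num_ranks
    owned = (ids >= lo) & (ids < hi)
    part = np.zeros((len(ids), hidden_size))
    part[owned] = local[ids[owned] - lo]      # ids outside this slice stay zero
    partials.append(part)

# Summing the per-rank partials (what a P -> B cast / all-reduce performs)
# recovers the full embedding lookup.
assert np.allclose(sum(partials), full)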
......@@ -569,7 +569,7 @@ def layernorm(
trainable=True,
model_name="beta",
reuse=False,
- parallel_distribution=params_parallel_dist,
+ nd_sbp=params_parallel_dist,
)
gamma = flow.get_variable(
......@@ -580,7 +580,7 @@ def layernorm(
trainable=True,
model_name="gamma",
reuse=False,
- parallel_distribution=params_parallel_dist,
+ nd_sbp=params_parallel_dist,
)
return flow.nn.layer_norm(
......@@ -604,14 +604,14 @@ def get_linear_params(
shape=(input_size, output_size),
dtype=dtype,
initializer=weight_initializer,
- parallel_distribution=weight_parallel_dist,
+ nd_sbp=weight_parallel_dist,
)
bias = flow.get_variable(
name="bias",
shape=(output_size,),
dtype=dtype,
initializer=bias_initializer,
- parallel_distribution=bias_parallel_dist,
+ nd_sbp=bias_parallel_dist,
)
return weight, bias
......
......@@ -145,7 +145,7 @@ class Metric(object):
output = outputs[key].numpy()
assert isinstance(output, np.ndarray)
if micro_batches is None:
- micro_batches = output.shape[0]
+ micro_batches = output.shape[0] if output.shape else 1
else:
assert micro_batches == output.shape[0]
self.kv_store_[key] += output.sum()
......
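For the Metric change above: a 0-d NumPy array (for example an already-reduced scalar output) has shape == (), so output.shape[0] would raise an IndexError; the updated expression falls back to treating such an output as a single micro-batch. A small sketch with an invented helper name:

import numpy as np

def infer_micro_batches(output):
    # Mirrors the updated expression: use the leading dimension when there is
    # one, otherwise treat a 0-d (scalar) output as one micro-batch.
    return output.shape[0] if output.shape else 1

assert infer_micro_batches(np.zeros((4, 1))) == 4   # per-micro-batch outputs
assert infer_micro_batches(np.asarray(3.5)) == 1    # 0-d scalar output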