Unverified commit de8c0ba5, authored by Nyakku Shigure, committed by GitHub

[CodeStyle][W291] trim trailing whitespace in python file (#45937)

* trim trailing whitespace

* fix `.cmake-format.py`

* revert npu ut changes, avoid npu ci error
Parent cbe64cc1
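For context, flake8's W291 reports trailing whitespace at the end of a source line (W293 reports whitespace-only lines). A cleanup of this kind can be reproduced with a small script along the lines of the sketch below (illustrative only; the exact tooling used for this commit is not shown here, and a pre-commit trailing-whitespace hook or an autoformatter is the more common choice):

import pathlib

# Strip trailing whitespace (W291/W293) from every .py file under the
# current directory, preserving the original final newline if present.
for path in pathlib.Path('.').rglob('*.py'):
    text = path.read_text(encoding='utf-8')
    cleaned = '\n'.join(line.rstrip() for line in text.splitlines())
    if text.endswith('\n'):
        cleaned += '\n'
    if cleaned != text:
        path.write_text(cleaned, encoding='utf-8')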
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
......@@ -22,7 +22,7 @@ def GenerateFileStructureForFinalDygraph(eager_dir):
|- generated
| |- CMakeLists.txt
| | "add_subdirectory(forwards), add_subdirectory(backwards)"
|
|
| |- forwards
| |- "dygraph_functions.cc"
| |- "dygraph_functions.h"
......@@ -59,7 +59,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
|- generated
| |- CMakeLists.txt
| | "add_subdirectory(forwards), add_subdirectory(nodes)"
|
|
| |- forwards
| |- "dygraph_forward_functions.cc"
| |- CMakeLists.txt
......@@ -70,7 +70,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
| |- "nodes.h"
| |- CMakeLists.txt
| | "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})"
|
|
| |- dygraph_forward_api.h
"""
# Directory Generation
......
......@@ -403,9 +403,9 @@ LAYOUT_LOGIC_TEMPLATE=\
if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) {{
VLOG(5) << "Check and Prepare For LAYOUT";
paddle::small_vector<std::vector<paddle::experimental::Tensor>, egr::kSlotSmallVectorSize> tensors_vector = {};
{}
{}
paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune();
{}
paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune();
{}
{}
paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune();
......@@ -1772,7 +1772,7 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
autograd_api = self.grad_api_contents['invoke'].replace(
forward_api_name, forward_api_name + '_dygraph_function', 1)
grad_function_call_str = f"""
if (trace_backward) {{
if (trace_backward) {{
{indent}{autograd_api_out} api_output = {autograd_api};
{out_assign_str}}} else {{
{indent}{autograd_api_out} api_output = paddle::experimental::{self.namespace}{self.grad_api_contents['invoke']};
......
......@@ -20,7 +20,7 @@ def untar(fname, dirs):
"""
extract the tar.gz file
:param fname: the name of tar.gz file
:param dirs: the path of decompressed file
:param dirs: the path of decompressed file
:return: bool
"""
try:
......
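The `untar` helper documented above is collapsed in this diff; a minimal sketch of such a function, based only on the docstring (illustrative, not the repository's actual implementation), could look like:

import tarfile

def untar(fname, dirs):
    """Extract the tar.gz archive `fname` into the directory `dirs`; return True on success."""
    try:
        with tarfile.open(fname, 'r:gz') as archive:
            archive.extractall(path=dirs)
        return True
    except Exception as err:
        print(err)
        return False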
......@@ -106,8 +106,8 @@ def convert_pascalvoc_local2bin(args):
for object in objects:
bbox_sample = []
# start from 1
bbox_sample.append(
float(label_list.index(object.find('name').text)))
bbox_sample.append(float(label_list.index(
object.find('name').text)))
bbox = object.find('bndbox')
difficult = float(object.find('difficult').text)
bbox_sample.append(float(bbox.find('xmin').text) / im_width)
......@@ -131,7 +131,7 @@ def convert_pascalvoc_local2bin(args):
f1.close()
object_nums_sum = sum(object_nums)
    # The data should contain
    # number of images + all images data + an array that represents the object numbers of each image
# + labels of all objects in images + bboxes of all objects + difficulties of all objects
# so the target size should be as follows:
......@@ -210,8 +210,8 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path):
for object in objects:
bbox_sample = []
bbox_sample.append(
float(label_list.index(object.find('name').text)))
bbox_sample.append(float(label_list.index(
object.find('name').text)))
bbox = object.find('bndbox')
difficult = float(object.find('difficult').text)
bbox_sample.append(float(bbox.find('xmin').text) / im_width)
......@@ -230,7 +230,7 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path):
if line_idx % per_percentage:
print_processbar(line_idx / per_percentage)
        # The data should be stored in binary in the following sequence:
        # number of images->all images data->an array that represents object numbers in each image
# ->labels of all objects in images->bboxes of all objects->difficulties of all objects
f1.write(np.array(object_nums).astype('uint64').tobytes())
......@@ -258,9 +258,9 @@ def download_pascalvoc(data_url, data_dir, tar_targethash, tar_path):
def run_convert():
try_limit = 2
retry = 0
while not (os.path.exists(DATA_OUT_PATH) and
os.path.getsize(DATA_OUT_PATH) == BIN_FULLSIZE and BIN_TARGETHASH
== hashlib.md5(open(DATA_OUT_PATH, 'rb').read()).hexdigest()):
while not (os.path.exists(DATA_OUT_PATH) and os.path.getsize(DATA_OUT_PATH)
== BIN_FULLSIZE and BIN_TARGETHASH == hashlib.md5(
open(DATA_OUT_PATH, 'rb').read()).hexdigest()):
if os.path.exists(DATA_OUT_PATH):
sys.stderr.write(
"The existing binary file is broken. It is being removed...\n")
......@@ -275,52 +275,52 @@ def run_convert():
def main_pascalvoc_preprocess(args):
parser = argparse.ArgumentParser(
description="Convert the full pascalvoc val set or local data to binary file.",
description=
"Convert the full pascalvoc val set or local data to binary file.",
usage=None,
add_help=True)
parser.add_argument(
'--local',
action="store_true",
help="If used, user need to set --data_dir and then convert file")
parser.add_argument(
"--data_dir", default="", type=str, help="Dataset root directory")
parser.add_argument("--data_dir",
default="",
type=str,
help="Dataset root directory")
parser.add_argument(
"--img_annotation_list",
type=str,
default="test_100.txt",
help="A file containing the image file path and corresponding annotation file path"
help=
"A file containing the image file path and corresponding annotation file path"
)
parser.add_argument(
"--label_file",
type=str,
default="label_list",
help="List of object labels with same sequence as denoted in the annotation file"
help=
"List of object labels with same sequence as denoted in the annotation file"
)
parser.add_argument(
"--output_file",
type=str,
default="pascalvoc_small.bin",
help="File path of the output binary file")
parser.add_argument(
"--resize_h",
type=int,
default=RESIZE_H,
help="Image preprocess with resize_h")
parser.add_argument(
"--resize_w",
type=int,
default=RESIZE_W,
help="Image prerocess with resize_w")
parser.add_argument(
"--mean_value",
type=str,
default=MEAN_VALUE,
help="Image preprocess with mean_value")
parser.add_argument(
"--ap_version",
type=str,
default=AP_VERSION,
help="Image preprocess with ap_version")
parser.add_argument("--output_file",
type=str,
default="pascalvoc_small.bin",
help="File path of the output binary file")
parser.add_argument("--resize_h",
type=int,
default=RESIZE_H,
help="Image preprocess with resize_h")
parser.add_argument("--resize_w",
type=int,
default=RESIZE_W,
help="Image prerocess with resize_w")
parser.add_argument("--mean_value",
type=str,
default=MEAN_VALUE,
help="Image preprocess with mean_value")
parser.add_argument("--ap_version",
type=str,
default=AP_VERSION,
help="Image preprocess with ap_version")
args = parser.parse_args()
if args.local:
convert_pascalvoc_local2bin(args)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -20,6 +20,7 @@ import sys
class AbsNet(paddle.nn.Layer):
def __init__(self):
super(AbsNet, self).__init__()
......@@ -32,7 +33,6 @@ if __name__ == '__main__':
# build network
model = AbsNet()
# save inferencing format model
net = to_static(
model, input_spec=[InputSpec(
shape=[None, 1, 28, 28], name='x')])
net = to_static(model,
input_spec=[InputSpec(shape=[None, 1, 28, 28], name='x')])
paddle.jit.save(net, sys.argv[1])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -20,7 +20,6 @@ import paddle
import sys
model = EfficientNet.from_name('efficientnet-b4')
net = to_static(
model, input_spec=[InputSpec(
shape=[None, 3, 256, 256], name='x')])
net = to_static(model,
input_spec=[InputSpec(shape=[None, 3, 256, 256], name='x')])
paddle.jit.save(net, sys.argv[1])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -38,8 +38,8 @@ class MBConvBlock(nn.Layer):
self._block_args = block_args
self._bn_mom = global_params.batch_norm_momentum
self._bn_eps = global_params.batch_norm_epsilon
self.has_se = (self._block_args.se_ratio is not None) and (
0 < self._block_args.se_ratio <= 1)
self.has_se = (self._block_args.se_ratio
is not None) and (0 < self._block_args.se_ratio <= 1)
self.id_skip = block_args.id_skip # skip connection and drop connect
# Get static or dynamic convolution depending on image size
......@@ -49,13 +49,13 @@ class MBConvBlock(nn.Layer):
inp = self._block_args.input_filters # number of input channels
oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels
if self._block_args.expand_ratio != 1:
self._expand_conv = Conv2d(
in_channels=inp,
out_channels=oup,
kernel_size=1,
bias_attr=False)
self._bn0 = nn.BatchNorm2D(
num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps)
self._expand_conv = Conv2d(in_channels=inp,
out_channels=oup,
kernel_size=1,
bias_attr=False)
self._bn0 = nn.BatchNorm2D(num_features=oup,
momentum=self._bn_mom,
epsilon=self._bn_eps)
# Depthwise convolution phase
k = self._block_args.kernel_size
......@@ -67,32 +67,31 @@ class MBConvBlock(nn.Layer):
kernel_size=k,
stride=s,
bias_attr=False)
self._bn1 = nn.BatchNorm2D(
num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps)
self._bn1 = nn.BatchNorm2D(num_features=oup,
momentum=self._bn_mom,
epsilon=self._bn_eps)
# Squeeze and Excitation layer, if desired
if self.has_se:
num_squeezed_channels = max(1,
int(self._block_args.input_filters *
self._block_args.se_ratio))
self._se_reduce = Conv2d(
in_channels=oup,
out_channels=num_squeezed_channels,
kernel_size=1)
self._se_expand = Conv2d(
in_channels=num_squeezed_channels,
out_channels=oup,
kernel_size=1)
num_squeezed_channels = max(
1,
int(self._block_args.input_filters * self._block_args.se_ratio))
self._se_reduce = Conv2d(in_channels=oup,
out_channels=num_squeezed_channels,
kernel_size=1)
self._se_expand = Conv2d(in_channels=num_squeezed_channels,
out_channels=oup,
kernel_size=1)
# Output phase
final_oup = self._block_args.output_filters
self._project_conv = Conv2d(
in_channels=oup,
out_channels=final_oup,
kernel_size=1,
bias_attr=False)
self._bn2 = nn.BatchNorm2D(
num_features=final_oup, momentum=self._bn_mom, epsilon=self._bn_eps)
self._project_conv = Conv2d(in_channels=oup,
out_channels=final_oup,
kernel_size=1,
bias_attr=False)
self._bn2 = nn.BatchNorm2D(num_features=final_oup,
momentum=self._bn_mom,
epsilon=self._bn_eps)
self._swish = nn.Hardswish()
def forward(self, inputs, drop_connect_rate=None):
......@@ -121,8 +120,9 @@ class MBConvBlock(nn.Layer):
input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
if drop_connect_rate:
x = drop_connect(
x, prob=drop_connect_rate, training=self.training)
x = drop_connect(x,
prob=drop_connect_rate,
training=self.training)
x = x + inputs # skip connection
return x
......@@ -162,10 +162,14 @@ class EfficientNet(nn.Layer):
in_channels = 3 # rgb
out_channels = round_filters(
32, self._global_params) # number of output channels
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias_attr=False)
self._bn0 = nn.BatchNorm2D(
num_features=out_channels, momentum=bn_mom, epsilon=bn_eps)
self._conv_stem = Conv2d(in_channels,
out_channels,
kernel_size=3,
stride=2,
bias_attr=False)
self._bn0 = nn.BatchNorm2D(num_features=out_channels,
momentum=bn_mom,
epsilon=bn_eps)
# Build blocks
self._blocks = nn.LayerList([])
......@@ -186,16 +190,19 @@ class EfficientNet(nn.Layer):
block_args = block_args._replace(
input_filters=block_args.output_filters, stride=1)
for _ in range(block_args.num_repeat - 1):
self._blocks.append(
MBConvBlock(block_args, self._global_params))
self._blocks.append(MBConvBlock(block_args,
self._global_params))
# Head
in_channels = block_args.output_filters # output of final block
out_channels = round_filters(1280, self._global_params)
self._conv_head = Conv2d(
in_channels, out_channels, kernel_size=1, bias_attr=False)
self._bn1 = nn.BatchNorm2D(
num_features=out_channels, momentum=bn_mom, epsilon=bn_eps)
self._conv_head = Conv2d(in_channels,
out_channels,
kernel_size=1,
bias_attr=False)
self._bn1 = nn.BatchNorm2D(num_features=out_channels,
momentum=bn_mom,
epsilon=bn_eps)
# Final linear layer
self._avg_pooling = nn.AdaptiveAvgPool2D(1)
......@@ -253,20 +260,21 @@ class EfficientNet(nn.Layer):
advprop=False,
num_classes=1000,
in_channels=3):
model = cls.from_name(
model_name, override_params={'num_classes': num_classes})
load_pretrained_weights(
model, model_name, load_fc=(num_classes == 1000), advprop=advprop)
model = cls.from_name(model_name,
override_params={'num_classes': num_classes})
load_pretrained_weights(model,
model_name,
load_fc=(num_classes == 1000),
advprop=advprop)
if in_channels != 3:
Conv2d = get_same_padding_conv2d(
image_size=model._global_params.image_size)
out_channels = round_filters(32, model._global_params)
model._conv_stem = Conv2d(
in_channels,
out_channels,
kernel_size=3,
stride=2,
bias_attr=False)
model._conv_stem = Conv2d(in_channels,
out_channels,
kernel_size=3,
stride=2,
bias_attr=False)
return model
@classmethod
......@@ -280,5 +288,5 @@ class EfficientNet(nn.Layer):
""" Validates model name. """
valid_models = ['efficientnet-b' + str(i) for i in range(9)]
if model_name not in valid_models:
raise ValueError('model_name should be one of: ' + ', '.join(
valid_models))
raise ValueError('model_name should be one of: ' +
', '.join(valid_models))
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -96,15 +96,14 @@ class Conv2dDynamicSamePadding(nn.Conv2D):
dilation=1,
groups=1,
bias_attr=None):
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
0,
dilation,
groups,
bias_attr=bias_attr)
super().__init__(in_channels,
out_channels,
kernel_size,
stride,
0,
dilation,
groups,
bias_attr=bias_attr)
self.stride = self._stride if len(
self._stride) == 2 else [self._stride[0]] * 2
......@@ -113,10 +112,12 @@ class Conv2dDynamicSamePadding(nn.Conv2D):
kh, kw = self.weight.shape[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self._dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self._dilation[1] + 1 - iw, 0)
pad_h = max(
(oh - 1) * self.stride[0] + (kh - 1) * self._dilation[0] + 1 - ih,
0)
pad_w = max(
(ow - 1) * self.stride[1] + (kw - 1) * self._dilation[1] + 1 - iw,
0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
......@@ -142,15 +143,18 @@ class Conv2dStaticSamePadding(nn.Conv2D):
# Calculate padding based on image size and save it
assert image_size is not None
ih, iw = image_size if type(
image_size) == list else [image_size, image_size]
ih, iw = image_size if type(image_size) == list else [
image_size, image_size
]
kh, kw = self.weight.shape[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self._dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self._dilation[1] + 1 - iw, 0)
pad_h = max(
(oh - 1) * self.stride[0] + (kh - 1) * self._dilation[0] + 1 - ih,
0)
pad_w = max(
(ow - 1) * self.stride[1] + (kw - 1) * self._dilation[1] + 1 - iw,
0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.Pad2D([
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
......@@ -166,6 +170,7 @@ class Conv2dStaticSamePadding(nn.Conv2D):
class Identity(nn.Layer):
def __init__(self, ):
super().__init__()
......@@ -225,9 +230,12 @@ class BlockDecoder(object):
def _encode_block_string(block):
"""Encodes a block to a string."""
args = [
'r%d' % block.num_repeat, 'k%d' % block.kernel_size, 's%d%d' %
(block.strides[0], block.strides[1]), 'e%s' % block.expand_ratio,
'i%d' % block.input_filters, 'o%d' % block.output_filters
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
's%d%d' % (block.strides[0], block.strides[1]),
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
if 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
......@@ -291,7 +299,8 @@ def efficientnet(width_coefficient=None,
depth_coefficient=depth_coefficient,
depth_divisor=8,
min_depth=None,
image_size=image_size, )
image_size=image_size,
)
return blocks_args, global_params
......@@ -300,11 +309,10 @@ def get_model_params(model_name, override_params):
""" Get the block args and global params for a given model """
if model_name.startswith('efficientnet'):
w, d, s, p = efficientnet_params(model_name)
blocks_args, global_params = efficientnet(
width_coefficient=w,
depth_coefficient=d,
dropout_rate=p,
image_size=s)
blocks_args, global_params = efficientnet(width_coefficient=w,
depth_coefficient=d,
dropout_rate=p,
image_size=s)
else:
raise NotImplementedError('model name is not pre-defined: %s' %
model_name)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -28,6 +28,7 @@ CLASS_NUM = 10
# define a random dataset
class RandomDataset(paddle.io.Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
......@@ -41,6 +42,7 @@ class RandomDataset(paddle.io.Dataset):
class LinearNet(nn.Layer):
def __init__(self):
super(LinearNet, self).__init__()
self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
......@@ -69,8 +71,11 @@ adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters())
# create data loader
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
loader = paddle.io.DataLoader(
dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2)
loader = paddle.io.DataLoader(dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
# train
train(layer, loader, loss_fn, adam)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -19,7 +19,6 @@ from paddle.static import InputSpec
import sys
model = resnet50(True)
net = to_static(
model, input_spec=[InputSpec(
shape=[None, 3, 256, 256], name='x')])
net = to_static(model,
input_spec=[InputSpec(shape=[None, 3, 256, 256], name='x')])
paddle.jit.save(net, sys.argv[1])
......@@ -706,29 +706,29 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
{code_indent} std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{{"""
for input_name in single_tensor_names[:-1]:
if input_name in self.optional_vars:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{input_name}", {input_name}_record_shapes}},"""
else:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{input_name}", {{"""
input_tensors = input_name_tensor_map[input_name]
for input_tensor, _ in input_tensors[:-1]:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} (*{input_tensor}).dims(),"""
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} (*{input_tensors[-1][0]}).dims()}}}},"""
if single_tensor_names[-1] in self.optional_vars:
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{single_tensor_names[-1]}",
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{single_tensor_names[-1]}",
{code_indent} {single_tensor_names[-1]}_record_shapes}}}};"""
else:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{single_tensor_names[-1]}", {{"""
input_tensors = input_name_tensor_map[single_tensor_names[-1]]
for input_tensor, _ in input_tensors[:-1]:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} (*{input_tensor}).dims(),"""
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} (*{input_tensors[-1][0]}).dims()}}}}}};"""
if list_tensor_names:
input_tensor_code = input_tensor_code + f"""
......@@ -757,14 +757,14 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
{code_indent} ddims_vec.emplace_back((*{input_tensor_truncate}[i]).dims());
{code_indent} }}"""
else:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
ddims_vec.emplace_back((*{input_tensor}).dims());
{code_indent} """
input_tensor_code = input_tensor_code + f"""
{code_indent} input_shapes.emplace_back("{input_name}", ddims_vec);"""
input_tensor_code = input_tensor_code + f"""
{code_indent} platform::RecordOpInfoSupplement("{self.api}", input_shapes);
input_tensor_code = input_tensor_code + f"""
{code_indent} platform::RecordOpInfoSupplement("{self.api}", input_shapes);
{code_indent} }}"""
kernel_args = ["*dev_ctx"]
for param in kernel_param:
......
#!/bin/python
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -24,17 +24,17 @@ import time
def parse_args():
parser = argparse.ArgumentParser("conda build for paddlepaddle version")
parser.add_argument(
"--paddle_version",
type=str,
required=True,
help="paddle version for conda build.")
parser.add_argument("--paddle_version",
type=str,
required=True,
help="paddle version for conda build.")
args = parser.parse_args()
return args
class ConstantVar:
def __init__(self):
self.build = r"""
build:
......@@ -89,7 +89,7 @@ about:
self.build_const = r"""
"""
self.blt_const = r"""
self.blt_const = r"""
"""
self.python36 = r" - python>=3.6, <3.7"
......
......@@ -25,21 +25,21 @@ def auto_cast(enable=True,
dtype='float16'):
"""
Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
imperative mode. It is used together with `decorator` to achieve Pure fp16 in imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support
fp16 calculation and are considered numerically-safe and performance-critical. These ops
fp16 calculation and are considered numerically-safe and performance-critical. These ops
will be converted to fp16.
custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
        level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, where the input data type of each operator is cast according to the white_list and black_list;
             O2 represents pure fp16, where all operator parameters and input data are cast to fp16, except for operators in the black_list, operators without an fp16 kernel, and batch norm. Default is O1 (amp).
dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'.
......@@ -69,7 +69,7 @@ def auto_cast(enable=True,
with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}):
c = a + b
print(c.dtype) # paddle.float32
with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'):
d = a + b
print(d.dtype) # paddle.float32
......@@ -85,15 +85,15 @@ def decorate(models,
master_weight=None,
save_dtype=None):
"""
Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing.
Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing.
When level is O2(pure float16/bfloat16), the decorate will cast all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm.
Commonly, it is used together with `auto_cast` to achieve Pure float16/bfloat16 in imperative mode.
Args:
models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None.
optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None.
        level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, where the decorator does nothing;
             O2 represents pure float16/bfloat16, where the decorator casts all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm. Default is O1 (amp).
dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'.
        master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, the optimizer will use multi-precision at the O2 level. Default is None.
......@@ -102,7 +102,7 @@ def decorate(models,
Examples:
.. code-block:: python
.. code-block:: python
# required: gpu
# Demo1: single model and optimizer:
......@@ -118,7 +118,7 @@ def decorate(models,
with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = model(data)
print(output.dtype) # FP16
# required: gpu
# Demo2: multi models and optimizers:
model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
......@@ -133,7 +133,7 @@ def decorate(models,
output2 = models[1](data)
print(output.dtype) # FP16
print(output2.dtype) # FP16
# required: gpu
# Demo3: optimizers is None:
model3 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
......
......@@ -25,7 +25,7 @@ def _refresh_optimizer_state():
class GradScaler(AmpScaler):
"""
GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode.
GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode.
It controls the scaling of loss, helps avoiding numerical overflow.
The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters.
......@@ -36,19 +36,19 @@ class GradScaler(AmpScaler):
`update` is used to update the loss_scaling.
Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
dynamic graph mode.
Args:
enable(bool, optional): Enable loss scaling or not. Default is True.
init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
incr_ratio(float, optional): The multiplier to use when increasing the loss
incr_ratio(float, optional): The multiplier to use when increasing the loss
scaling. Default is 2.0.
decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
the loss scaling. Default is 0.5.
incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
steps with finite gradients. Default is 1000.
decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
accumulated steps with nan or inf gradients. Default is 2.
        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
Returns:
......@@ -57,7 +57,7 @@ class GradScaler(AmpScaler):
Examples:
.. code-block:: python
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
......@@ -68,10 +68,10 @@ class GradScaler(AmpScaler):
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.minimize(optimizer, scaled) # update parameters
scaler.minimize(optimizer, scaled) # update parameters
optimizer.clear_grad()
"""
......@@ -90,18 +90,18 @@ class GradScaler(AmpScaler):
def scale(self, var):
"""
Multiplies a Tensor by the scale factor and returns scaled outputs.
Multiplies a Tensor by the scale factor and returns scaled outputs.
        If this instance of :class:`GradScaler` is not enabled, outputs are returned unmodified.
Args:
var (Tensor): The tensor to scale.
Returns:
The scaled tensor or original tensor.
Examples:
.. code-block:: python
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
......@@ -113,9 +113,9 @@ class GradScaler(AmpScaler):
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.minimize(optimizer, scaled) # update parameters
scaler.minimize(optimizer, scaled) # update parameters
optimizer.clear_grad()
"""
return super(GradScaler, self).scale(var)
......@@ -123,7 +123,7 @@ class GradScaler(AmpScaler):
def minimize(self, optimizer, *args, **kwargs):
"""
This function is similar as `optimizer.minimize()`, which performs parameters updating.
If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.
......@@ -149,9 +149,9 @@ class GradScaler(AmpScaler):
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.minimize(optimizer, scaled) # update parameters
scaler.minimize(optimizer, scaled) # update parameters
optimizer.clear_grad()
"""
return super(GradScaler, self).minimize(optimizer, *args, **kwargs)
......@@ -159,7 +159,7 @@ class GradScaler(AmpScaler):
def step(self, optimizer):
"""
This function is similar as `optimizer.step()`, which performs parameters updating.
If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.
......@@ -169,7 +169,7 @@ class GradScaler(AmpScaler):
Examples:
.. code-block:: python
# required: gpu
import paddle
......@@ -180,7 +180,7 @@ class GradScaler(AmpScaler):
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.step(optimizer) # update parameters
scaler.update() # update the loss scaling ratio
......@@ -212,11 +212,11 @@ class GradScaler(AmpScaler):
def update(self):
"""
Updates the loss_scaling.
Examples:
.. code-block:: python
# required: gpu
import paddle
......@@ -227,11 +227,11 @@ class GradScaler(AmpScaler):
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.step(optimizer) # update parameters
scaler.update() # update the loss scaling ratio
optimizer.clear_grad()
optimizer.clear_grad()
"""
if not self._enable:
return
......@@ -242,7 +242,7 @@ class GradScaler(AmpScaler):
def unscale_(self, optimizer):
"""
Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).
Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).
        If this instance of :class:`GradScaler` is not enabled, outputs are returned unmodified.
Args:
......@@ -250,7 +250,7 @@ class GradScaler(AmpScaler):
Returns:
The unscaled parameters or original parameters.
Examples:
.. code-block:: python
......@@ -265,12 +265,12 @@ class GradScaler(AmpScaler):
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.unscale_(optimizer) # unscale the parameter
scaler.step(optimizer)
scaler.update()
optimizer.clear_grad()
scaler.update()
optimizer.clear_grad()
"""
return super(GradScaler, self)._unscale(optimizer)
......@@ -280,7 +280,7 @@ class GradScaler(AmpScaler):
Returns:
            bool: returns True if loss scaling is enabled, otherwise False.
Examples:
.. code-block:: python
......@@ -304,11 +304,11 @@ class GradScaler(AmpScaler):
Returns:
            bool: returns False if fixed loss_scaling is used, and True if the loss scaling is updated dynamically.
Examples:
.. code-block:: python
# required: gpu,xpu
# required: gpu,xpu
import paddle
scaler = paddle.amp.GradScaler(enable=True,
init_loss_scaling=1024,
......@@ -328,7 +328,7 @@ class GradScaler(AmpScaler):
        Returns:
float: the initial loss scaling factor.
Examples:
.. code-block:: python
......@@ -352,10 +352,10 @@ class GradScaler(AmpScaler):
Args:
new_init_loss_scaling(float): The new_init_loss_scaling used to update initial loss scaling factor.
Examples:
.. code-block:: python
# required: gpu,xpu
import paddle
scaler = paddle.amp.GradScaler(enable=True,
......@@ -378,7 +378,7 @@ class GradScaler(AmpScaler):
        Returns:
float: the multiplier to use when increasing the loss scaling.
Examples:
.. code-block:: python
......@@ -402,7 +402,7 @@ class GradScaler(AmpScaler):
Args:
new_incr_ratio(float): The new_incr_ratio used to update the multiplier to use when increasing the loss scaling.
Examples:
.. code-block:: python
......@@ -428,7 +428,7 @@ class GradScaler(AmpScaler):
        Returns:
float: the less-than-one-multiplier to use when decreasing the loss scaling.
Examples:
.. code-block:: python
......@@ -452,7 +452,7 @@ class GradScaler(AmpScaler):
Args:
new_decr_ratio(float): The new_decr_ratio used to update the less-than-one-multiplier to use when decreasing the loss scaling.
Examples:
.. code-block:: python
......@@ -478,7 +478,7 @@ class GradScaler(AmpScaler):
        Returns:
int: the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.
Examples:
.. code-block:: python
......@@ -502,7 +502,7 @@ class GradScaler(AmpScaler):
Args:
new_incr_every_n_steps(int): The new_incr_every_n_steps used to update the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.
Examples:
.. code-block:: python
......@@ -528,7 +528,7 @@ class GradScaler(AmpScaler):
        Returns:
int: the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.
Examples:
.. code-block:: python
......@@ -552,7 +552,7 @@ class GradScaler(AmpScaler):
Args:
new_decr_every_n_nan_or_inf(int): The new_decr_every_n_nan_or_inf used to update the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.
Examples:
.. code-block:: python
......@@ -588,7 +588,7 @@ class GradScaler(AmpScaler):
decr_count(int): The number of recent consecutive skipped steps.
            use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
Examples:
.. code-block:: python
......@@ -610,10 +610,10 @@ class GradScaler(AmpScaler):
def load_state_dict(self, state_dict):
"""
Loads the scaler state.
Args:
state_dict(dict): scaler state. Should be an object returned from a call to `GradScaler.state_dict()`.
Examples:
.. code-block:: python
......
......@@ -247,7 +247,7 @@ def create_dct(n_mfcc: int,
"""Create a discrete cosine transform(DCT) matrix.
Args:
        n_mfcc (int): Number of mel frequency cepstral coefficients.
        n_mels (int): Number of mel filterbanks.
        norm (Optional[str], optional): Normalization type. Defaults to 'ortho'.
dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
......
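As a rough illustration of what the `create_dct` docstring above describes, a DCT-II basis matrix with optional 'ortho' normalization can be built in NumPy as follows (a sketch under common DCT conventions, not Paddle's actual implementation; `make_dct_matrix` is a hypothetical name):

import numpy as np

def make_dct_matrix(n_mfcc, n_mels, norm='ortho', dtype='float32'):
    # DCT-II basis of shape (n_mels, n_mfcc): cos(pi / n_mels * (m + 0.5) * k)
    m = np.arange(n_mels, dtype=np.float64)
    k = np.arange(n_mfcc, dtype=np.float64)
    dct = np.cos(np.pi / n_mels * np.outer(m + 0.5, k))
    if norm == 'ortho':
        # Orthonormal scaling: first column by 1/sqrt(2), all columns by sqrt(2/n_mels)
        dct[:, 0] *= 1.0 / np.sqrt(2.0)
        dct *= np.sqrt(2.0 / n_mels)
    return dct.astype(dtype)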
......@@ -24,12 +24,12 @@ __all__ = []
def backward(tensors, grad_tensors=None, retain_graph=False):
"""
Compute the backward gradients of given tensors.
Args:
tensors(list of Tensors): the tensors which the gradient to be computed. The tensors can not contain the same tensor.
grad_tensors(list of Tensors of None, optional): the init gradients of the `tensors`` .If not None, it must have the same length with ``tensors`` ,
and if any of the elements is None, then the init gradient is the default value which is filled with 1.0.
and if any of the elements is None, then the init gradient is the default value which is filled with 1.0.
If None, all the gradients of the ``tensors`` is the default value which is filled with 1.0.
Defaults to None.
......@@ -37,7 +37,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False):
like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
        :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
Defaults to False.
Returns:
NoneType: None
......
......@@ -54,16 +54,16 @@ class LegacyPyLayerContext(object):
def save_for_backward(self, *tensors):
"""
Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.
.. note::
This API should be called at most once, and only inside `forward`.
This API should be called at most once, and only inside `forward`.
Args:
tensors(list of Tensors): Tensors to be stored.
Returns:
None
Examples:
.. code-block:: python
......@@ -94,7 +94,7 @@ class LegacyPyLayerContext(object):
Get the tensors stored by ``save_for_backward``.
Returns:
list of Tensors or None: If context contains tensors stored by `save_for_backward`,
list of Tensors or None: If context contains tensors stored by `save_for_backward`,
then return these tensors, otherwise return None.
Examples:
......@@ -147,7 +147,7 @@ class CPyLayer(object):
Returns:
tensors or other types : output of PyLayer.
Examples:
.. code-block:: python
......@@ -210,15 +210,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules:
1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod.
Their first argument should be a context and `None` can not be included in the returned result.
    2. Input of backward contains a context as the first argument, and the remaining arguments are the
    gradients of forward's output tensors, so the number of backward's input tensors equals
    the number of forward's output tensors. If you need the forward's inputs or outputs in `backward`,
you can use `save_for_backward` to store the required tensors, and then use them in the backward.
3. Output of backward function can only be `Tensor` or tuple/list of `Tensor`.
    Output tensors of backward are the gradients of forward's input tensors,
    so the number of backward's output tensors equals the number of forward's input tensors.
After building the custom Layer, run it through the `apply` method.
Examples:
.. code-block:: python
......@@ -259,8 +259,8 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
@staticmethod
def forward(ctx, *args, **kwargs):
"""
        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
        the first argument, followed by any number of arguments (tensors or other types).
`None` can not be included in the returned result.
Args:
......@@ -269,7 +269,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
Returns:
tensors or other types : output of PyLayer.
Examples:
.. code-block:: python
......@@ -297,9 +297,9 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
@staticmethod
def backward(ctx, *args, **kwargs):
"""
        This is a function to calculate the gradient. It is to be overloaded by subclasses.
        It must accept an object of `PyLayerContext` as the first argument, and the remaining
        arguments are the gradients of forward's output tensors. Output tensors of backward
        are the gradients of forward's input tensors.
Args:
......@@ -308,7 +308,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
Returns:
Tensor or list of Tensors: The gradient of forward's input tensor(s).
Examples:
.. code-block:: python
......@@ -340,16 +340,16 @@ class EagerPyLayerContext(object):
def save_for_backward(self, *tensors):
"""
Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.
.. note::
This API should be called at most once, and only inside `forward`.
This API should be called at most once, and only inside `forward`.
Args:
tensors(list of Tensors): Tensors to be stored.
Returns:
None
Examples:
.. code-block:: python
......@@ -380,7 +380,7 @@ class EagerPyLayerContext(object):
Get the tensors stored by ``save_for_backward``.
Returns:
list of Tensors or None: If context contains tensors stored by `save_for_backward`,
list of Tensors or None: If context contains tensors stored by `save_for_backward`,
then return these tensors, otherwise return None.
Examples:
......@@ -410,11 +410,11 @@ class EagerPyLayerContext(object):
def mark_not_inplace(self, *args):
"""
Marks inputs as not inplace.
This should be called at most once, only from inside the `forward` method,
This should be called at most once, only from inside the `forward` method,
and all arguments should be Tensor inputs.
If the Tensor returned by `forward` method is the same as the Tensor input of forward,
and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output.
If the Tensor returned by `forward` method is the same as the Tensor input of forward,
and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output.
Thereby preventing the auto grad information of the input Tensor from being overwritten.
Examples:
......@@ -427,7 +427,7 @@ class EagerPyLayerContext(object):
def forward(ctx, x):
ctx.mark_not_inplace(x)
return x
@staticmethod
def backward(ctx, grad_output):
out = grad_output.exp()
......@@ -438,7 +438,7 @@ class EagerPyLayerContext(object):
attn_layers = []
for idx in range(0, 2):
attn_layers.append(Exp())
for step in range(0, 2):
a = x
for j in range(0,2):
......@@ -450,7 +450,7 @@ class EagerPyLayerContext(object):
def mark_non_differentiable(self, *args):
"""
Marks outputs as non-differentiable.
This should be called at most once, only from inside the `forward` method,
This should be called at most once, only from inside the `forward` method,
and all arguments should be tensor outputs.
This will mark outputs as not requiring gradients, increasing the
......@@ -564,8 +564,8 @@ class EagerPyLayer(
@staticmethod
def forward(ctx, *args, **kwargs):
"""
        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
        the first argument, followed by any number of arguments (tensors or other types).
`None` can not be included in the returned result.
Args:
......@@ -574,7 +574,7 @@ class EagerPyLayer(
Returns:
tensors or other types : output of PyLayer.
Examples:
.. code-block:: python
......@@ -602,9 +602,9 @@ class EagerPyLayer(
@staticmethod
def backward(ctx, *args):
"""
        This is a function to calculate the gradient. It is to be overloaded by subclasses.
        It must accept an object of `PyLayerContext` as the first argument, and the remaining
        arguments are the gradients of forward's output tensors. Output tensors of backward
        are the gradients of forward's input tensors.
Args:
......@@ -613,7 +613,7 @@ class EagerPyLayer(
Returns:
Tensor or list of Tensors: The gradient of forward's input tensor(s).
Examples:
.. code-block:: python
......
......@@ -17,30 +17,30 @@ __all__ = []
def batch(reader, batch_size, drop_last=False):
"""
    This operator creates a batched reader which combines the data from the
    input reader into batched data.
Args:
reader(generator): the data reader to read from.
batch_size(int): size of each mini-batch.
drop_last(bool, optional): If set to True, the last batch is dropped when
drop_last(bool, optional): If set to True, the last batch is dropped when
the size of last batch is not equal to batch_size, if set to False,
it will not. Default: False.
Returns:
The batched reader.
The batched reader.
Return Type:
generator
generator
Examples:
.. code-block:: python
import paddle
def reader():
for i in range(10):
yield i
batch_reader = paddle.batch(reader, batch_size=2)
for data in batch_reader():
print(data)
......
......@@ -25,7 +25,7 @@ long_type = int
def to_text(obj, encoding='utf-8', inplace=False):
"""
    All strings in PaddlePaddle should be represented as literal strings.
This function will convert object to a literal string without any encoding.
Especially, if the object type is a list or set container, we will iterate
all items in the object and convert them to literal string.
......@@ -43,7 +43,7 @@ def to_text(obj, encoding='utf-8', inplace=False):
Returns:
Decoded result of obj
Examples:
.. code-block:: python
......@@ -121,7 +121,7 @@ def _to_text(obj, encoding):
def to_bytes(obj, encoding='utf-8', inplace=False):
"""
    All strings in PaddlePaddle should be represented as literal strings.
This function will convert object to a bytes with specific encoding.
Especially, if the object type is a list or set container, we will iterate
all items in the object and convert them to bytes.
......@@ -140,7 +140,7 @@ def to_bytes(obj, encoding='utf-8', inplace=False):
Returns:
Decoded result of obj
Examples:
.. code-block:: python
......
......@@ -119,7 +119,7 @@ def XPUPlace(dev_id):
.. code-block:: python
# required: xpu
import paddle
place = paddle.device.XPUPlace(0)
"""
......@@ -163,15 +163,15 @@ def MLUPlace(dev_id):
def get_cudnn_version():
"""
    This function returns the version of cuDNN. The return value is an int which represents the
    cuDNN version. For example, a return value of 7600 means the cuDNN version is 7.6.
Returns:
        int: An int value which represents the cuDNN version. If cuDNN is not installed, it returns None.
Examples:
.. code-block:: python
import paddle
cudnn_version = paddle.device.get_cudnn_version()
......@@ -305,7 +305,7 @@ def set_device(device):
Examples:
.. code-block:: python
import paddle
paddle.device.set_device("cpu")
......@@ -322,13 +322,13 @@ def get_device():
"""
    This function gets the current global device on which the program is running.
    It is a string such as 'cpu', 'gpu:x', 'xpu:x', 'mlu:x' or 'npu:x'. If the global device is not
    set, it returns 'gpu:x' when CUDA is available, otherwise it returns 'cpu'.
Examples:
.. code-block:: python
import paddle
device = paddle.device.get_device()
......@@ -394,7 +394,7 @@ def get_all_custom_device_type():
"""
Get all available custom device types.
Returns:
Returns:
A list of all available custom device types.
Examples:
......
......@@ -42,12 +42,12 @@ def current_stream(device=None):
Return the current CUDA stream by the device.
Parameters:
device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from.
device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from.
If device is None, the device is the current device. Default: None.
Returns:
CUDAStream: the stream to the device.
Examples:
.. code-block:: python
......@@ -82,7 +82,7 @@ def synchronize(device=None):
Parameters:
device(paddle.CUDAPlace()|int, optional): The device or the ID of the device.
If device is None, the device is the current device. Default: None.
Examples:
.. code-block:: python
......@@ -111,7 +111,7 @@ def synchronize(device=None):
def device_count():
'''
Return the number of GPUs available.
Returns:
int: the number of GPUs available.
......@@ -158,7 +158,7 @@ def extract_cuda_device_id(device, op_name):
Return the id of the given cuda device. It is just a utility that will not be exposed to users.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'.
Default: None.
......@@ -197,12 +197,12 @@ def max_memory_allocated(device=None):
Return the peak size of gpu memory that is allocated to tensor of the given device.
.. note::
        The size of GPU memory allocated to a tensor is 256-byte aligned in Paddle, which may be larger than the memory size that the tensor actually needs.
For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
......@@ -232,8 +232,8 @@ def max_memory_reserved(device=None):
Return the peak size of GPU memory that is held by the allocator of the given device.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
......@@ -263,12 +263,12 @@ def memory_allocated(device=None):
Return the current size of gpu memory that is allocated to tensor of the given device.
.. note::
The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
......@@ -298,14 +298,14 @@ def memory_reserved(device=None):
Return the current size of GPU memory that is held by the allocator of the given device.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
int: The current size of GPU memory that is held by the allocator of the given device, in bytes.
Examples:
Examples:
.. code-block:: python
# required: gpu
......@@ -389,18 +389,18 @@ def get_device_properties(device=None):
Return the properties of given device.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x' which to get the properties of the
device from. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x' which to get the properties of the
device from. If device is None, the device is the current device.
Default: None.
Returns:
_gpuDeviceProperties: The properties of the device which include ASCII string
identifying device, major compute capability, minor compute capability, global
_gpuDeviceProperties: The properties of the device which include ASCII string
identifying device, major compute capability, minor compute capability, global
memory available and the number of multiprocessors on the device.
Examples:
.. code-block:: python
# required: gpu
......@@ -484,7 +484,7 @@ def get_device_capability(device=None):
Return the major and minor revision numbers defining the device's compute capability which are got from CUDA function `cudaDeviceProp <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g1bf9d625a931d657e08db2b4391170f0>`_.
Parameters:
device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device.
device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device.
Returns:
tuple(int,int): the major and minor revision numbers defining the device's compute capability.
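A small sketch querying both the properties and the compute capability of the current device (GPU build assumed; the printed values depend on the hardware):

.. code-block:: python

    import paddle

    props = paddle.device.cuda.get_device_properties()         # current device
    major, minor = paddle.device.cuda.get_device_capability()  # e.g. (7, 0) on V100
    print(props, major, minor)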
......
......@@ -49,14 +49,14 @@ class LinkType(IntEnum):
class DeviceMesh(core.DeviceMesh):
r"""
The class `DeviceMesh` describes the topology of physical devices.
The class `DeviceMesh` describes the topology of physical devices.
Args:
mesh (list|numpy.array): an N-dimensional array that describes the topology
of logical processes.
dim_names (list, optional): the i-th element of this list gives the name of the
i-th dimension.
Returns:
None
......@@ -65,9 +65,9 @@ class DeviceMesh(core.DeviceMesh):
import paddle
import paddle.distributed as dist
paddle.enable_static()
mesh = dist.DeviceMesh([[2, 4, 5], [0, 1, 3]])
assert mesh.shape == [2, 3]
assert mesh.device_ids == [2, 4, 5, 0, 1, 3]
......
......@@ -901,7 +901,7 @@ class Completer:
def _complete_high_order_grad_annotation(self, serial_main_program=None):
"""
NOTE:
NOTE:
[HighOrderGrad] Complete the annotation of vars and ops only for high order gradient.
This function is temporary to support high order gradient, and will be removed in the future.
"""
......
......@@ -21,18 +21,18 @@ from ..utils import get_logger
class Converter(object):
"""
Converter is a class object for auto parallel to convert tensors from
one parallel strategy to another one. Tensors will be merged and sliced
Converter is a class object for auto parallel to convert tensors from
one parallel strategy to another one. Tensors will be merged and sliced
according to their strategies when the strategies are different.
"""
def __init__(self, tensors_dict, pre_strategy, cur_strategy):
"""
Args:
tensors_dict(dict): tensors' value of all ranks that to be converted.
tensors_dict(dict): tensors' value of all ranks that to be converted.
key is tensor's name(str), value is all ranks' data(list(numpy.ndarray))
pre_strategy(dict): tensors' distributed attribute of last training process.
key is tensor's name(str), value is tensor's distributed attribute in last
key is tensor's name(str), value is tensor's distributed attribute in last
training process.
cur_strategy(dict): tensors' distributed attribute of current rank.
key is tensor's name(str), value is tensor's distributed attribute in current
......@@ -432,7 +432,7 @@ class Converter(object):
process_group = [0, 1, 2]
slice_tensor = _slice_tensor(complete_tensor, [[], [], [2, 4]], 3)
# slice_tensor:
# slice_tensor:
# [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]
index = _get_sliced_index(rank, complete_shape, dims_mapping
......
......@@ -433,9 +433,9 @@ class CostModel(object):
def merge_linear(self):
r'''
This method does the following:
This method does the following:
If X depends on Y only, they must be run sequentially.
[ e.g. A ->- C ->- D D and E depends on C only.]
[ e.g. A ->- C ->- D D and E depends on C only.]
[ B ->-/ \->- E C depends on A and B. ]
We merge X and Y into a new node and sum up their cost time.
'''
......@@ -453,7 +453,7 @@ class CostModel(object):
r'''
This method does the following:
If a node has more than one successor, there is *branch*.
[ e.g. A ->- B ->- D ]
[ e.g. A ->- B ->- D ]
[ \->- C ->- / , B and C can be run at the same time ]
case 1: if B or C is null (or D is directly dependent on A),
it's equivalent to A->C->D or A->B->D, fall back to self.merge_linear
......@@ -789,12 +789,12 @@ def estimate_cost(distributed_program, cluster, pipeline_config,
standalone_cost_data, batch_size):
"""
Estimated cost from distributed program, cluster model and distributed settings.
Args:
distributed_program(list): list of paddle programs
cluster(Cluster): cluster model
cluster(Cluster): cluster model
standalone_cost_data(CostData): cost data given by paddle.core
batch_size(int): batch size of the training workload
batch_size(int): batch size of the training workload
pipeline_config(list): configuration of pipeline stage allocation
"""
# the following line is left for now, cluster model will be involved in the future
......
......@@ -25,11 +25,11 @@ from .utils import _linear_idx2coordinate
class DistributedTensor:
"""
DistributedTensor represents the distribution of tensor on the process group and
DistributedTensor represents the distribution of tensor on the process group and
local tensors can be created by DistributedTensor.
Only support even sharding now and uneven sharding will be supported in the future.
Local tensor information can be obtained from the DistributedTensor instance object,
or obtained by the static methods provided by DistributedTensor,
Local tensor information can be obtained from the DistributedTensor instance object,
or obtained by the static methods provided by DistributedTensor,
including shard (i.e. the index in the serial tensor), offsets, and sizes.
"""
......
......@@ -39,8 +39,8 @@ def shard_tensor(x, dist_attr=None):
x (Tensor): the tensor to be sharded.
dist_attr (dict): the tensor distributed attributes. The accepted attributes are as follow:
"process_mesh": a nested list an to describe the mesh topology of logical processes.
"dims_mapping": a list to describe the mapping between `x` and `process_mesh`, the dimension
`i` of `x` is split across the dimension `dims_mapping[i]` of `process_mesh`,
"dims_mapping": a list to describe the mapping between `x` and `process_mesh`, the dimension
`i` of `x` is split across the dimension `dims_mapping[i]` of `process_mesh`,
where -1 means that tensor dimension is not split.
Both process_mesh and dims_mapping are optional and users can specify as need.
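A hedged sketch of the dict-style dist_attr described above; the process mesh and dims_mapping values are illustrative only:

.. code-block:: python

    import paddle
    import paddle.distributed as dist

    paddle.enable_static()
    x = paddle.ones([4, 6])
    # split dim 0 of x across dim 0 of a 2x2 process mesh; dim 1 stays replicated
    dist.shard_tensor(x, dist_attr={"process_mesh": [[0, 1], [2, 3]],
                                    "dims_mapping": [0, -1]})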
......@@ -52,7 +52,7 @@ def shard_tensor(x, dist_attr=None):
import paddle
import paddle.distributed as dist
paddle.enable_static()
x = paddle.ones([4, 6])
......@@ -76,12 +76,12 @@ def shard_op(op_fn, dist_attr=None):
Args:
op_fn (callable): a callable operator or module to be sharded.
dist_attr (dict): the operator distributed attributes. The accepted attributes are classified into
two categories. The first category describes the distributed attributes shared by all inputs and
dist_attr (dict): the operator distributed attributes. The accepted attributes are classified into
two categories. The first category describes the distributed attributes shared by all inputs and
outputs, and only `process_mesh` can be specified now. The second category describes distributed
attributes for inputs or outputs same as the `dist_attr` of `shard_tensor`. All of them are
optional and users can specify them as need. Note that `process_mesh` for operators must be the
same as these process_meshes for inputs and outputs.
same as these process_meshes for inputs and outputs.
Returns:
list: the outputs of the function `op_fn`, which are annotated with distributed attributes.
......@@ -93,7 +93,7 @@ def shard_op(op_fn, dist_attr=None):
import paddle.distributed as dist
paddle.enable_static()
x = paddle.ones([4, 6])
y = paddle.zeros([4, 6])
dist_add = dist.shard_op(paddle.add,
......
......@@ -176,7 +176,7 @@ def register_distributed_operator_impl(op_type, dist_impl):
def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True):
"""
Here we just return the first compatible implementation.
Here we just return the first compatible implementation.
This will be improved by cost model in the future.
"""
op_type = dist_op.serial_op.type
......@@ -327,9 +327,9 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank):
Args:
dist_ctx (DistributedContext): dist context.
op (Operator): the current (backward) operator which might need.
act_grad_names (list): list of input activation grads variable name to the current operator.
out_grad_names (list): list of the output parameter's grads variable name of the current operator.
op (Operator): the current (backward) operator which might need.
act_grad_names (list): list of input activation grads variable name to the current operator.
out_grad_names (list): list of the output parameter's grads variable name of the current operator.
rank (int): global ranks index for current process.
"""
dp_group = None
......@@ -360,13 +360,13 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank):
def sync_and_scale_gradients(dist_ctx, op, dp_group, allreduce_var_names):
"""
insert the allreduce and scale ops for gradients of model
insert the allreduce and scale ops for gradients of model
parameters for operator in data parallelism.
Args:
dist_ctx (DistributedContext): dist context.
op (Operator): the current (backward) operator which might need.
allreduce_var_names (list): list of the parameter's grads variable name in the current operator output.
op (Operator): the current (backward) operator which might need.
allreduce_var_names (list): list of the parameter's grads variable name in the current operator output.
"""
op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
......@@ -417,14 +417,14 @@ def sync_and_scale_gradients(dist_ctx, op, dp_group, allreduce_var_names):
def gradient_synchronization(dist_ctx, op, act_grad_names, out_grad_names,
rank):
"""
conduct the allreduce and scaling (by dp size) for gradients of model
conduct the allreduce and scaling (by dp size) for gradients of model
parameters for operator in data parallelism.
Args:
dist_ctx (DistributedContext): dist context.
op (Operator): the current (backward) operator which might need.
act_grad_names (list): list of input activation grads variable name to the current operator.
out_grad_names (list): list of the output parameter's grads variable name of the current operator.
op (Operator): the current (backward) operator which might need.
act_grad_names (list): list of input activation grads variable name to the current operator.
out_grad_names (list): list of the output parameter's grads variable name of the current operator.
rank (int): global ranks index for current process.
"""
......
......@@ -57,9 +57,9 @@ class AutoParallelizer:
AutoParallelizer is the main controller class to do the auto parallel process.
And the auto parallel process will be triggered in the wrapped parallelize function.
To facilitate the auto parallelization, it will contain information about program, cluster and the
related context. In this basic version, the program information will be retrieved from the
related context. In this basic version, the program information will be retrieved from the
Fleet object, and the cluster information can be retrieved from the newly created Cluster object,
and the context information can be retrieved from the newly created DistributedContext.
and the context information can be retrieved from the newly created DistributedContext.
"""
def __init__(self, fleet):
......
......@@ -39,7 +39,7 @@ class Partitioner(object):
warning:: Partitioner is experimental and subject to change.
Partitioner convert a program into another program.
Given a serial program which has been auto completed with shard annotation, the Partitioner
Given a serial program which has been auto completed with shard annotation, the Partitioner
convert the serial program into a "distributed" program. The Partitioner will modify the serial
program in following two ways, which is also the major difference between serial and distributed program:
1. partition op: replace a serial op into its corresponding dist op inferred from the shard annotation
......
......@@ -38,7 +38,7 @@ def _flatten_nested_list(nested_list):
class ProcessMesh(object):
r"""
The class `ProcessMesh` describes the topology of logical processes.
The class `ProcessMesh` describes the topology of logical processes.
A mesh is an N-dimensional array. The shape of the N-dimensional
array represents the topology of logical processes and every
element of the N-dimensional array represent a logical process. For
......@@ -52,9 +52,9 @@ class ProcessMesh(object):
Args:
mesh (list): an N-dimensional array (nested list) that describes the topology
of logical processes. The shape of the N-dimensional array
represents the topology of logical processes and every
represents the topology of logical processes and every
element of the N-dimensional array represents a logical process.
Returns:
None
......@@ -66,9 +66,9 @@ class ProcessMesh(object):
import paddle
import paddle.distributed as dist
paddle.enable_static()
mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
assert mesh.topology == [2, 3]
assert mesh.processes == [2, 4, 5, 0, 1, 3]
......
......@@ -19,14 +19,14 @@ from paddle.fluid import core
class ProcessMesh(core.ProcessMesh):
r"""
The class `ProcessMesh` describes the topology of logical processes.
The class `ProcessMesh` describes the topology of logical processes.
Args:
mesh (list|numpy.array): an N-dimensional array that describes the topology
of logical processes.
dim_names (list, optional): the i-th element of this list gives the name of the
i-th dimension.
Returns:
None
......@@ -35,9 +35,9 @@ class ProcessMesh(core.ProcessMesh):
import paddle
import paddle.distributed as dist
paddle.enable_static()
mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
assert mesh.shape == [2, 3]
assert mesh.process_ids == [2, 4, 5, 0, 1, 3]
......
......@@ -23,12 +23,12 @@ from .trial import OptimizationTunerTrial as Trial
class AlgorithmBase(ABC):
"""
A tuning algorithm is a class to find an optimal configuration
given the selected tuning optimization pass(es) and the arguments to be tuned.
A tuning algorithm is a class to find an optimal configuration
given the selected tuning optimization pass(es) and the arguments to be tuned.
Different optimization pass(es) will correspond to a different algorithm,
where different search space **pruning rules** will be applied.
In other words, the key "algorithm" for this class is the
In other words, the key "algorithm" for this class is the
set of search space pruning rules specific to the given optimization scenario.
"""
_REGISTERED_ALGORITHMS = {}
......@@ -52,9 +52,9 @@ class AlgorithmBase(ABC):
def collect_model_info(self, main_prog, startup_prog):
"""
Collect the static model info (from programs) that could be used to
prune candidate trials and save tuning time. For instance,
model info like the number of model parameters and activation memory could be
Collect the static model info (from programs) that could be used to
prune candidate trials and save tuning time. For instance,
model info like the number of model parameters and activation memory could be
used to prune candidate trials and decide the next trial.
"""
pass
......@@ -70,7 +70,7 @@ class AlgorithmBase(ABC):
@abstractmethod
def update(self, results):
"""
Update the algorithm with the results of the last trial. This information is used to
Update the algorithm with the results of the last trial. This information is used to
prune the search space for future trials.
"""
pass
......
......@@ -33,7 +33,7 @@ class TuningConfig(object):
"""
A uniform config wrap:
distributed strategy: the user defined configuration for optimization pass
tuning config: configuration for the tuning process: mode (profile or cost model), log dir, extra tuning config for optimization like search range for specific
tuning config: configuration for the tuning process: mode (profile or cost model), log dir, extra tuning config for optimization like search range for specific
"""
def __init__(self, user_config, strategy):
......
......@@ -161,7 +161,7 @@ def _copy_context(ref_dist_context):
class OptimizationTuner:
"""
OptimizationTuner is used to manage the tuning procedure of hyper-parameters (configs)
OptimizationTuner is used to manage the tuning procedure of hyper-parameters (configs)
of Optimization Pass in AutoParallel.
"""
......@@ -466,7 +466,7 @@ class OptimizationTuner:
Return the best optimization configuration found in the tuning.
Returns:
A object of fleet.DistributedStrategy with best configuration.
A object of fleet.DistributedStrategy with best configuration.
"""
assert self._best_iter >= 0, "The best configuration is not found yet !"
best_trial = self._finished_trials[self._best_iter]
......@@ -481,7 +481,7 @@ class OptimizationTuner:
summary_ = """
Tuning Result Summary
Run total {} trials with {} min.
The best trial is: [{}], whose configuration is following:
The best trial is: [{}], whose configuration is following:
""".format(len(self._finished_trials),
(time.time() - self._tuning_start_time) / 60,
best_trial.name)
......@@ -508,8 +508,8 @@ The best trial is: [{}], whose configuration is following:
def tune(self):
"""
Performs the search for the best hyperparameter configurations
for the selected optimization pass(es).
Performs the search for the best hyperparameter configurations
for the selected optimization pass(es).
"""
# step1: collect model info which might be used for
......
......@@ -171,7 +171,7 @@ def print_program_with_dist_attr(program, dist_context=None):
def _get_comm_group(processes, shape, axis, rank):
"""
Given a rank and the processes mesh the rank belongs to,
Given a rank and the processes mesh the rank belongs to,
compute the communication peers of the rank based on the give axis in the mesh.
Example: 16 processes managed in a 4-dimensional mesh with a shape of [2, 2, 2, 2].
......@@ -205,7 +205,7 @@ def _get_comm_group(processes, shape, axis, rank):
def _get_idx_in_axis(processes, shape, axis, rank):
"""
Given a rank and the processes mesh the rank belongs to,
Given a rank and the processes mesh the rank belongs to,
compute the index of the rank in given axis.
Example: 27 processes managed in a 3-dimensional mesh with a shape of [3, 3, 3].
......@@ -226,20 +226,20 @@ def _coordinate2linear_idx(mesh_shape, coordinate):
"""
convert a coordinate in multidimensional mesh space into a scalar idx in linear space.
it uses row-major order for dimension conversion.
it uses row-major order for dimension conversion.
so it has: [most_significant_dim, ..., least_significant_dim]
assume:
assume:
the size of i-th dimension to be: S[i]
the index of j-th dimension is: I[j]
linear_idx of a n dimensional coordinate is:
linear_idx of a n dimensional coordinate is:
I[n-1] * (S[n-2] * S[n-3] * S[n-4] * .... S[0]) +
I[n-2] * ( S[n-3] * S[n-4] * .... S[0]) +
I[n-3] * ( S[n-4] * .... S[0]) +
I[n-2] * ( S[n-3] * S[n-4] * .... S[0]) +
I[n-3] * ( S[n-4] * .... S[0]) +
...
I[1] * ( S[0]) +
I[1] * ( S[0]) +
I[0]
"""
......@@ -279,7 +279,7 @@ def _linear_idx2coordinate(mesh_shape, linear_idx):
map a linear scalar into multidimensional mesh space and return its coordinate in that space.
it is the inverse function of _coordinate2linear_idx.
assume:
assume:
the size of i-th dimension to be: S[i]
the index of j-th dimension is: I[j]
......@@ -460,8 +460,8 @@ def save_distributed_checkpoint(program,
addition_info=None,
is_integrated=False,
dist_context=None):
"""
Save model parameter state, optimizer state, distributed attribute and
"""
Save model parameter state, optimizer state, distributed attribute and
additional information of each rank.
Args:
......@@ -502,7 +502,7 @@ def save_distributed_checkpoint(program,
def load_distributed_checkpoint(checkpoint_path, dist_attr_path):
"""
"""
Load parameter, optimizer, distributed attribute and addition_info.
Args:
......@@ -512,7 +512,7 @@ def load_distributed_checkpoint(checkpoint_path, dist_attr_path):
Returns:
param_dict(dict): parameters' value of all ranks.
dist_attr(dict): parameters' distributed attribute.
addition_info(dict): additional information user saved in last training.
addition_info(dict): additional information user saved in last training.
Notes:
The return, 'addition_info', is belonging to the first file of checkpoint_path by default.
......@@ -520,9 +520,9 @@ def load_distributed_checkpoint(checkpoint_path, dist_attr_path):
Examples:
.. code-block:: python
ckpt_path = ['./model_state_rank0.pdmodel',
ckpt_path = ['./model_state_rank0.pdmodel',
'./model_state_rank1.pdmodel']
dist_attr_path = ['./dist_attr_rank0.pdattr',
dist_attr_path = ['./dist_attr_rank0.pdattr',
'./dist_attr_rank1.pdattr']
param_dict, dist_attr, add_info = load_distributed_checkpoint(ckpt_path, dist_attr_path)
"""
......@@ -542,7 +542,7 @@ def load_checkpoint_into_program(checkpoint_path,
dist_attr_path,
program,
dist_context=None):
"""
"""
Load parameter, optimizer, distributed attribute and addition_info into model.
Args:
......@@ -553,7 +553,7 @@ def load_checkpoint_into_program(checkpoint_path,
Returns:
addition_info(dict): user saved in last train.
Notes:
The return, 'addition_info', is belonging to the first file of checkpoint_path by default.
......@@ -561,9 +561,9 @@ def load_checkpoint_into_program(checkpoint_path,
.. code-block:: python
exe.run(startup_program)
ckpt_path = ['./model_state_rank0.pdmodel',
ckpt_path = ['./model_state_rank0.pdmodel',
'./model_state_rank1.pdmodel']
dist_attr_path = ['./dist_attr_rank0.pdattr',
dist_attr_path = ['./dist_attr_rank0.pdattr',
'./dist_attr_rank1.pdattr']
load_checkpoint_into_program(ckpt_path, dist_attr_path, main_program)
"""
......@@ -590,7 +590,7 @@ def load_checkpoint_into_program(checkpoint_path,
def load_parameter_into_program(param_dict, program):
"""
"""
Load parameters into program.
Args:
......@@ -672,7 +672,7 @@ def _load_distributed_state_dict(checkpoint_path):
def get_dist_attr(program, dist_context=None):
"""
"""
Get distributed attribute of current rank.
Args:
......@@ -935,7 +935,7 @@ def _get_sliced_param_index(rank, complete_shape, dims_mapping, process_shape,
process_group = [0, 1, 2]
slice_param = _slice_parameter(complete_param, [[], [], [2, 4]], 3)
# slice_param:
# slice_param:
# [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]
index = _get_sliced_param_index(rank, complete_shape, dims_mapping
......
......@@ -579,10 +579,10 @@ def destroy_process_group(group=None):
Destroy a given group for communication
Args:
group (ProcessGroup, optional): The group to be destroyed. All of process groups, including
the default group, will be destroyed and the distributed
group (ProcessGroup, optional): The group to be destroyed. All of process groups, including
the default group, will be destroyed and the distributed
environment will be deinitialized.
Returns : None
Examples:
......@@ -776,7 +776,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True):
Reduce a tensor over all ranks so that all get the result.
As shown below, one process is started with a GPU and the data of this process is represented
by its group rank. The reduce operator is sum. Through all_reduce operator,
by its group rank. The reduce operator is sum. Through all_reduce operator,
each GPU will have the sum of the data from all GPUs.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/allreduce.png
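A hedged two-GPU sketch of the behaviour described above (launched with paddle.distributed.launch; the script name is a placeholder):

.. code-block:: python

    # e.g. python -m paddle.distributed.launch --gpus 0,1 allreduce_demo.py
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    # each rank starts from its own rank id; after all_reduce every rank holds the sum
    data = paddle.to_tensor([float(dist.get_rank())])
    dist.all_reduce(data)        # op defaults to ReduceOp.SUM
    print(data.numpy())          # [1.] on both ranks when world size is 2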
......@@ -1662,10 +1662,10 @@ def _parallel_linear(x,
"""
Parallel Linear
axis is the dimension of the parameter of the linear layer.
axis is the dimension of the parameter of the linear layer.
axis = 0: the row dimension
axis = 1: the col dimension
"""
if group is not None and not group.is_member():
return
......@@ -1840,7 +1840,7 @@ def split(x,
of which is a matrix with N/num_partitions rows and M column.
The linear layer put on single card is shown as below, the input variable is represented by X,
the weight matrix is represented by W and the output variable is O. The linear layer on a single card is
the weight matrix is represented by W and the output variable is O. The linear layer on a single card is
simple matrix multiplication operation, O = X * W.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_single.png
......@@ -1863,14 +1863,14 @@ def split(x,
of which is a matrix with N rows and M/num_partitions column.
The linear layer put on single card has been illustrated on case 2 and Column Parallel Linear
is shown as below. The Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and
these split matrices respectively multiply the input. Finally, apply AllGather on the output from each card to get the final output.
is shown as below. The Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and
these split matrices respectively multiply the input. Finally, apply AllGather on the output from each card to get the final output.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col.png
:width: 800
:alt: split_col
:align: center
As observed, the column parallel linear and row parallel linear can be combined to skip one ALLGATHER communication
operator. Furthermore, the Attention and MLP can be combined to improve the performance as shown below.
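A pure-NumPy check of why the column-wise split reproduces the single-card result; this only illustrates the math and does not call paddle.distributed.split itself:

.. code-block:: python

    import numpy as np

    X = np.random.rand(4, 8)             # input
    W = np.random.rand(8, 6)             # full weight matrix
    W_col1, W_col2 = W[:, :3], W[:, 3:]  # split along the column dimension

    O_full = X @ W
    O_parallel = np.concatenate([X @ W_col1, X @ W_col2], axis=1)  # the "AllGather" step
    assert np.allclose(O_full, O_parallel)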
......@@ -2019,10 +2019,10 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True):
data type of the input Tensors.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True.
Returns:
None.
Examples:
.. code-block:: python
......@@ -2116,16 +2116,16 @@ def alltoall_single(in_tensor,
Args:
in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8 or bool.
out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor.
in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor``
in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor``
must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None.
out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor``
out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor``
must be divisible by group size and ``out_tensor`` will be gathered averagely from all participators. Default: None.
group (Group, optional): The group instance return by ``new_group`` or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True.
Returns:
None, if ``use_calc_stream`` is set to ``True``; ``Task`` of ``group``, if ``use_calc_stream`` is set to ``False``.
Examples:
.. code-block:: python
......@@ -2207,7 +2207,7 @@ def send(tensor, dst=0, group=None, use_calc_stream=True):
dst (int): The destination rank id.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculate stream or communication stream. Default: True.
Returns:
None.
......@@ -2272,7 +2272,7 @@ def recv(tensor, src=0, group=None, use_calc_stream=True):
src (int): The source rank id.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculate stream or communication stream. Default: True.
Returns:
None.
......@@ -2353,11 +2353,11 @@ def isend(tensor, dst, group=None):
should be float16, float32, float64, int32, int64, int8, uint8 or bool.
dst (int): The destination rank.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
Returns:
A distributed task object.
Warning:
Warning:
This API only supports the dygraph mode.
Examples:
......@@ -2407,7 +2407,7 @@ def irecv(tensor, src=None, group=None):
Returns:
A distributed task object.
Warning:
Warning:
This API only supports the dygraph mode.
Examples:
......@@ -2456,7 +2456,7 @@ class P2POp(object):
The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``.
tensor (Tensor): Tensor to send or receive.
peer (int): The destination or source rank.
group (Group, optional): The group instance return by new_group or None for global
group (Group, optional): The group instance return by new_group or None for global
default group. Default: None.
"""
......@@ -2505,7 +2505,7 @@ def batch_isend_irecv(p2p_op_list):
"""
Send or Receive a batch of tensors asynchronously and return a list of requests.
Process each of the point-to-point operations in ``p2p_op_list`` and return the
Process each of the point-to-point operations in ``p2p_op_list`` and return the
corresponding tasks. NCCL are currently supported.
Args:
......@@ -2516,9 +2516,9 @@ def batch_isend_irecv(p2p_op_list):
Returns:
A list of distributed tasks returned by calling the corresponding
op in the op_list.
op in the op_list.
Warning:
Warning:
This API only supports the dygraph mode.
Examples:
......@@ -2546,7 +2546,7 @@ def batch_isend_irecv(p2p_op_list):
for task in tasks:
task.wait()
print(recv_t)
# paddle.tensor([1, 2]) # Rank-0
# paddle.tensor([0, 1]) # Rank-1
......@@ -2587,15 +2587,15 @@ def reduce_scatter(tensor,
tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type
should be float16, float32, float64, int32, int64, int8, uint8 or bool.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
group (Group, optional): The group instance return by new_group or None for global
group (Group, optional): The group instance return by new_group or None for global
default group. Default: None.
use_calc_stream (bool, optional): Whether this op should be an async op.
Returns:
Async task handle, if use_calc_stream is set to False.
None, if use_calc_stream or if not part of the group.
Warning:
Warning:
This API only supports the dygraph mode.
......@@ -2652,7 +2652,7 @@ def _reduce_scatter_base(output,
Args:
output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool.
input (Tensor): Input tensor that is of size output tensor size times world size. Its data type
input (Tensor): Input tensor that is of size output tensor size times world size. Its data type
should be float16, float32, float64, int32, int64, int8, uint8 or bool.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
group (ProcessGroup, optional): The process group to work on. If None,
......
......@@ -114,12 +114,12 @@ class DistributedStrategy(object):
"""
DistributedStrategy is the main configuration entry for distributed training of Paddle.
All of the distributed training configurations can be configured in DistributedStrategy,
such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
asynchronous update parameter server(ASGD), etc.
DistributedStrategy can be serialized into protobuf file or deserialized from protobuf file
Users who run local training usually configure BuildStrategy and ExecutionStrategy, and
Users who run local training usually configure BuildStrategy and ExecutionStrategy, and
DistributedStrategy supports configurations from BuildStrategy and ExecutionStrategy
"""
......@@ -290,7 +290,7 @@ class DistributedStrategy(object):
def a_sync(self):
"""
Indicating whether we are using asynchronous stochastic gradient descent updates
for training. This property is valid when we are using parameter server training,
for training. This property is valid when we are using parameter server training,
which is implied by setting an appropriate RoleMaker
Default value: True
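A minimal sketch of toggling this property (parameter-server mode assumed; the strategy is then passed to fleet.distributed_optimizer as usual):

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = False   # synchronous updates; True (the default) means async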
......@@ -372,7 +372,7 @@ class DistributedStrategy(object):
@property
def trainer_desc_configs(self):
"""
Set trainer desc configurations.
Set trainer desc configurations.
**Notes**:
dump_fields_path(str): the path of dump fields
......@@ -381,7 +381,7 @@ class DistributedStrategy(object):
dump_param(list(str)): the param that you want to dump
stat_var_names(list(str)):
stat_var_names(list(str)):
Examples:
......@@ -443,12 +443,12 @@ class DistributedStrategy(object):
@property
def fs_client_param(self):
"""
Set fs client configurations.
Set fs client configurations.
**Notes**:
uri(str): the uri of fs client
user(str): the user_name of fs client
passwd(str): the passwd of fs client
hadoop_bin(str):
hadoop_bin(str):
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1001,15 +1001,15 @@ class DistributedStrategy(object):
@property
def last_comm_group_size_MB(self):
"""
Specifying the size of gradient to fuse in Mega-Bytes when
the last group of each batch communicates. Making the last group
small is useful to improve performance.
Specifying the size of gradient to fuse in Mega-Bytes when
the last group of each batch communicates. Making the last group
small is useful to improve performance.
Default value: 1
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.last_comm_group_size_MB = 2
......@@ -1027,7 +1027,7 @@ class DistributedStrategy(object):
@property
def find_unused_parameters(self):
"""
Indicating whether we are using find_unused_parameters to
Indicating whether we are using find_unused_parameters to
find unused parameters in DataParallel.
Default value: False
......@@ -1104,20 +1104,20 @@ class DistributedStrategy(object):
@property
def recompute_configs(self):
"""
Set recompute configurations.
Set recompute configurations.
**Note**:
checkpoints(list): list of string name of checkpoints. In general, the recompute
strategy of current implementation should have some manually assign checkpoints.
enable_offload(bool): enable recompute checkpoints offload feature. this feature
enable_offload(bool): enable recompute checkpoints offload feature. this feature
will offload the checkpoint to host memory to allow even larger batch size. since
the memcpy from host to device takes time, it is a trade off between larger batch
size and training speed.
checkpoint_shape(list): list of int that specifies the shape of checkpoints. So far
recompute-offload requires all checkpoints to be of the same shape, and every dimension
specified here should be determined ("-1" is not allowed).
specified here should be determined ("-1" is not allowed).
Examples:
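A hedged sketch of the keys listed in the note above; the checkpoint names are placeholders for real activation variable names:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.recompute = True
    strategy.recompute_configs = {
        "checkpoints": ["act_0_placeholder", "act_1_placeholder"],
        "enable_offload": True,
        "checkpoint_shape": [100, 512, 1024],
    }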
......@@ -1145,7 +1145,7 @@ class DistributedStrategy(object):
def sharding(self):
"""
Indicating whether we are using sharding Optimizer for memory
optimization. We implement the sharding optimizer following the ZeRO-DP
optimization. We implement the sharding optimizer following the ZeRO-DP
idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054).
Model parameters and Optimizer State are sharded into different ranks allowing to fit larger model.
......@@ -1174,26 +1174,26 @@ class DistributedStrategy(object):
@property
def sharding_configs(self):
"""
Set sharding configurations.
Set sharding configurations.
**Note**:
sharding_segment_strategy(string, optional): strategy used to segment the program (forward & backward operations). Two strategies are
available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and
sharding_segment_strategy(string, optional): strategy used to segment the program (forward & backward operations). Two strategies are
available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and
communication. Default is segment_broadcast_MB.
segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and
segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and
after every segment_broadcast_MB of parameters is broadcast, the program will be cut into one segment.
This configuration will affect the communication speed in sharding training, and should be an empirical value decided by your model size and network topology.
Only enable when sharding_segment_strategy = segment_broadcast_MB. Default is 32.0 .
segment_anchors(list): list of anchors used to segment the program, which allows a finer control of program segmentation.
segment_anchors(list): list of anchors used to segment the program, which allows a finer control of program segmentation.
this strategy is experimental by now. Only enable when sharding_segment_strategy = segment_anchors.
sharding_degree(int, optional): specific the number of gpus within each sharding parallelism group; and sharding will be turn off if sharding_degree=1. Default is 8.
gradient_merge_acc_step(int, optional): specific the accumulation steps in gradient merge; and gradient merge will be turn off if gradient_merge_acc_step=1. Default is 1.
optimize_offload(bool, optional): enable the optimizer offload, which will offload the moment vars to Host memory in order to save GPU memory for fitting a larger model.
optimize_offload(bool, optional): enable the optimizer offload, which will offload the moment vars to Host memory in order to save GPU memory for fitting a larger model.
the moment vars will be prefetched from and offloaded to Host memory during the update stage. It is a strategy that trades off between training speed and GPU memory, and is recommended to be turned on only when gradient_merge_acc_step is large, where
the number of update stages will be relatively small compared with forward & backward's. Default is False.
......@@ -1203,7 +1203,7 @@ class DistributedStrategy(object):
pp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1.
pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from the backward stage to the update (optimize) stage when pipeline parallelism is on.
pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from the backward stage to the update (optimize) stage when pipeline parallelism is on.
This configuration will affect the communication speed of Hybrid parallelism training depending on the network topology. This strategy is experimental for now. Default is False.
optimize_cast(bool, optional): [Hybrid parallelism ONLY] Move the cast op of AMP which cast fp32 param to fp16 param to optimizer. optimize_cast will persist fp16 param, it
......@@ -1385,11 +1385,11 @@ class DistributedStrategy(object):
"""
Set pipeline parallelism configurations. In pipeline parallelism,
different parts of neural networks are running on different GPUS.
There are Tensor queue buffers between each pair of neighboring GPUs
There are Tensor queue buffers between each pair of neighboring GPUs
that are responsible for synchronizing hidden Tensor results between
GPUs. Pipeline parallelism consists of several producer-consumer style
hardware pairs, such as GPU-GPU, CPU-GPU, GPU-XPU. The best way to speedup
pipeline parallelism is to make the size of Tensor in Tensor queue smaller,
pipeline parallelism is to make the size of Tensor in Tensor queue smaller,
so that we will have a faster producer for downstream consumers.
**Notes**:
......@@ -1475,7 +1475,7 @@ class DistributedStrategy(object):
@property
def hybrid_configs(self):
"""
Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism
Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism
needs to meet the following relationships
total_number_GPUs = dp_degree * mp_degree * pp_degree
......@@ -1483,7 +1483,7 @@ class DistributedStrategy(object):
**Note**:
dp_degree(int): set number of GPUs in a data parallel group. Default -1.
This value should be an integer greater than 0.
If it is not set, or set to -1, its value will be inferred
If it is not set, or set to -1, its value will be inferred
based on the total number of cards.
mp_degree(int): set number of GPUs in a model parallel group. Default 1
pp_degree(int): set number of GPUs in a pipeline parallel group. Default 1
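A hedged sketch for an 8-GPU job split as 2 (dp) x 2 (mp) x 2 (pp), which satisfies the relationship total_number_GPUs = dp_degree * mp_degree * pp_degree stated above:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": 2,   # data parallel groups
        "mp_degree": 2,   # model (tensor) parallel groups
        "pp_degree": 2,   # pipeline parallel stages
    }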
......@@ -1567,7 +1567,7 @@ class DistributedStrategy(object):
def adaptive_localsgd(self):
"""
Indicating whether we are using Adaptive Local SGD training. Default Value: False
For more details, please refer to `Adaptive Communication Strategies to Achieve
For more details, please refer to `Adaptive Communication Strategies to Achieve
the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
......@@ -1770,8 +1770,8 @@ class DistributedStrategy(object):
@property
def lars(self):
"""
Set lars configurations. lars is used to deal with the convergence problems when the global
batch size is larger than 8k. For more details, please refer to
Set lars configurations. lars is used to deal with the convergence problems when the global
batch size is larger than 8k. For more details, please refer to
[Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
Default Value: False
......@@ -1802,8 +1802,8 @@ class DistributedStrategy(object):
**Notes**:
**lars_coeff (float)**: trust ratio in lars formula.
**lars_weight_decay** (float): weight decay coefficient in lars formula.
**epsilon (float)**: argument used to avoid potential division-by-zero
when computing the local lr;
**epsilon (float)**: argument used to avoid potential division-by-zero
when computing the local lr;
**exclude_from_weight_decay ([string])**: a list of name strings of layers which
will be excluded from weight decay in the lars formula.
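A hedged sketch of the LARS knobs listed above; the values are illustrative, not recommendations:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.lars = True
    strategy.lars_configs = {
        "lars_coeff": 0.001,
        "lars_weight_decay": 0.0005,
        "epsilon": 0,
        "exclude_from_weight_decay": ["batch_norm", ".b"],
    }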
......@@ -1832,9 +1832,9 @@ class DistributedStrategy(object):
@property
def lamb(self):
"""
Set lamb configurations. lamb is used to deal with the convergence problems for large
batch size training, especially for attention-related models like BERT. For more details,
please refer to
Set lamb configurations. lamb is used to deal with the convergence problems for large
batch size training, especially for attention-related models like BERT. For more details,
please refer to
[Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).
Default Value: False
......@@ -1908,7 +1908,7 @@ class DistributedStrategy(object):
def auto(self):
"""
Indicating whether we are using auto-parallel configuration
This feature is currently an experimental feature. Currently,
This feature is currently an experimental feature. Currently,
auto-parallelism can be used only when a user does not set any other
strategy configs except auto. For details, please reference the following
code example
......@@ -1943,7 +1943,7 @@ class DistributedStrategy(object):
def semi_auto(self):
"""
Indicating whether we are using semi-auto parallel function
This feature is currently an experimental feature. Currently,
This feature is currently an experimental feature. Currently,
auto-parallelism can be used only when a user does not set any other
strategy configs except semi-auto. For details, please reference the following
code example
......@@ -2047,7 +2047,7 @@ class DistributedStrategy(object):
activation_bits(int): quantization bit number for activation. Default is 8.
not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope,
not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope,
the corresponding op will not be quantized.
algo(str): Other quantization training algorithm.
......
......@@ -24,11 +24,11 @@ def wait_server_ready(endpoints):
"""
Wait until parameter servers are ready, use connect_ex to detect
port readiness.
Args:
endpoints (list|tuple): endpoints string list, like:
["127.0.0.1:8080", "127.0.0.1:8081"]
Examples:
.. code-block:: python
......
......@@ -750,7 +750,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
def _get_previous_trainers(self):
"""
invoked by heter worker
invoked by heter worker
"""
if not self._role_is_generated:
self._generate_role()
......@@ -761,7 +761,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
def _get_next_trainers(self):
"""
invoked by heter worker
invoked by heter worker
"""
if not self._role_is_generated:
self._generate_role()
......
......@@ -116,7 +116,7 @@ class StrategyCompiler(StrategyCompilerBase):
"""
StrategyCompiler is responsible for meta optimizers combination
Generally, a user can define several distributed strategies that
can generate several meta optimizers. The combination of these
can generate several meta optimizers. The combination of these
meta optimizers should have the right order to apply the optimizers'
minimize function.
This class is responsible for the executable distributed optimizer
......@@ -162,7 +162,7 @@ class StrategyCompiler(StrategyCompilerBase):
"""
Meta Optimizer Type A: rewrite forward, backward. e.g. recompute, async, sync, pipeline.
results will be split into async, sync, pipeline
Meta Optimizer Type B: rewrite forward,
Meta Optimizer Type B: rewrite forward,
e.g. AMP and the corresponding backward is generated by rewritten forward
Meta Optimizer Type C: rewrite backward. e.g. gradient fusion
Meta Optimizer Type D: rewrite optimize. e.g. lars, lamb, localsgd, gradient merge, dgc
......
......@@ -32,7 +32,7 @@ class ParallelMode(object):
- DATA_PARALLEL: Distribute input data to different devices.
- TENSOR_PARALLEL: Shards tensors in the network to different devices.
- PIPELINE_PARALLEL: Place different layers of the network on different devices.
- SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states
- SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states
corresponding to the parameters to each device.
Examples:
......
......@@ -286,7 +286,7 @@ class UtilBase(object):
def print_on_rank(self, message, rank_id):
"""
Worker of rank `rank_id` prints some message.
Worker of rank `rank_id` prints some message.
Args:
message(str): Log to be printed.
......
......@@ -22,7 +22,7 @@ class DataGenerator(object):
"""
DataGenerator is a general Base class for user to inherit
A user who wants to define his/her own python processing logic
with paddle.distributed.InMemoryDataset/QueueDataset should
with paddle.distributed.InMemoryDataset/QueueDataset should
inherit this class.
"""
......@@ -96,7 +96,7 @@ class DataGenerator(object):
def run_from_stdin(self):
'''
This function reads the data row from stdin, parses it with the
process function, and further parses the return value of the
process function, and further parses the return value of the
process function with the _gen_str function. The parsed data will
be written to stdout and the corresponding protofile will be
generated.
......@@ -152,7 +152,7 @@ class DataGenerator(object):
def generate_sample(self, line):
'''
This function needs to be overridden by the user to process the
This function needs to be overridden by the user to process the
original data row into a list or tuple.
Args:
......@@ -160,8 +160,8 @@ class DataGenerator(object):
Returns:
Returns the data processed by the user.
The data format is list or tuple:
[(name, [feasign, ...]), ...]
The data format is list or tuple:
[(name, [feasign, ...]), ...]
or ((name, [feasign, ...]), ...)
For example:
......@@ -290,7 +290,7 @@ class MultiSlotDataGenerator(DataGenerator):
and updating proto_info information.
The input line will be in this format:
>>> [(name, [feasign, ...]), ...]
>>> [(name, [feasign, ...]), ...]
>>> or ((name, [feasign, ...]), ...)
The output will be in this format:
>>> [ids_num id1 id2 ...] ...
......
......@@ -46,7 +46,7 @@ class DatasetBase(object):
fs_ugi="",
download_cmd="cat"):
"""
should be called only once in user's python scripts to initialize settings of the dataset instance.
should be called only once in user's python scripts to initialize settings of the dataset instance.
Normally, it is called by InMemoryDataset or QueueDataset.
Args:
......@@ -341,7 +341,7 @@ class DatasetBase(object):
class InMemoryDataset(DatasetBase):
"""
:api_attr: Static Graph
It will load data into memory and shuffle data before training.
Examples:
......@@ -376,8 +376,8 @@ class InMemoryDataset(DatasetBase):
Args:
kwargs: Keyword arguments. Currently, we support following keys in **kwargs:
merge_size(int): ins size to merge, if merge_size > 0, set merge by line id,
instances of same line id will be merged after shuffle,
merge_size(int): ins size to merge, if merge_size > 0, set merge by line id,
instances of same line id will be merged after shuffle,
you should parse line id in data generator. default is -1.
parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False.
parse_content(bool): Set if Dataset need to parse content. default is False.
......@@ -404,7 +404,7 @@ class InMemoryDataset(DatasetBase):
parse_content=True,
fea_eval=True,
candidate_size=10000)
"""
merge_size = kwargs.get("merge_size", -1)
if merge_size > 0:
......@@ -449,8 +449,8 @@ class InMemoryDataset(DatasetBase):
data_feed_type(str): data feed type used in c++ code. default is "MultiSlotInMemoryDataFeed".
queue_num(int): Dataset output queue num, training threads get data from queues. default is-1, which is set same as thread number in c++.
merge_size(int): ins size to merge, if merge_size > 0, set merge by line id,
instances of same line id will be merged after shuffle,
merge_size(int): ins size to merge, if merge_size > 0, set merge by line id,
instances of same line id will be merged after shuffle,
you should parse line id in data generator. default is -1.
parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False.
parse_content(bool): Set if Dataset need to parse content. default is False.
......@@ -463,7 +463,7 @@ class InMemoryDataset(DatasetBase):
Examples:
.. code-block:: python
import paddle
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
......@@ -479,7 +479,7 @@ class InMemoryDataset(DatasetBase):
fea_eval=True,
candidate_size=10000)
dataset.update_settings(batch_size=2)
"""
for key in kwargs:
if key == "pipe_command":
......@@ -515,10 +515,10 @@ class InMemoryDataset(DatasetBase):
:api_attr: Static Graph
should be called only once in user's python scripts to initialize settings of the dataset instance
Args:
kwargs: Keyword arguments. Currently, we support following keys in **kwargs:
batch_size(int): batch size. It will be effective during training. default is 1.
thread_num(int): thread num, it is the num of readers. default is 1.
use_var(list): list of variables. Variables which you will use. default is [].
......@@ -561,7 +561,7 @@ class InMemoryDataset(DatasetBase):
dataset.set_filelist(
["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
dataset.load_into_memory()
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
startup_program = paddle.static.Program()
......@@ -569,7 +569,7 @@ class InMemoryDataset(DatasetBase):
exe.run(startup_program)
exe.train_from_dataset(main_program, dataset)
os.remove("./test_queue_dataset_run_a.txt")
os.remove("./test_queue_dataset_run_b.txt")
......@@ -831,7 +831,7 @@ class InMemoryDataset(DatasetBase):
def load_into_memory(self, is_shuffle=False):
"""
:api_attr: Static Graph
Load data into memory
Args:
......@@ -842,7 +842,7 @@ class InMemoryDataset(DatasetBase):
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
......@@ -1035,7 +1035,7 @@ class InMemoryDataset(DatasetBase):
def release_memory(self):
"""
:api_attr: Static Graph
Release InMemoryDataset memory data, when data will not be used again.
Examples:
......@@ -1043,7 +1043,7 @@ class InMemoryDataset(DatasetBase):
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
......@@ -1144,7 +1144,7 @@ class InMemoryDataset(DatasetBase):
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
......@@ -1180,13 +1180,13 @@ class InMemoryDataset(DatasetBase):
"""
set fea eval mode for slots shuffle to debug the importance level of
slots(features), fea_eval need to be set True for slots shuffle.
Args:
record_candidate_size(int): size of instances candidate to shuffle
record_candidate_size(int): size of instances candidate to shuffle
one slot
fea_eval(bool): whether enable fea eval mode to enable slots shuffle.
default is True.
Examples:
.. code-block:: python
......@@ -1202,12 +1202,12 @@ class InMemoryDataset(DatasetBase):
def slots_shuffle(self, slots):
"""
Slots Shuffle
Slots Shuffle is a shuffle method at the slots level, which is usually used
Slots Shuffle
Slots Shuffle is a shuffle method at the slots level, which is usually used
in sparse features with a large scale of instances. Compare the metric, i.e.
auc, while doing slots shuffle on one or several slots with a baseline to
auc, while doing slots shuffle on one or several slots with a baseline to
evaluate the importance level of slots (features).
Args:
slots(list[string]): the set of slots(string) to do slots shuffle.
......@@ -1216,7 +1216,7 @@ class InMemoryDataset(DatasetBase):
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset._init_distributed_settings(fea_eval=True)
slots = ["slot1", "slot2", "slot3", "slot4"]
......@@ -1442,7 +1442,7 @@ class BoxPSDataset(InMemoryDataset):
def begin_pass(self):
"""
Begin Pass
Notify BoxPS to load sparse parameters of next pass to GPU Memory
Notify BoxPS to load sparse parameters of next pass to GPU Memory
Examples:
.. code-block:: python
......@@ -1456,7 +1456,7 @@ class BoxPSDataset(InMemoryDataset):
def end_pass(self, need_save_delta):
"""
End Pass
Notify BoxPS that current pass ended
Notify BoxPS that current pass ended
Examples:
.. code-block:: python
......@@ -1522,12 +1522,12 @@ class BoxPSDataset(InMemoryDataset):
def slots_shuffle(self, slots):
"""
Slots Shuffle
Slots Shuffle is a shuffle method at the slots level, which is usually used
Slots Shuffle
Slots Shuffle is a shuffle method at the slots level, which is usually used
in sparse features with a large scale of instances. Compare the metric, i.e.
auc, while doing slots shuffle on one or several slots with a baseline to
auc, while doing slots shuffle on one or several slots with a baseline to
evaluate the importance level of slots (features).
Args:
slots(list[string]): the set of slots(string) to do slots shuffle.
......@@ -1585,7 +1585,7 @@ class BoxPSDataset(InMemoryDataset):
def preprocess_instance(self):
"""
Merge pv instance and convey it from input_channel to input_pv_channel.
Merge pv instance and convey it from input_channel to input_pv_channel.
It will be effective when enable_pv_merge_ is True.
Examples:
......
......@@ -360,7 +360,7 @@ class ElasticManager(object):
def _parse_np(self, np: str):
"""
np format is "MIN" or "MIN:MAX"
np format is "MIN" or "MIN:MAX"
"""
np_str = np or os.getenv('PADDLE_ELASTIC_NP', "0")
np_dict = np_str.split(":")
......
......@@ -174,14 +174,14 @@ class Fleet(object):
Args:
role_maker (RoleMakerBase, optional): A ``RoleMakerBase`` containing the configuration
of environment variables related to distributed training. If you did not initialize
of environment variables related to distributed training. If you did not initialize
the rolemaker by yourself, it will be automatically initialized to PaddleRoleMaker.
The default value is None.
is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program
is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program
runs on Collective mode or ParameterServer mode. True means the program runs on
Collective mode, and False means running on ParameterServer mode. The default value
Collective mode, and False means running on ParameterServer mode. The default value
is False.
strategy (DistributedStrategy): Extra properties for distributed training.
strategy (DistributedStrategy): Extra properties for distributed training.
For details, please refer to paddle.distributed.fleet.DistributedStrategy. Default: None.
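A minimal collective-mode sketch of these arguments (hedged illustration; assumes a GPU job launched with ``paddle.distributed.launch``):

.. code-block:: python

    import paddle.distributed.fleet as fleet

    # role_maker is left as None, so a PaddleCloudRoleMaker is created automatically
    strategy = fleet.DistributedStrategy()
    fleet.init(is_collective=True, strategy=strategy)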
......@@ -991,10 +991,10 @@ class Fleet(object):
Args:
optimizer(Optimizer): The executor to run for init server.
strategy(DistributedStrategy): Extra properties for distributed optimizer.
strategy(DistributedStrategy): Extra properties for distributed optimizer.
It is recommended to use DistributedStrategy in fleet.init(). The strategy
here is for compatibility. If the strategy in fleet.distributed_optimizer()
is not None, then it will overwrite the DistributedStrategy in fleet.init(),
here is for compatibility. If the strategy in fleet.distributed_optimizer()
is not None, then it will overwrite the DistributedStrategy in fleet.init(),
which will take effect in distributed training.
Returns:
......@@ -1057,14 +1057,14 @@ class Fleet(object):
use_fp16_test=False):
"""
Init the amp training, such as cast fp32 parameters to fp16 type.
Args:
place(CUDAPlace): place is used to initialize
place(CUDAPlace): place is used to initialize
fp16 parameters with fp32 values.
scope(Scope): The scope is used to find fp32 parameters.
test_program(Program): The program is used for testing.
use_fp16_test(bool): Whether to use fp16 testing.
Examples:
.. code-block:: python
......@@ -1086,7 +1086,7 @@ class Fleet(object):
loss = paddle.mean(hidden)
# 2) Create the optimizer and set `multi_precision` to True.
# Setting `multi_precision` to True can avoid the poor accuracy
# or the slow convergence in a way.
# or the slow convergence in a way.
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
# 3) These ops in `custom_black_list` will keep in the float32 computation type.
amp_list = paddle.static.amp.CustomOpLists(
......@@ -1106,9 +1106,9 @@ class Fleet(object):
# 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`).
# If you want to perform the testing process, you should pass `test_program` into `amp_init`.
optimizer.amp_init(place, scope=paddle.static.global_scope())
if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
run_example_code()
run_example_code()
"""
amp_optimizer = self._get_amp_optimizer()
return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test)
......
......@@ -39,7 +39,7 @@ class TaskNode:
:param role (int): The role of the task node. (Will be removed in the future)
:param node_type (str): The type of the task node.
:param task_id (int): The id of task node.
:param ops (list): A list of op.desc to init the task node. (Will be removed in the future)
:param ops (list): A list of op.desc to init the task node. (Will be removed in the future)
:param program (Program): An instance of Program to init the task node.
:param lazy_initialize (bool): In user-defined task, the program may change adding feed/fetch op. As efficient consideration, the task node will have the C++ object later.
"""
......
......@@ -543,7 +543,7 @@ def which_distributed_mode(args):
def launch():
"""
Paddle distribution training entry ``python -m paddle.distributed.launch``.
Usage:
.. code-block:: bash
:name: code-block-bash1
......@@ -553,7 +553,7 @@ def launch():
[--worker_num WORKER_NUM] [--server_num SERVER_NUM] [--heter_worker_num HETER_WORKER_NUM]
[--http_port HTTP_PORT] [--elastic_server ELASTIC_SERVER] [--job_id JOB_ID] [--np NP] [--scale SCALE]
[--host HOST] [--force FORCE]
training_script ...
training_script ...
Base Parameters:
......@@ -566,9 +566,9 @@ def launch():
- ``--gpus``: It's for gpu training. e.g., ``--gpus=0,1,2,3`` will launch four training processes each bound to one gpu.
- ``--selected_gpus``: gpus aliases, recommend to use ``--gpus``.
- ``--xpus``: It's for xpu training if xpu is available. e.g., ``--xpus=0,1,2,3``.
- ``--selected_xpus``: xpus aliases, recommend to use ``--xpus``.
- ``--mlus``: It's for mlu training. e.g., ``--mlus=0,1,2,3`` will launch four training processes each bound to one mlu.
......@@ -594,7 +594,7 @@ def launch():
- ``--server_num``: Number of servers (it is recommended to set this when emulating a distributed environment using a single node)
- ``--heter_worker_num``: Number of heter_workers in each stage (it is recommended to set this when emulating a distributed environment using a single node)
- ``--heter_devices``: Type of heter_device in each stage
- ``--http_port``: Gloo http Port
......@@ -615,18 +615,18 @@ def launch():
Examples 1 (collective, single node):
.. code-block:: bash
:name: code-block-example-bash1
# For training on single node using 4 gpus.
python -m paddle.distributed.launch --gpus=0,1,2,3 train.py --lr=0.01
Examples 2 (collective, multi node):
.. code-block:: bash
:name: code-block-example-bash2
# The parameters of --gpus and --ips must be consistent in each node.
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# On 192.168.0.16:
......@@ -634,15 +634,15 @@ def launch():
# On 192.168.0.17:
python -m paddle.distributed.launch --gpus=0,1,2,3 --ips=192.168.0.16,192.168.0.17 train.py --lr=0.01
Examples 3 (ps, cpu, single node):
.. code-block:: bash
:name: code-block-example-bash3
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers.
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 4 (ps, cpu, multi node):
.. code-block:: bash
:name: code-block-example-bash4
......@@ -662,10 +662,10 @@ def launch():
:name: code-block-example-bash5
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu.
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 6 (ps, gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash6
......@@ -687,10 +687,10 @@ def launch():
:name: code-block-example-bash7
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu.
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01
Examples 8 (ps-heter, cpu + gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash8
......@@ -712,7 +712,7 @@ def launch():
:name: code-block-example-bash9
python -m paddle.distributed.launch --elastic_server=127.0.0.1:2379 --np=2 --job_id=job1 --gpus=0,1,2,3 train.py
"""
args = _parse_args()
......
......@@ -27,7 +27,7 @@ def _is_trainable(param):
class DygraphShardingOptimizer(object):
"""
A wrapper for Sharding Optimizer in Dygraph.
A wrapper for Sharding Optimizer in Dygraph.
.. warning: DygraphShardingOptimizer is experimental and subject to change.
......@@ -88,7 +88,7 @@ class DygraphShardingOptimizer(object):
Partitions parameters among sharding ranks.
Return:
Dict[int, List]
Dict[int, List]
"""
# TODO(JZ-LIANG) support multiple partition methods
# method1: greedy even but unorder
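A hypothetical sketch of the "greedy even" idea named in the comment above (assumed helper name and a plain size-based heuristic, not the actual implementation):

.. code-block:: python

    import numpy as np

    def greedy_even_partition(parameters, num_ranks):
        # always hand the next (largest remaining) parameter to the lightest rank
        rank2params = {rank: [] for rank in range(num_ranks)}
        sizes = [0] * num_ranks
        for p in sorted(parameters, key=lambda p: int(np.prod(p.shape)), reverse=True):
            rank = sizes.index(min(sizes))
            rank2params[rank].append(p)
            sizes[rank] += int(np.prod(p.shape))
        return rank2params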
......@@ -113,7 +113,7 @@ class DygraphShardingOptimizer(object):
mapping parameters to the shard which holds it.
Return:
Dict[str, int]
Dict[str, int]
"""
mapping = {}
for rank, params in self._rank2params.items():
......
......@@ -49,7 +49,7 @@ align = {
class ShardingOptimizerStage2(Optimizer):
"""
A wrapper for Sharding Stage2 Optimizer in Dygraph.
A wrapper for Sharding Stage2 Optimizer in Dygraph.
.. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer.
......
......@@ -816,7 +816,7 @@ def insert_scale_loss_grad_ops(block, scale=1.0):
def comm_analyse(main_program):
"""
Analyse the parameter sizes that need to be broadcast/allreduced during sharding training
Analyse the parameter sizes that need to be broadcast/allreduced during sharding training
"""
reduce_vars = {}
broadcast_vars = {}
......@@ -858,7 +858,7 @@ def comm_analyse(main_program):
def add_sync_comm(program, sharding_ring_id):
"""
When cloning a test prog from the sharding main prog,
When cloning a test prog from the sharding main prog,
part of the sync_comm ops may be pruned by mistake; this function
adds the sync_comm ops for the test prog.
......
......@@ -961,7 +961,7 @@ class ShardingOptimizer(MetaOptimizerBase):
2. prune cast_fp32_to_fp16; update amp_infine_checking
3. prune gradient_clip related; update global_norm_sum
4. prune optimizer op + param + gradient
"""
weightdecay_helper = WeightDecayHelper()
weightdecay_helper.prune_weight_decay(block, shard)
......@@ -1066,7 +1066,7 @@ class ShardingOptimizer(MetaOptimizerBase):
add broadcast allreduce op
if enable gradient_merge, insert related ops
if combined with pipeline(grad accumulate),
if combined with pipeline(grad accumulate),
the grad allreduce should be done in optimize role
"""
if len(self._segments) < 1:
......@@ -1302,7 +1302,7 @@ class ShardingOptimizer(MetaOptimizerBase):
pp: 4
pp-pair: >= 20
if one parallelism is not enable: -1
and only support parallelism hierarchy: mp --> sharding --> pp --> dp
and only support parallelism hierarchy: mp --> sharding --> pp --> dp
"""
# step 1: initialize nccl
self.global_word_size = self.role_maker._worker_num()
......@@ -1688,7 +1688,7 @@ class ShardingOptimizer(MetaOptimizerBase):
grad@gradientmerge / acc_step
re-create all optimize ops of origin main block and rename them
cast(backward)
amp
amp
clip
opt
# fill constant grad@gradientmerge
......
......@@ -198,11 +198,11 @@ class PipelineLayer(Layer):
"""PipelineLayer
Args:
layers(Iterable): A sequence of layers description to define the structure for pipeline.
num_stages(int, optional): pp degree, if not specified, 'topology' parameter must be given.
num_stages(int, optional): pp degree, if not specified, 'topology' parameter must be given.
topology(CommunicateTopology, optional): topo of hybrid parallel, if it is None, 'num_stages' parameters must be given.
loss_fn(callable, optional): Loss function.
seg_method(str, optional): the method of splitting pp layer, default 'uniform', or use specific layer to split, method's name must start with 'layer:'.
recompute_interval(int, optional): the number of layers to be used recompute, the value of 0 represents no recompute. default 0.
recompute_interval(int, optional): the number of layers to be used recompute, the value of 0 represents no recompute. default 0.
recompute_ctx(dict,optional): the context of recompute, when 'recompute_interval' > 0, the context must be given.
num_virtual_pipeline_stages(int, optional): the num of virtual pipeline stages for interleave pp.
Examples:
......@@ -212,7 +212,7 @@ class PipelineLayer(Layer):
from paddle.fluid.dygraph.layers import Layer
import paddle.nn.functional as F
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
pipeline_parallel_size = 2
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
......@@ -224,19 +224,19 @@ class PipelineLayer(Layer):
"accumulate_steps": 4,
"micro_batch_size": 2
}
fleet.init(is_collective=True, strategy=strategy)
hcg = fleet.get_hybrid_communicate_group()
class ReshapeHelp(Layer):
def __init__(self, shape):
super(ReshapeHelp, self).__init__()
self.shape = shape
def forward(self, x):
return x.reshape(shape=self.shape)
class AlexNetPipeDesc(PipelineLayer):
def __init__(self, num_classes=10, **kwargs):
self.num_classes = num_classes
......@@ -268,7 +268,7 @@ class PipelineLayer(Layer):
]
super(AlexNetPipeDesc, self).__init__(
layers=decs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
model = AlexNetPipeDesc(num_stages=pipeline_parallel_size, topology=hcg._topo)
"""
......
......@@ -107,7 +107,7 @@ def _initialize_recompute_hcg(hcg):
def _all_gather(tensor, group=None, use_calc_stream=True):
"""
The main difference with paddle.distributed.all_gather:
The main difference with paddle.distributed.all_gather:
no need to pass in tensor_list, the returned tensor is spliced
"""
if group is not None and not group.is_member():
......
......@@ -47,7 +47,7 @@ align = {
class GroupShardedOptimizerStage2(Optimizer):
"""
A wrapper for Sharding Stage2 Optimizer in Dygraph.
A wrapper for Sharding Stage2 Optimizer in Dygraph.
.. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer.
......
......@@ -47,8 +47,8 @@ def _trainable(param):
class GroupShardedStage2(nn.Layer):
"""
A wrapper for Sharding Stage2 Layer in Dygraph.
"""
A wrapper for Sharding Stage2 Layer in Dygraph.
.. warning: GroupShardedStage2 encapsulates the layer strategy and integrates it into the nn.Layer.
.. ZeRO: https://arxiv.org/pdf/1910.02054.pdf.
"""
......
......@@ -33,7 +33,7 @@ from .group_sharded_utils import Type, GroupShardedClipGrad, device_guard
def _all_gather(tensor, buffer_size, group):
"""
The main difference with paddle.distributed.all_gather:
The main difference with paddle.distributed.all_gather:
no need to pass in tensor_list, the returned tensor is spliced
"""
......@@ -58,8 +58,8 @@ CHECK_LAYER = dict() # Help to check layer's id -> layer's name
class GroupShardedStage3(nn.Layer):
"""
A wrapper for Sharding Stage3 Layer in Dygraph.
"""
A wrapper for Sharding Stage3 Layer in Dygraph.
.. warning: GroupShardedStage3 encapsulates the layer strategy and integrates it into the nn.Layer.
......
......@@ -48,8 +48,8 @@ def _trainable(param):
class ShardingStage2(nn.Layer):
"""
A wrapper for Sharding Stage2 Layer in Dygraph.
"""
A wrapper for Sharding Stage2 Layer in Dygraph.
.. warning: ShardingStage2 encapsulates the layer strategy and integrates it into the nn.Layer.
.. ZeRO: https://arxiv.org/pdf/1910.02054.pdf.
"""
......
......@@ -50,8 +50,8 @@ CHECK_LAYER = dict() # Help to check layer's id -> layer's name
class ShardingStage3(nn.Layer):
"""
A wrapper for Sharding Stage3 Layer in Dygraph.
"""
A wrapper for Sharding Stage3 Layer in Dygraph.
.. warning: ShardingStage3 encapsulates the layer strategy and integrates it into the nn.Layer.
......
......@@ -41,7 +41,7 @@ def sum(input, scope=None, util=None):
global_cnt = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1], value=0)
tmp = fluid.layers.elementwise_add(cnt, global_cnt)
fluid.layers.assign(tmp, global_cnt)
# in train.py, after train or infer
res = np.array(scope.find_var(global_cnt.name).get_tensor())
print("sum array: ", paddle.distributed.fleet.sum(res))
......
......@@ -131,14 +131,14 @@ class LocalFS(FS):
"""
def ls_dir(self, fs_path):
"""
"""
List directories and files under `fs_path` .
Args:
fs_path(str): The local file path.
Returns:
Tuple: Return a 2-tuple, the first is a list of all its subdirectories,
Tuple: Return a 2-tuple, the first is a list of all its subdirectories,
and the second is a list of all its subfiles, e.g. ([subdirname1, subdirname2, ...], [filename1, filename2, ...]).
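A minimal usage sketch of this return value, assuming the documented ``LocalFS`` import path:

.. code-block:: python

    from paddle.distributed.fleet.utils import LocalFS

    client = LocalFS()
    subdirs, files = client.ls_dir("./")
    print(subdirs, files)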
Examples:
......@@ -290,7 +290,7 @@ class LocalFS(FS):
fs_path(str): The local file path.
Returns:
Bool: Whether it's a file or directory, return true if the path exists,
Bool: Whether it's a file or directory, return true if the path exists,
otherwise return false.
Examples:
......@@ -359,7 +359,7 @@ class LocalFS(FS):
return self.rename(src_path, dst_path)
def list_dirs(self, fs_path):
"""
"""
Only list directories under `fs_path` .
Args:
......@@ -430,7 +430,7 @@ class HDFSClient(FS):
A tool of HDFS.
Args:
hadoop_home(str): Hadoop home.
hadoop_home(str): Hadoop home.
configs(dict): Hadoop config. It is a dictionary and needs to contain the
keys: "fs.default.name" and "hadoop.job.ugi".
......@@ -491,7 +491,7 @@ class HDFSClient(FS):
@_handle_errors()
def list_dirs(self, fs_path):
"""
"""
Only list directories under `fs_path` .
Args:
......@@ -523,14 +523,14 @@ class HDFSClient(FS):
@_handle_errors()
def ls_dir(self, fs_path):
"""
"""
List directories and files under `fs_path` .
Args:
fs_path(str): The HDFS file path.
Returns:
Tuple: Return a 2-tuple, the first element is the list of all its subdirectories,
Tuple: Return a 2-tuple, the first element is the list of all its subdirectories,
and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname2, ...], [filename1, filename2, ...]).
Examples:
......@@ -923,7 +923,7 @@ class HDFSClient(FS):
fs_src_path(str): Name of the file or directory, that's needed to be moved.
fs_dst_path(str): Name of the file or directory to which to move to.
overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False.
test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.
test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.
Examples:
......@@ -1174,7 +1174,7 @@ class AFSClient(FS):
self._fs.init(fs_name, fs_user, fs_passwd, fs_conf)
def list_dirs(self, fs_path):
"""
"""
Only list directories under `fs_path` .
Args:
......@@ -1200,14 +1200,14 @@ class AFSClient(FS):
return dirs
def ls_dir(self, fs_path):
"""
"""
List directories and files under `fs_path` .
Args:
fs_path(str): The HDFS file path.
Returns:
Tuple: Return a 2-tuple, the first element is the list of all its subdirectories,
Tuple: Return a 2-tuple, the first element is the list of all its subdirectories,
and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname2, ...], [filename1, filename2, ...]).
Examples:
......@@ -1438,7 +1438,7 @@ class AFSClient(FS):
fs_src_path(str): Name of the file or directory, that's needed to be moved.
fs_dst_path(str): Name of the file or directory to which to move to.
overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False.
test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.
test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.
Examples:
......
......@@ -23,7 +23,7 @@ import numpy as np
class HybridParallelInferenceHelper(object):
"""
A helper class to split program for inference with hybrid parallelism.
Args:
startup_program (Program): the startup program.
main_program (Program): the main program.
......@@ -34,15 +34,15 @@ class HybridParallelInferenceHelper(object):
init_comm (bool): whether to initialize the communication group. Default ``True``.
role_maker (RoleMakerBase or subclass): user custom define RoleMakerBase.
If ``role_maker==None``, then use PaddleCloudRoleMaker. Default ``None``.
Returns:
None.
Write Paradigm:
.. code-block:: bash
:name: bash-example1
# while op pattern
with paddle.fluid.device_guard(f'{device}:all'):
# init global cond
......@@ -51,10 +51,10 @@ class HybridParallelInferenceHelper(object):
cond_int = layers.fill_constant(shape=[1], dtype="int64", value=0, force_cpu=False, name="cond_int")
cond = layers.cast(step_idx < max_len, dtype="bool")
while_op = layers.While(cond, is_test=True)
# init global lod_tensor_array for generation task
arr = layers.array_write(data, step_idx)
with while_op.block():
with paddle.fluid.device_guard(f'{device}:all'):
# read data from global lod_tensor_array
......@@ -63,36 +63,36 @@ class HybridParallelInferenceHelper(object):
# it need for send_v2 of lod_tensor_array
layers.increment(x=step_idx, value=1.0, in_place=True)
layers.array_write(element_in_arr, i=step_idx, array=arr)
with paddle.fluid.device_guard(f'{device}:0'):
... some code
with paddle.fluid.device_guard(f'{device}:1'):
... some code
with paddle.fluid.device_guard(f'{device}:{num_pp-1}'):
# generate some data in while block and write to global lod_tensor_array
# that they are read in next while step.
# we will using send_v2 to send global lod_tensor_array to other pipeline and sync
layers.array_write(other_var, i=step_idx, array=arr)
# update cond and assign to cond_int, we will sync cond_int
layers.assign(layers.cast(cond, dtype="int32"), cond_int)
with paddle.fluid.device_guard(f'{model._device}:all'):
# the code below must at end of while block and exists in device:all
layers.assign(layers.cast(cond_int, dtype='bool'), cond)
with paddle.fluid.device_guard(f'{model._device}:all'):
# use a empty lod_tensor_array to clear lod_tensor_array
layers.assign(layers.create_array(data.dtype), arr)
Examples:
.. code-block:: python
:name: code-example1
# required: distributed
import os
import numpy as np
......@@ -172,7 +172,7 @@ class HybridParallelInferenceHelper(object):
exe = paddle.static.Executor(paddle.CUDAPlace(dev_id))
exe.run(startup_program)
np.random.seed(2333)
for step in range(5):
init_data = np.random.uniform(low=0.0, high=1.0, size=[2, 2]).astype('float32')
......@@ -358,7 +358,7 @@ class HybridParallelInferenceHelper(object):
Args:
stage (int): pipeline stage
block_idx (int): block index
Returns:
used_var_names (set): used var names in block_idx block
"""
......@@ -445,9 +445,9 @@ class HybridParallelInferenceHelper(object):
def _add_op_device_attr(self, block):
"""
Add op_device attribute for ops in block that have
Add op_device attribute for ops in block that have
not that attribute set.
Args:
block (Block): the block to process.
"""
......@@ -474,7 +474,7 @@ class HybridParallelInferenceHelper(object):
def _check_validation(self, block):
"""
Check whether ops in a block have both the op_device and the
Check whether ops in a block have both the op_device and the
op_role attributes set.
"""
assert isinstance(block, Block)
......@@ -729,7 +729,7 @@ class HybridParallelInferenceHelper(object):
"""
Generate inference program.
Params:
sync_in_while_lastpp2firstpp_var_names (list(str)): the vars in the last pipeline
sync_in_while_lastpp2firstpp_var_names (list(str)): the vars in the last pipeline
that need to send var to first pipeline and exclude bool dtype var
sync_in_while_var_names (list(str)): the vars sync among all pipeline in while block
e.g cond. Note that cond cannot be bool dtype.
......
......@@ -352,13 +352,13 @@ def recompute(function, *args, **kwargs):
recompute intermediate activations to save memory.
Parameters:
function(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model
whose intermediate activations will be released to save memory in forward stage and will be recomputed
in backward stage for gradient calculation.
*args(Tensor): inputs to the function.
**kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to
indicate whether to save the forward rng. If it is True, then the last forward rng value will be
restored when the forward recalculation of backpropagation is performed. The default
function(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model
whose intermediate activations will be released to save memory in forward stage and will be recomputed
in backward stage for gradient calculation.
*args(Tensor): inputs to the function.
**kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to
indicate whether to save the forward rng. If it is True, then the last forward rng value will be
restored when the forward recalculation of backpropagation is performed. The default
preserve_rng_state is True.
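A small hedged sketch of this API in dynamic graph mode, assuming ``recompute`` is imported from ``paddle.distributed.fleet.utils``:

.. code-block:: python

    import paddle
    from paddle.distributed.fleet.utils import recompute

    block = paddle.nn.Sequential(paddle.nn.Linear(16, 16), paddle.nn.ReLU())
    x = paddle.randn([4, 16])
    x.stop_gradient = False

    y = recompute(block, x)   # activations inside `block` are recomputed in backward
    y.sum().backward()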
Returns:
......
......@@ -18,7 +18,7 @@ from .context import Context
def launch():
"""
Paddle distribution training entry ``python -m paddle.distributed.launch``.
Usage:
.. code-block:: bash
:name: code-block-bash1
......@@ -77,7 +77,7 @@ def launch():
- ``--heter_workers``: User defined heter workers ip1:port1;ip2:port2, e.g., ``--heter_workers="192.168.0.16:6172;192.168.0.17:6172"``
- ``--heter_worker_num``: Number of heter_workers in each stage (it is recommended to set this when emulating a distributed environment using a single node)
- ``--heter_devices``: Type of heter_device in each stage
- ``--gloo_port``: Gloo http Port. Default ``--gloo_port=6767``.
......@@ -94,12 +94,12 @@ def launch():
IPU Parameters:
IPU distributed launch only requires and allows three arguments ``--devices``, ``training_script`` and ``training_script_args``.
The ``--devices`` is the number of IPU devices. e.g., ``--devices=4`` will launch the training program with four IPU devices.
The ``training_script`` is only allowed to be set as ``ipu``.
The ``training_script`` is only allowed to be set as ``ipu``.
The ``training_script_args`` includes arguments required by IPU distributed launch and is illustrated below.
``Examples 10`` provides an example of paddle.distributed.launch with IPUs.
- ``--hosts``: The hosts for IPU distributed training. Each host is able to include multiple processes.
- ``--nproc_per_host``: The number of processes launched per host. Each process is able to include multiple replicas.
- ``--ipus_per_replica``: The number of IPUs requested per replica. Each replica is able to include multiple IPUs.
......@@ -144,16 +144,16 @@ def launch():
Examples 1 (collective, single node):
.. code-block:: bash
:name: code-block-example-bash1
# For training on single node using 4 gpus.
python -m paddle.distributed.launch --devices=0,1,2,3 train.py --lr=0.01
Examples 2 (collective, multi node):
.. code-block:: bash
:name: code-block-example-bash2
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# On 192.168.0.16:
......@@ -161,15 +161,15 @@ def launch():
# On 192.168.0.17:
python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
Examples 3 (ps, cpu, single node):
.. code-block:: bash
:name: code-block-example-bash3
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers.
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 4 (ps, cpu, multi node):
.. code-block:: bash
:name: code-block-example-bash4
......@@ -194,10 +194,10 @@ def launch():
:name: code-block-example-bash5
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu.
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 6 (ps, gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash6
......@@ -219,10 +219,10 @@ def launch():
:name: code-block-example-bash7
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu.
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01
Examples 8 (ps-heter, cpu + gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash8
......@@ -246,7 +246,7 @@ def launch():
# With the following command, the job will begin to run immediately if 4 nodes are ready,
# or it will run after elastic_timeout if only 2 or 3 nodes ready
python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py
# once the number of nodes changes between 2:4 during training, the strategy holds
Examples 10 (ipu):
......
......@@ -60,18 +60,18 @@ def _number_count(numbers, upper_range):
def _assign_pos(x, cum_count):
"""
Assign pos decides which tokens should be fetched and assigned to
Assign pos decides which tokens should be fetched and assigned to
a specific expert, in order.
Args:
x (Tensor): Tensor. Every element in the list must be a Tensor whose data type
should be float16, float32, float64, int32 or int64.
cum_count (Tensor): The cumulative sum tokens of counters. Every element in the list must be a Tensor whose
cum_count (Tensor): The cumulative sum tokens of counters. Every element in the list must be a Tensor whose
data type should be int64.
Returns:
out (Tensor): Assemble numbers in the order of counters.
out (Tensor): Assemble numbers in the order of counters.
Examples:
.. code-block:: python
......@@ -185,10 +185,10 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker):
gate_idx (Tensor): Represents the gate_id sequence corresponding to the input data with type int32, int64.
expert_count (Tensor): The quantity value counted on the gate_id sequence of the input data with type int32, int64.
n_worker(int,optional): The number of workers on the trainer with type int64.
Returns:
new_gate_idx (Tensor): The gate_id sequence corresponding to the new input data after passing through prune.
Examples:
.. code-block:: python
......
......@@ -105,7 +105,7 @@ def init_parallel_env():
Returns:
None
Examples:
.. code-block:: python
# required: gpu
......@@ -119,7 +119,7 @@ def init_parallel_env():
super(LinearNet, self).__init__()
self._linear1 = nn.Linear(10, 10)
self._linear2 = nn.Linear(10, 1)
def forward(self, x):
return self._linear2(self._linear1(x))
......@@ -140,7 +140,7 @@ def init_parallel_env():
outputs = dp_layer(inputs)
labels = paddle.randn([10, 1], 'float32')
loss = loss_fn(outputs, labels)
loss.backward()
adam.step()
......
......@@ -41,7 +41,7 @@ def numel(var):
class DataParallelOptimizationPass(PassBase):
"""
Apply Optimizations that specialized for data parallelism in Auto Parallel.
1. prune grad scaling
1. prune grad scaling
2. overlap comm and calc
3. fuse allreduce
"""
......@@ -350,9 +350,9 @@ class DataParallelOptimizationPass(PassBase):
"""
conditions for gradients to be grouped:
1. group size < max_fuse_numel
2. same dp group
2. same dp group
3. same dtype
4. dependency: grad would NOT be used by other ops within group segment
4. dependency: grad would NOT be used by other ops within group segment
gradients inside same group would be fuse into one coalesce tensor
"""
......
......@@ -126,7 +126,7 @@ class FP16State(object):
def _build_state(self):
"""
mark the execution mode (fp16 or fp32) for ops in all blocks
mark the execution mode (fp16 or fp32) for ops in all blocks
include forward ops & backward ops
"""
# mark op dtype
......
......@@ -95,7 +95,7 @@ class RecomputeState(ProgramStats):
def modify_forward_desc_for_recompute(self, dist_context):
"""
If the program's forward part has a 'dropout' op, this function will insert
If the program's forward part has a 'dropout' op, this function will insert
a seed op before it to guarantee that two dropout op have the same outputs.
"""
op_types = [op.desc.type() for op in self._ops]
......
......@@ -86,11 +86,11 @@ def prune_program(program, start_op_idx, end_op_idx):
def split_program(program, op_indices):
"""
Split the program by op_indices.
Split the program by op_indices.
For example, a program has 100 ops, and op_indices = [25, 60].
Then the program is split into 3 parts, containing 25, 35 and 40
ops respectively.
ops respectively.
The return values are a tuple with 3 elements: the split program
list, the input var names of each split program, and the output
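The index arithmetic above can be checked with a plain list standing in for the op list (illustration only, not the Paddle API):

.. code-block:: python

    ops = list(range(100))
    op_indices = [25, 60]
    bounds = [0] + op_indices + [len(ops)]
    parts = [ops[start:end] for start, end in zip(bounds[:-1], bounds[1:])]
    print([len(p) for p in parts])   # [25, 35, 40]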
......
......@@ -1140,7 +1140,7 @@ class SplitTrainerOpsPass(PassBase):
split cpu-trainer program from origin-program
1. find heter op (located on different device)
2. find input&output of every heter-block
3. create cpu-trainer program, add send&recv op
3. create cpu-trainer program, add send&recv op
"""
attrs = pass_ctx._attrs
default_device_ = 'cpu'
......
......@@ -611,7 +611,7 @@ def find_heter_ops(program, default_device="cpu"):
if no_grad_var in var2idx:
"""
insert sum op & remove sum op from var2idx and origin place
"""
op_list = list(block.ops)
sum_op = op_list[var2idx[no_grad_var]]
......@@ -1335,7 +1335,7 @@ def build_var_distributed(context):
context["param_name_to_grad_name"] = param_name_to_grad_name
context["grad_name_to_param_name"] = grad_name_to_param_name
'''
'''
print("public build_var_distributed origin_sparse_pairs:",
context["origin_sparse_pairs"])
print("public build_var_distributed origin_for_dense:",
......
......@@ -62,12 +62,12 @@ def group_sharded_parallel(model,
buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23.
segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20.
sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used.
Returns:
model: A wrapper for group sharded given model.
optimizer: A wrapper for group sharded given optimizer.
scaler: A wrapper for group sharded given scaler.
Examples:
.. code-block:: python
......@@ -184,7 +184,7 @@ def save_group_sharded_model(model, output, optimizer=None):
model (Layer): A wrapper for group sharded given model.
output (str): Save directory.
optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved.
Examples:
.. code-block:: python
......
......@@ -60,10 +60,10 @@ def global_scatter(x,
group=None,
use_calc_stream=True):
"""
The global_scatter operator distributes the data of x to n_expert * world_size experts according to local_count,
and then receives data according to global_count. The expert refers to a user-defined expert network,
The global_scatter operator distributes the data of x to n_expert * world_size experts according to local_count,
and then receives data according to global_count. The expert refers to a user-defined expert network,
n_expert refers to the number of expert networks owned by each card, and world_size refers to the number of graphics cards running the network.
As shown below, the value of the world size is 2, n_expert 2, the batch size of the x 4 and local_count is [2, 0, 2, 0].
The global_count of the rank 0 is [2, 0, , ], rank 1 is [2, 0, ,](Due to the limited space, only the data calculated on rank 0 is shown here).
In the global_scatter operator, local_count[i] represents sending local_count[i] data to the (i % n_expert)th expert of the (i // n_expert)th card,
......@@ -101,10 +101,10 @@ def global_scatter(x,
how many data needed to be received. The tensor data type should be int64.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True.
Returns:
out (Tensor): The data received from all experts.
out (Tensor): The data received from all experts.
Examples:
.. code-block:: python
......@@ -120,7 +120,7 @@ def global_scatter(x,
local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], \
dtype=np.float32)
if paddle.distributed.ParallelEnv().local_rank == 0:
local_count = np.array([2, 1, 1, 1])
local_count = np.array([2, 1, 1, 1])
global_count = np.array([2, 1, 1, 1])
else:
local_count = np.array([1, 1, 2, 1])
......@@ -195,11 +195,11 @@ def global_gather(x,
The process of global_gather sending data is as follows:
The global_count[0] of the 0th card represents sending 2 data to the 0th expert of the 0th card;
The global_count[1] of the 0th card represents sending 0 data to the 1th expert of the 0th card;
The global_count[0] of the 1th card represents sending 2 data to the 0th expert of the 0th card;
The global_count[1] of the 1th card represents sending 0 data to the 1th expert of the 0th card.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/global_scatter_gather.png
......@@ -216,10 +216,10 @@ def global_gather(x,
how many data needed to be sent. Tensor data type should be int64.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True.
Returns:
out (Tensor): The data received from all experts.
out (Tensor): The data received from all experts.
Examples:
.. code-block:: python
......
......@@ -21,11 +21,11 @@ class Beta(exponential_family.ExponentialFamily):
r"""
Beta distribution parameterized by alpha and beta.
In probability theory and statistics, the beta distribution is a family of
continuous probability distributions defined on the interval [0, 1]
parameterized by two positive shape parameters, denoted by alpha and beta,
that appear as exponents of the random variable and control the shape of
the distribution. The generalization to multiple variables is called a
In probability theory and statistics, the beta distribution is a family of
continuous probability distributions defined on the interval [0, 1]
parameterized by two positive shape parameters, denoted by alpha and beta,
that appear as exponents of the random variable and control the shape of
the distribution. The generalization to multiple variables is called a
Dirichlet distribution.
The probability density function (pdf) is
......@@ -38,18 +38,18 @@ class Beta(exponential_family.ExponentialFamily):
.. math::
B(\alpha, \beta) = \int_{0}^{1} t^{\alpha - 1} (1-t)^{\beta - 1}\mathrm{d}t
B(\alpha, \beta) = \int_{0}^{1} t^{\alpha - 1} (1-t)^{\beta - 1}\mathrm{d}t
Args:
alpha (float|Tensor): Alpha parameter. It supports broadcast semantics.
The value of alpha must be positive. When the parameter is a tensor,
it represents multiple independent distribution with
alpha (float|Tensor): Alpha parameter. It supports broadcast semantics.
The value of alpha must be positive. When the parameter is a tensor,
it represents multiple independent distribution with
a batch_shape(refer to ``Distribution`` ).
beta (float|Tensor): Beta parameter. It supports broadcast semantics.
The value of beta must be positive(>0). When the parameter is tensor,
it represent multiple independent distribution with
a batch_shape(refer to ``Distribution`` ).
beta (float|Tensor): Beta parameter. It supports broadcast semantics.
The value of beta must be positive(>0). When the parameter is tensor,
it represent multiple independent distribution with
a batch_shape(refer to ``Distribution`` ).
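A short hedged sketch of scalar and tensor parameterization:

.. code-block:: python

    import paddle
    from paddle.distribution import Beta

    # scalar parameters -> a single distribution
    beta = Beta(alpha=0.5, beta=0.5)
    print(beta.mean)

    # tensor parameters -> a batch of independent distributions
    beta_batch = Beta(paddle.to_tensor([0.5, 2.0]), paddle.to_tensor([0.5, 2.0]))
    print(beta_batch.prob(paddle.to_tensor([0.3, 0.5])))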
Examples:
......@@ -114,7 +114,7 @@ class Beta(exponential_family.ExponentialFamily):
Args:
value (Tensor): Value to be evaluated.
Returns:
Tensor: Probability.
"""
......@@ -125,7 +125,7 @@ class Beta(exponential_family.ExponentialFamily):
Args:
value (Tensor): Value to be evaluated
Returns:
Tensor: Log probability.
"""
......
......@@ -31,9 +31,9 @@ from paddle.tensor import arange, concat, gather_nd, multinomial
class Categorical(distribution.Distribution):
r"""
Categorical distribution is a discrete probability distribution that
describes the possible results of a random variable that can take on
one of K possible categories, with the probability of each category
Categorical distribution is a discrete probability distribution that
describes the possible results of a random variable that can take on
one of K possible categories, with the probability of each category
separately specified.
The probability mass function (pmf) is:
......@@ -267,9 +267,9 @@ class Categorical(distribution.Distribution):
def probs(self, value):
"""Probabilities of the given category (``value``).
If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as
If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as
category, and the others represent the different distributions.
At the same time, if ``value`` is a 1-D Tensor, ``value`` will be broadcast to the
At the same time, if ``value`` is a 1-D Tensor, ``value`` will be broadcast to the
same number of distributions as ``logits``.
If ``value`` is not a 1-D Tensor, ``value`` should have the same number of distributions
as ``logits``. That is, ``value[:-1] = logits[:-1]``.
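A hedged sketch of the broadcasting behaviour described above:

.. code-block:: python

    import paddle
    from paddle.distribution import Categorical

    paddle.seed(100)
    logits = paddle.rand([3, 6])            # 3 distributions over 6 categories
    cat = Categorical(logits)

    value = paddle.to_tensor([0, 1, 5])     # 1-D value, broadcast to all 3 distributions
    print(cat.probs(value))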
......
......@@ -23,32 +23,32 @@ class Dirichlet(exponential_family.ExponentialFamily):
r"""
Dirichlet distribution with parameter "concentration".
The Dirichlet distribution is defined over the `(k-1)-simplex` using a
The Dirichlet distribution is defined over the `(k-1)-simplex` using a
positive, length-k vector concentration(`k > 1`).
The Dirichlet is identically the Beta distribution when `k = 2`.
For independent and identically distributed continuous random variable
:math:`\boldsymbol X \in R_k` , and support
:math:`\boldsymbol X \in (0,1), ||\boldsymbol X|| = 1` ,
For independent and identically distributed continuous random variable
:math:`\boldsymbol X \in R_k` , and support
:math:`\boldsymbol X \in (0,1), ||\boldsymbol X|| = 1` ,
The probability density function (pdf) is
.. math::
f(\boldsymbol X; \boldsymbol \alpha) = \frac{1}{B(\boldsymbol \alpha)} \prod_{i=1}^{k}x_i^{\alpha_i-1}
where :math:`\boldsymbol \alpha = {\alpha_1,...,\alpha_k}, k \ge 2` is
f(\boldsymbol X; \boldsymbol \alpha) = \frac{1}{B(\boldsymbol \alpha)} \prod_{i=1}^{k}x_i^{\alpha_i-1}
where :math:`\boldsymbol \alpha = {\alpha_1,...,\alpha_k}, k \ge 2` is
parameter, the normalizing constant is the multivariate beta function.
.. math::
B(\boldsymbol \alpha) = \frac{\prod_{i=1}^{k} \Gamma(\alpha_i)}{\Gamma(\alpha_0)}
:math:`\alpha_0=\sum_{i=1}^{k} \alpha_i` is the sum of parameters,
:math:`\alpha_0=\sum_{i=1}^{k} \alpha_i` is the sum of parameters,
:math:`\Gamma(\alpha)` is gamma function.
Args:
concentration (Tensor): "Concentration" parameter of dirichlet
distribution, also called :math:`\alpha`. When it's over one
concentration (Tensor): "Concentration" parameter of dirichlet
distribution, also called :math:`\alpha`. When it's over one
dimension, the last axis denotes the parameter of distribution,
``event_shape=concentration.shape[-1:]`` , axes other than last are
considered batch dimensions with ``batch_shape=concentration.shape[:-1]`` .
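A small hedged sketch:

.. code-block:: python

    import paddle
    from paddle.distribution import Dirichlet

    dirichlet = Dirichlet(paddle.to_tensor([1.0, 2.0, 3.0]))
    x = paddle.to_tensor([0.3, 0.5, 0.2])   # a point on the 2-simplex
    print(dirichlet.prob(x))
    print(dirichlet.entropy())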
......
......@@ -39,15 +39,15 @@ from paddle.tensor import arange, concat, gather_nd, multinomial
class Distribution(object):
"""
The abstract base class for probability distributions. Functions are
The abstract base class for probability distributions. Functions are
implemented in specific distributions.
Args:
batch_shape(Sequence[int], optional): independent, not identically
batch_shape(Sequence[int], optional): independent, not identically
distributed draws, aka a "collection" or "bunch" of distributions.
event_shape(Sequence[int], optional): the shape of a single
draw from the distribution; it may be dependent across dimensions.
For scalar distributions, the event shape is []. For n-dimension
event_shape(Sequence[int], optional): the shape of a single
draw from the distribution; it may be dependent across dimensions.
For scalar distributions, the event shape is []. For n-dimension
multivariate distribution, the event shape is [n].
"""
......@@ -118,16 +118,16 @@ class Distribution(object):
def probs(self, value):
"""Probability density/mass function.
.. note::
This method will be deprecated in the future, please use `prob`
.. note::
This method will be deprecated in the future, please use `prob`
instead.
"""
raise NotImplementedError
def _extend_shape(self, sample_shape):
"""compute shape of the sample
"""compute shape of the sample
Args:
sample_shape (Tensor): sample shape
......@@ -239,9 +239,9 @@ class Distribution(object):
def _probs_to_logits(self, probs, is_binary=False):
r"""
Converts probabilities into logits. For the binary, probs denotes the
probability of occurrence of the event indexed by `1`. For the
multi-dimensional, values of last axis denote the probabilities of
Converts probabilities into logits. For the binary, probs denotes the
probability of occurrence of the event indexed by `1`. For the
multi-dimensional, values of last axis denote the probabilities of
occurrence of each of the events.
"""
return (paddle.log(probs) - paddle.log1p(-probs)) \
......@@ -249,8 +249,8 @@ class Distribution(object):
def _logits_to_probs(self, logits, is_binary=False):
r"""
Converts logits into probabilities. For the binary, each value denotes
log odds, whereas for the multi-dimensional case, the values along the
Converts logits into probabilities. For the binary, each value denotes
log odds, whereas for the multi-dimensional case, the values along the
last dimension denote the log probabilities of the events.
"""
return paddle.nn.functional.sigmoid(logits) \
......
......@@ -18,19 +18,19 @@ from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
class ExponentialFamily(distribution.Distribution):
r"""
ExponentialFamily is the base class for probability distributions belonging
to exponential family, whose probability mass/density function has the
r"""
ExponentialFamily is the base class for probability distributions belonging
to exponential family, whose probability mass/density function has the
form defined below
ExponentialFamily is derived from `paddle.distribution.Distribution`.
.. math::
f_{F}(x; \theta) = \exp(\langle t(x), \theta\rangle - F(\theta) + k(x))
where :math:`\theta` denotes the natural parameters, :math:`t(x)` denotes
the sufficient statistic, :math:`F(\theta)` is the log normalizer function
where :math:`\theta` denotes the natural parameters, :math:`t(x)` denotes
the sufficient statistic, :math:`F(\theta)` is the log normalizer function
for a given family and :math:`k(x)` is the carrier measure.
Distribution belongs to exponential family referring to https://en.wikipedia.org/wiki/Exponential_family
......@@ -48,7 +48,7 @@ class ExponentialFamily(distribution.Distribution):
raise NotImplementedError
def entropy(self):
"""caculate entropy use `bregman divergence`
"""caculate entropy use `bregman divergence`
https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf
"""
entropy_value = -self._mean_carrier_measure
......
......@@ -20,17 +20,17 @@ class Independent(distribution.Distribution):
Reinterprets some of the batch dimensions of a distribution as event dimensions.
This is mainly useful for changing the shape of the result of
:meth:`log_prob`.
:meth:`log_prob`.
Args:
base (Distribution): The base distribution.
reinterpreted_batch_rank (int): The number of batch dimensions to
reinterpreted_batch_rank (int): The number of batch dimensions to
reinterpret as event dimensions.
Examples:
.. code-block:: python
import paddle
from paddle.distribution import independent
......
......@@ -35,7 +35,7 @@ def kl_divergence(p, q):
.. math::
KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x
KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x
Args:
p (Distribution): ``Distribution`` object.
......@@ -64,11 +64,11 @@ def kl_divergence(p, q):
def register_kl(cls_p, cls_q):
"""Decorator for register a KL divergence implemention function.
The ``kl_divergence(p, q)`` function will search concrete implemention
functions registered by ``register_kl``, according to multi-dispatch pattern.
If an implemention function is found, it will return the result, otherwise,
it will raise ``NotImplementError`` exception. Users can register
implemention funciton by the decorator.
The ``kl_divergence(p, q)`` function will search concrete implemention
functions registered by ``register_kl``, according to multi-dispatch pattern.
If an implemention function is found, it will return the result, otherwise,
it will raise ``NotImplementError`` exception. Users can register
implemention funciton by the decorator.
Args:
cls_p(Distribution): Subclass derived from ``Distribution``.
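A hedged sketch of registering a custom implementation (the body below is a toy placeholder, not a real KL formula):

.. code-block:: python

    from paddle.distribution import Beta, kl_divergence, register_kl

    @register_kl(Beta, Beta)
    def kl_beta_beta(p, q):
        # toy placeholder; a real implementation returns the analytic KL divergence
        return p.alpha - q.alpha

    p = Beta(alpha=0.5, beta=0.5)
    q = Beta(alpha=0.3, beta=0.7)
    print(kl_divergence(p, q))   # dispatches to kl_beta_beta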
......
......@@ -24,14 +24,14 @@ except:
class Multinomial(distribution.Distribution):
r"""
Multinomial distribution parameterized by :attr:`total_count` and
Multinomial distribution parameterized by :attr:`total_count` and
:attr:`probs`.
In probability theory, the multinomial distribution is a generalization of
In probability theory, the multinomial distribution is a generalization of
the binomial distribution, it models the probability of counts for each side
of a k-sided die rolled n times. When k is 2 and n is 1, the multinomial is
the bernoulli distribution, when k is 2 and n is greater than 1, it is the
binomial distribution, when k is greater than 2 and n is 1, it is the
of a k-sided die rolled n times. When k is 2 and n is 1, the multinomial is
the bernoulli distribution, when k is 2 and n is greater than 1, it is the
binomial distribution, when k is greater than 2 and n is 1, it is the
categorical distribution.
The probability mass function (PMF) for multinomial is
......@@ -40,18 +40,18 @@ class Multinomial(distribution.Distribution):
f(x_1, ..., x_k; n, p_1,...,p_k) = \frac{n!}{x_1!...x_k!}p_1^{x_1}...p_k^{x_k}
where, :math:`n` is number of trials, k is the number of categories,
:math:`p_i` denote probability of a trial falling into each category,
:math:`{\textstyle \sum_{i=1}^{k}p_i=1}, p_i \ge 0`, and :math:`x_i` denote
count of each category.
where, :math:`n` is number of trials, k is the number of categories,
:math:`p_i` denote probability of a trial falling into each category,
:math:`{\textstyle \sum_{i=1}^{k}p_i=1}, p_i \ge 0`, and :math:`x_i` denote
count of each category.
Args:
total_count (int): Number of trials.
probs (Tensor): Probability of a trial falling into each category. Last
probs (Tensor): Probability of a trial falling into each category. Last
axis of probs indexes over categories, other axes index over batches.
Probs value should be between [0, 1], and sum to 1 along the last axis. If
the values sum over 1, they will be normalized to sum to 1 along the last
axis.
Probs value should be between [0, 1], and sum to 1 along the last axis. If
the values sum over 1, they will be normalized to sum to 1 along the last
axis.
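A short hedged sketch:

.. code-block:: python

    import paddle
    from paddle.distribution import Multinomial

    multinomial = Multinomial(total_count=10, probs=paddle.to_tensor([0.2, 0.3, 0.5]))
    counts = multinomial.sample((2,))        # 2 draws, each summing to 10
    print(counts)
    print(multinomial.log_prob(counts))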
Examples:
......
......@@ -55,7 +55,7 @@ class Normal(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
from paddle.distribution import Normal
......@@ -248,7 +248,7 @@ class Normal(distribution.Distribution):
.. math::
ratio = \\frac{\sigma_0}{\sigma_1}
.. math::
diff = \mu_1 - \mu_0
......
......@@ -50,45 +50,45 @@ class Type(enum.Enum):
class Transform(object):
r"""Base class for the transformations of random variables.
``Transform`` can be used to represent any differentiable and injective
function from the subset of :math:`R^n` to subset of :math:`R^m`, generally
used for transforming a random sample generated by ``Distribution``
instance.
Suppose :math:`X` is a K-dimensional random variable with probability
density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may
be defined by transforming :math:`X` with a suitably well-behaved function
:math:`f`. It suffices for what follows to note that if f is one-to-one and
its inverse :math:`f^{-1}` has a well-defined Jacobian, then the density of
``Transform`` can be used to represent any differentiable and injective
function from the subset of :math:`R^n` to subset of :math:`R^m`, generally
used for transforming a random sample generated by ``Distribution``
instance.
Suppose :math:`X` is a K-dimensional random variable with probability
density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may
be defined by transforming :math:`X` with a suitably well-behaved function
:math:`f`. It suffices for what follows to note that if f is one-to-one and
its inverse :math:`f^{-1}` has a well-defined Jacobian, then the density of
:math:`Y` is
.. math::
p_Y(y) = p_X(f^{-1}(y)) |det J_{f^{-1}}(y)|
where det is the matrix determinant operation and :math:`J_{f^{-1}}(y)` is
where det is the matrix determinant operation and :math:`J_{f^{-1}}(y)` is
the Jacobian matrix of :math:`f^{-1}` evaluated at :math:`y`.
Taking :math:`x = f^{-1}(y)`, the Jacobian matrix is defined by
.. math::
J(y) = \begin{bmatrix}
{\frac{\partial x_1}{\partial y_1}} &{\frac{\partial x_1}{\partial y_2}}
{\frac{\partial x_1}{\partial y_1}} &{\frac{\partial x_1}{\partial y_2}}
&{\cdots} &{\frac{\partial x_1}{\partial y_K}} \\
{\frac{\partial x_2}{\partial y_1}} &{\frac{\partial x_2}
{\partial y_2}}&{\cdots} &{\frac{\partial x_2}{\partial y_K}} \\
{\vdots} &{\vdots} &{\ddots} &{\vdots}\\
{\frac{\partial x_K}{\partial y_1}} &{\frac{\partial x_K}{\partial y_2}}
&{\cdots} &{\frac{\partial x_K}{\partial y_K}}
{\frac{\partial x_K}{\partial y_1}} &{\frac{\partial x_K}{\partial y_2}}
&{\cdots} &{\frac{\partial x_K}{\partial y_K}}
\end{bmatrix}
A ``Transform`` can be characterized by three operations:
#. forward
Forward implements :math:`x \rightarrow f(x)`, and is used to convert
Forward implements :math:`x \rightarrow f(x)`, and is used to convert
one random outcome into another.
#. inverse
Undoes the transformation :math:`y \rightarrow f^{-1}(y)`.
Undoes the transformation :math:`y \rightarrow f^{-1}(y)`.
#. log_det_jacobian
The log of the absolute value of the determinant of the matrix of all
first-order partial derivatives of the inverse function.
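These three operations can be seen on a concrete subclass; a hedged sketch using ``ExpTransform``:

.. code-block:: python

    import paddle
    from paddle.distribution import ExpTransform

    t = ExpTransform()
    x = paddle.to_tensor([0.0, 1.0, 2.0])

    y = t.forward(x)                        # exp(x)
    print(t.inverse(y))                     # recovers x
    print(t.forward_log_det_jacobian(x))    # log|d exp(x)/dx| = x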
......@@ -121,14 +121,14 @@ class Transform(object):
return Type.is_injective(cls._type)
def __call__(self, input):
"""Make this instance as a callable object. The return value is
depening on the input type.
"""Make this instance as a callable object. The return value is
depening on the input type.
* If the input is a ``Tensor`` instance, return
* If the input is a ``Tensor`` instance, return
``self.forward(input)`` .
* If the input is a ``Distribution`` instance, return
* If the input is a ``Distribution`` instance, return
``TransformedDistribution(base=input, transforms=[self])`` .
* If the input is a ``Transform`` instance, return
* If the input is a ``Transform`` instance, return
``ChainTransform([self, input])`` .
Args:
......@@ -145,12 +145,12 @@ class Transform(object):
return self.forward(x)
def forward(self, x):
"""Forward transformation with mapping :math:`y = f(x)`.
"""Forward transformation with mapping :math:`y = f(x)`.
Useful for turning one random outcome into another.
Args:
x (Tensor): Input parameter, generally a sample generated
x (Tensor): Input parameter, generally a sample generated
from ``Distribution``.
Returns:
......@@ -166,7 +166,7 @@ class Transform(object):
return self._forward(x)
def inverse(self, y):
"""Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing"
"""Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing"
a transformation to compute one probability in terms of another.
Args:
......@@ -185,15 +185,15 @@ class Transform(object):
return self._inverse(y)
def forward_log_det_jacobian(self, x):
"""The log of the absolute value of the determinant of the matrix of all
"""The log of the absolute value of the determinant of the matrix of all
first-order partial derivatives of the inverse function.
Args:
x (Tensor): Input tensor, generally is a sample generated from
x (Tensor): Input tensor, generally is a sample generated from
``Distribution``
Returns:
Tensor: The log of the absolute value of Jacobian determinant.
Tensor: The log of the absolute value of Jacobian determinant.
"""
if not isinstance(x, paddle.fluid.framework.Variable):
raise TypeError(
......@@ -212,11 +212,11 @@ class Transform(object):
def inverse_log_det_jacobian(self, y):
"""Compute :math:`log|det J_{f^{-1}}(y)|`.
Note that ``forward_log_det_jacobian`` is the negative of this function,
Note that ``forward_log_det_jacobian`` is the negative of this function,
evaluated at :math:`f^{-1}(y)`.
Args:
y (Tensor): The input to the ``inverse`` Jacobian determinant
y (Tensor): The input to the ``inverse`` Jacobian determinant
evaluation.
Returns:
......@@ -269,13 +269,13 @@ class Transform(object):
return variable.real
def _forward(self, x):
"""Inner method for publid API ``forward``, subclass should
"""Inner method for publid API ``forward``, subclass should
overwrite this method for supporting forward transformation.
"""
raise NotImplementedError('Forward not implemented')
def _inverse(self, y):
"""Inner method of public API ``inverse``, subclass should
"""Inner method of public API ``inverse``, subclass should
overwrite this method for supporting inverse transformation.
"""
raise NotImplementedError('Inverse not implemented')
......@@ -301,35 +301,35 @@ class Transform(object):
'is implemented. One of them is required')
def _forward_shape(self, shape):
"""Inner method called by ``forward_shape``, which is used to infer the
forward shape. Subclass should overwrite this method for supporting
"""Inner method called by ``forward_shape``, which is used to infer the
forward shape. Subclass should overwrite this method for supporting
``forward_shape``.
"""
return shape
def _inverse_shape(self, shape):
"""Inner method called by ``inverse_shape``, whic is used to infer the
invese shape. Subclass should overwrite this method for supporting
"""Inner method called by ``inverse_shape``, whic is used to infer the
invese shape. Subclass should overwrite this method for supporting
``inverse_shape``.
"""
return shape
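A hypothetical subclass showing which inner hooks are intended to be overridden (a sketch only; the exact set of required attributes, such as the transform type or variable domains, may vary across Paddle versions):
.. code-block:: python

    import paddle
    from paddle.distribution import Transform

    class SquareTransform(Transform):
        # Hypothetical elementwise transform y = x ** 2, restricted to x > 0
        # so that the mapping stays bijective.
        def _forward(self, x):
            return x * x

        def _inverse(self, y):
            return paddle.sqrt(y)

        def _forward_log_det_jacobian(self, x):
            return paddle.log(2. * x)

        def _forward_shape(self, shape):
            return shape    # elementwise, so the shape is unchanged

        def _inverse_shape(self, shape):
            return shape

    t = SquareTransform()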
class AbsTransform(Transform):
r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`,
r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`,
element-wise.
This non-injective transformation allows for transformations of scalar
distributions with the absolute value function, which maps ``(-inf, inf)``
This non-injective transformation allows for transformations of scalar
distributions with the absolute value function, which maps ``(-inf, inf)``
to ``[0, inf)`` .
* For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set inverse
* For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set inverse
``{x in (-inf, inf) : |x| = y}`` as a tuple, ``-y, y`` .
* For ``y`` equal to ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not
the set inverse (the set inverse is the singleton {0}), but "works" in
conjunction with ``TransformedDistribution`` to produce a left
* For ``y`` equal to ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not
the set inverse (the set inverse is the singleton {0}), but "works" in
conjunction with ``TransformedDistribution`` to produce a left
semi-continuous pdf.
* For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the
* For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the
wrong thing ``-y, y``. This is done for efficiency.
Examples:
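The behaviour described above can be sketched as follows (assuming ``paddle.distribution.AbsTransform``):
.. code-block:: python

    import paddle
    from paddle.distribution import AbsTransform

    abs_t = AbsTransform()
    print(abs_t.forward(paddle.to_tensor([-1., 0., 1.])))    # [1., 0., 1.]
    print(abs_t.inverse(paddle.to_tensor(1.)))               # the pair (-1., 1.) described above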
......@@ -388,7 +388,7 @@ class AbsTransform(Transform):
class AffineTransform(Transform):
r"""Affine transformation with mapping
r"""Affine transformation with mapping
:math:`y = \text{loc} + \text{scale} \times x`.
Args:
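A minimal sketch of the affine mapping and its log-det Jacobian (assuming the ``forward`` / ``inverse`` API described earlier in this file):
.. code-block:: python

    import paddle
    from paddle.distribution import AffineTransform

    affine = AffineTransform(paddle.to_tensor(1.), paddle.to_tensor(2.))
    x = paddle.to_tensor([0., 1., 2.])
    y = affine.forward(x)                      # 1 + 2 * x -> [1., 3., 5.]
    x_back = affine.inverse(y)                 # (y - 1) / 2 -> [0., 1., 2.]
    ldj = affine.forward_log_det_jacobian(x)   # log|scale| = log(2)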
......@@ -638,26 +638,26 @@ class ExpTransform(Transform):
class IndependentTransform(Transform):
r"""
``IndependentTransform`` wraps a base transformation and reinterprets
``IndependentTransform`` wraps a base transformation and reinterprets
some of the rightmost batch axes as event axes.
Generally, it is used to expand the event axes. This has no effect on the
forward or inverse transformation, but does sum out the
``reinterpreted_batch_rank`` rightmost dimensions in computing the determinant
forward or inverse transformation, but does sum out the
``reinterpreted_batch_rank`` rightmost dimensions in computing the determinant
of the Jacobian matrix.
To see this, consider the ``ExpTransform`` applied to a Tensor which has
sample, batch, and event ``(S,B,E)`` shape semantics. Suppose the Tensor's
To see this, consider the ``ExpTransform`` applied to a Tensor which has
sample, batch, and event ``(S,B,E)`` shape semantics. Suppose the Tensor's
partitioned shape is ``(S=[4], B=[2, 2], E=[3])`` , reinterpreted_batch_rank
is 1. Then the reinterpreted Tensor's shape is ``(S=[4], B=[2], E=[2, 3])`` .
The shape returned by ``forward`` and ``inverse`` is unchanged, i.e.,
``[4,2,2,3]`` . However, the shape returned by ``inverse_log_det_jacobian``
is ``[4,2]``, because the Jacobian determinant is a reduction over the
The shape returned by ``forward`` and ``inverse`` is unchanged, i.e.,
``[4,2,2,3]`` . However, the shape returned by ``inverse_log_det_jacobian``
is ``[4,2]``, because the Jacobian determinant is a reduction over the
event dimensions.
Args:
base (Transform): The base transformation.
reinterpreted_batch_rank (int): The number of rightmost batch ranks that
reinterpreted_batch_rank (int): The number of rightmost batch ranks that
will be reinterpreted as event rank.
Examples:
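A sketch of the shape behaviour described above (assuming ``ExpTransform`` and ``IndependentTransform`` from ``paddle.distribution``; the exact reduced shape depends on the base transform's event rank):
.. code-block:: python

    import paddle
    from paddle.distribution import ExpTransform, IndependentTransform

    x = paddle.ones((4, 2, 2, 3))
    t = IndependentTransform(ExpTransform(), 1)

    y = t.forward(x)                     # shape is unchanged by the wrapper
    ldj = t.forward_log_det_jacobian(x)  # the reinterpreted rightmost dims are summed out
    print(y.shape, ldj.shape)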
......@@ -793,7 +793,7 @@ class PowerTransform(Transform):
class ReshapeTransform(Transform):
r"""Reshape the event shape of a tensor.
Note that ``in_event_shape`` and ``out_event_shape`` must have the same
Note that ``in_event_shape`` and ``out_event_shape`` must have the same
number of elements.
Args:
......@@ -943,8 +943,8 @@ class SigmoidTransform(Transform):
class SoftmaxTransform(Transform):
r"""Softmax transformation with mapping :math:`y=\exp(x)` then normalizing.
It's generally used to convert unconstrained space to the simplex. This mapping
is not injective, so ``forward_log_det_jacobian`` and
It's generally used to convert unconstrained space to the simplex. This mapping
is not injective, so ``forward_log_det_jacobian`` and
``inverse_log_det_jacobian`` are not implemented.
Examples:
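A small sketch of the simplex mapping (assuming ``paddle.distribution.SoftmaxTransform``):
.. code-block:: python

    import paddle
    from paddle.distribution import SoftmaxTransform

    t = SoftmaxTransform()
    x = paddle.to_tensor([[1., 2., 3.], [4., 5., 6.]])
    p = t.forward(x)    # each row now sums to 1
    x2 = t.inverse(p)   # not a true inverse, since the mapping is not injective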
......@@ -997,11 +997,11 @@ class SoftmaxTransform(Transform):
class StackTransform(Transform):
r""" ``StackTransform`` applies a sequence of transformations along the
r""" ``StackTransform`` applies a sequence of transformations along the
specified axis.
Args:
transforms(Sequence[Transform]): The sequence of transformations.
transforms(Sequence[Transform]): The sequence of transformations.
axis(int): The axis along which the transformations are applied.
Examples:
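A sketch of applying different transforms along one axis (assuming ``ExpTransform`` and ``PowerTransform`` from ``paddle.distribution``):
.. code-block:: python

    import paddle
    from paddle.distribution import ExpTransform, PowerTransform, StackTransform

    x = paddle.stack((paddle.to_tensor([1., 2., 3.]),) * 2, axis=1)
    t = StackTransform([ExpTransform(),
                        PowerTransform(paddle.to_tensor(2.))], axis=1)
    y = t.forward(x)    # column 0 -> exp(x), column 1 -> x ** 2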
......@@ -1102,7 +1102,7 @@ class StackTransform(Transform):
class StickBreakingTransform(Transform):
r"""Convert an unconstrained vector to the simplex with one additional
r"""Convert an unconstrained vector to the simplex with one additional
dimension by the stick-breaking construction.
Examples:
......@@ -1213,8 +1213,8 @@ class TanhTransform(Transform):
return y.atanh()
def _forward_log_det_jacobian(self, x):
"""We implicitly rely on _forward_log_det_jacobian rather than
explicitly implement ``_inverse_log_det_jacobian`` since directly using
"""We implicitly rely on _forward_log_det_jacobian rather than
explicitly implement ``_inverse_log_det_jacobian`` since directly using
``-tf.math.log1p(-tf.square(y))`` has lower numerical precision.
See details: https://github.com/tensorflow/probability/blob/master/tensorflow_probability/python/bijectors/tanh.py#L69-L80
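For reference, the numerically stable identity commonly used instead (as in the TensorFlow Probability implementation linked above) is :math:`\log(1-\tanh^2(x)) = 2(\log 2 - x - \mathrm{softplus}(-2x))`; a sketch comparing it with the naive form:
.. code-block:: python

    import math
    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([0.5, 2., 8.])
    naive = paddle.log1p(-paddle.tanh(x) ** 2)               # loses precision for large |x|
    stable = 2. * (math.log(2.) - x - F.softplus(-2. * x))   # stable form of the same quantity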
......
......@@ -20,8 +20,8 @@ from paddle.distribution import independent
class TransformedDistribution(distribution.Distribution):
r"""
Applies a sequence of Transforms to a base distribution.
r"""
Applies a sequence of Transforms to a base distribution.
Args:
base (Distribution): The base distribution.
......@@ -30,12 +30,12 @@ class TransformedDistribution(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
import paddle
from paddle.distribution import transformed_distribution
d = transformed_distribution.TransformedDistribution(
paddle.distribution.Normal(0., 1.),
paddle.distribution.Normal(0., 1.),
[paddle.distribution.AffineTransform(paddle.to_tensor(1.), paddle.to_tensor(2.))]
)
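Continuing the example above, the resulting object behaves like a regular distribution (a sketch assuming the standard ``sample`` and ``log_prob`` methods):
.. code-block:: python

    x = d.sample([10])                      # samples drawn from the base Normal, then affinely mapped
    lp = d.log_prob(paddle.to_tensor(0.5))  # density corrected by the transform's log-det Jacobian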
......
......@@ -37,7 +37,7 @@ class Variable(object):
return self._event_rank
def constraint(self, value):
"""Check whether the 'value' meet the constraint conditions of this
"""Check whether the 'value' meet the constraint conditions of this
random variable."""
return self._constraint(value)
......@@ -59,8 +59,8 @@ class Independent(Variable):
Args:
base (Variable): Base variable.
reinterpreted_batch_rank (int): The rightmost batch rank to be
reinterpreted.
reinterpreted_batch_rank (int): The rightmost batch rank to be
reinterpreted.
"""
def __init__(self, base, reinterpreted_batch_rank):
......
......@@ -19,9 +19,9 @@ import warnings
"""
Class of all kinds of Average.
All Averages are implemented purely in Python.
All Averages are implemented purely in Python.
They do not change Paddle's Program, nor do anything to
modify the NN model's configuration. They are simply
modify the NN model's configuration. They are simply
wrappers around Python functions.
"""
......@@ -41,9 +41,9 @@ class WeightedAverage(object):
"""
Calculate weighted average.
The average calculation is implemented purely in Python.
The average calculation is implemented purely in Python.
It does not change Paddle's Program, nor does it
modify the NN model's configuration. It is simply a
modify the NN model's configuration. It is simply a
wrapper around Python functions.
Examples:
......
......@@ -1409,11 +1409,11 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
"""
ops_to_remove = []
'''
NOTE(paddle-dev): while_grad op may hold some inputs which are not found
in the parent/forward block, and they are also the outputs of while_grad
op. These kinds of inputs are the recursive outputs inside while_grad op.
They should be considered as "already created" when scanning the inner
ops of while_grad ops.
NOTE(paddle-dev): while_grad op may hold some inputs which are not found
in the parent/forward block, and they are also the outputs of while_grad
op. These kinds of inputs are the recursive outputs inside while_grad op.
They should be considered as "already created" when scanning the inner
ops of while_grad ops.
'''
parent_op = _find_parent_op_(block)
parent_op_vars = []
......@@ -1452,7 +1452,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
continue
else:
'''
If the output is not empty and there is any grad input, find
If the output is not empty and there is any grad input, find
whether there is any existing input. If not, just remove it.
'''
if grad_var_ins:
......@@ -1464,11 +1464,11 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
if not existing_grad_var_ins:
'''
FIXME(paddle-dev, zengjinle): rnn_memory_helper_grad is used
in recurrent op. The input of this op does not even exist in
the program! Therefore, any dependency analysis would not
in recurrent op. The input of this op does not even exist in
the program! Therefore, any dependency analysis would not
work for this op! If I do not add the following code, this op
would be pruned, and the calculation result would be wrong.
Maybe we should re-design this op later...
would be pruned, and the calculation result would be wrong.
Maybe we should re-design this op later...
'''
if op_desc.type() not in ['rnn_memory_helper_grad']:
ops_to_remove.append(op_idx)
......@@ -2206,7 +2206,7 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
will be None.
Examples:
.. code-block:: python
:name: code-example
import paddle
......
......@@ -209,34 +209,34 @@ class ClipGradBase(object):
class ClipGradByValue(ClipGradBase):
"""
Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
- Any values less than min are set to ``min``.
- Any values greater than max are set to ``max``.
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
(for example: :ref:`api_paddle_optimizer_SGD`).
Note:
``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args:
max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. If not set by user, it will be set to ``-max``
min (float, optional): The minimum value to clip by. If not set by user, it will be set to ``-max``
automatically. In this case, ``max`` must be greater than 0.
Examples:
.. code-block:: python
import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
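The example above is truncated by the diff; it typically continues by handing the clip object to an optimizer. A sketch of that wiring (assuming ``paddle.nn.ClipGradByValue`` and the ``grad_clip`` argument of ``paddle.optimizer.SGD``):
.. code-block:: python

    loss.backward()
    clip = paddle.nn.ClipGradByValue(min=-1., max=1.)
    sgd = paddle.optimizer.SGD(learning_rate=0.1,
                               parameters=linear.parameters(),
                               grad_clip=clip)
    sgd.step()    # gradients are clipped element-wise to [-1, 1] before the update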
......@@ -300,17 +300,17 @@ class ClipGradByValue(ClipGradBase):
class ClipGradByNorm(ClipGradBase):
r"""
Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
- If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.
- If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
(for example: :ref:`api_paddle_optimizer_SGD`).
The clipping formula is:
.. math::
......@@ -329,7 +329,7 @@ class ClipGradByNorm(ClipGradBase):
norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}
Note:
``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args:
......@@ -337,12 +337,12 @@ class ClipGradByNorm(ClipGradBase):
Examples:
.. code-block:: python
import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
......@@ -415,17 +415,17 @@ def _allow_pure_fp16_global_norm_clip(*args):
class ClipGradByGlobalNorm(ClipGradBase):
r"""
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
:math:`t\_list` , and limit it to ``clip_norm`` .
- If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.
- If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
(for example: :ref:`api_paddle_optimizer_SGD`).
The clipping formula is:
......@@ -441,7 +441,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
Note:
``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args:
......@@ -450,12 +450,12 @@ class ClipGradByGlobalNorm(ClipGradBase):
Examples:
.. code-block:: python
import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
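As with the classes above, the clip object is handed to the optimizer. A sketch (assuming ``paddle.nn.ClipGradByGlobalNorm``); the global norm here is :math:`\sqrt{\sum_i \|g_i\|_2^2}` over all clipped gradients:
.. code-block:: python

    loss.backward()
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    sgd = paddle.optimizer.SGD(learning_rate=0.1,
                               parameters=linear.parameters(),
                               grad_clip=clip)
    sgd.step()    # all gradients are rescaled by clip_norm / max(global_norm, clip_norm)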
......@@ -719,23 +719,23 @@ class ClipGradByGlobalNorm(ClipGradBase):
def set_gradient_clip(clip, param_list=None, program=None):
"""
:api_attr: Static Graph
Warning:
This API must be used after building the network, and before ``minimize`` ,
and it may be removed in future releases, so it is not recommended.
This API must be used after building the network, and before ``minimize`` ,
and it may be removed in future releases, so it is not recommended.
It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
which is a better way to clip gradients. There are three clipping strategies:
:ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` .
Use ``param_list`` to specify the parameters that require gradient clipping.
Args:
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
gradient clipping.
param_list (list(Variable), optional): Parameters that require gradient clipping.
It can be a list of parameters or a list of parameter names.
......@@ -789,7 +789,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
param_list=[param_var1, param_var2])
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss)
# network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
with fluid.program_guard(fluid.Program(), fluid.Program()):
loss = network()
......@@ -800,10 +800,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
# Set the gradient clipping strategy: clip2
sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
sgd.minimize(loss)
# 'set_gradient_clip' will not take effect when setting has a conflict,
# 'set_gradient_clip' will not take effect when the settings conflict,
# 'set_gradient_clip' will not take effect when the settings conflict,
"""
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
"and may be deprecated in future! "
......
......@@ -102,7 +102,7 @@ def _should_broadcast_or_not_exists(program, var_name):
class CompiledProgram(object):
"""
:api_attr: Static Graph
The CompiledProgram is used to transform a program or graph for
various optimizations according to the configuration of build_strategy,
for example, the operators' fusion in the computation graph, memory
......@@ -187,12 +187,12 @@ class CompiledProgram(object):
exec_strategy to set some optimizations that can be applied during the construction
and computation of the Graph, such as reducing the number of AllReduce operations,
specifying the size of the thread pool used in the computation Graph running the model,
and so on.
and so on.
.. note::
If build_strategy is specified when building CompiledProgram and calling
with_data_parallel, build_strategy in CompiledProgram will be overwritten; therefore,
for data parallel training, it is recommended to set build_strategy when calling the
If build_strategy is specified when building CompiledProgram and calling
with_data_parallel, build_strategy in CompiledProgram will be overwritten; therefore,
for data parallel training, it is recommended to set build_strategy when calling the
with_data_parallel interface.
Args:
......@@ -228,7 +228,7 @@ class CompiledProgram(object):
export CPU_NUM=4, if the environment variable is not set, the executor will
add the variable to the environment variable and set its value to 1.
The default is None. If ``places`` is the list of string, the string in the list
can be ``cpu``, ``gpu:x``, where ``x`` is the index of the GPUs.
can be ``cpu``, ``gpu:x``, where ``x`` is the index of the GPUs.
Returns:
CompiledProgram
......@@ -270,7 +270,7 @@ class CompiledProgram(object):
static.default_main_program()).with_data_parallel(
loss_name=loss.name, places=parallel_places)
# NOTE: if not set share_vars_from=compiled_train_prog,
# the parameters used in test process are different with
# the parameters used in test process are different with
# the parameters used by train process
compiled_test_prog = static.CompiledProgram(
test_program).with_data_parallel(
......@@ -701,7 +701,7 @@ class IpuStrategy(object):
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -744,7 +744,7 @@ class IpuStrategy(object):
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -762,7 +762,7 @@ class IpuStrategy(object):
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -780,13 +780,13 @@ class IpuStrategy(object):
Args:
optimizer (Optimizer): Optimizer to be used in training.
Returns:
None.
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -812,13 +812,13 @@ class IpuStrategy(object):
Args:
optimizer (Optimizer): Optimizer to be parsed.
Returns:
Dict.
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -857,15 +857,15 @@ class IpuStrategy(object):
is_training (bool, optional): True is training graph, False is inference graph. Default True, which means is training mode.
batch_size (int, optional): The batch-size in the graph. Used to make the graph batch-size fixed,
if the batch-size in the graph is dynamic. Default 1, which means the batch-size would be set to 1 if the batch-size is dynamic.
enable_manual_shard (bool, optional): Enable graph sharding or not. Only if num_ipus > 1 can enable_manual_shard be set to True.
Default False, which means disabled.
enable_manual_shard (bool, optional): Enable graph sharding or not. Only if num_ipus > 1 can enable_manual_shard be set to True.
Default False, which means disabled.
Returns:
None.
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -900,15 +900,15 @@ class IpuStrategy(object):
Set pipelining configuration to the IpuStrategy instance. Used to optimize the throughput performance.
Args:
enable_pipelining (bool, optional): Enable data pipelining between subgraphs. Only if enable_manual_shard=True can enable_pipelining be set to True.
enable_pipelining (bool, optional): Enable data pipelining between subgraphs. Only if enable_manual_shard=True can enable_pipelining be set to True.
Default False, which means disabled.
batches_per_step (int, optional): Set the batches per run in data pipelining mode. Only if enable_pipelining=True can batches_per_step be set to a value greater than 1.
Default 1, which means no data pipelining.
enable_gradient_accumulation (bool, optional): Enable to accumulate gradients before updating the weights in training mode. Only if enable_pipelining=True,
enable_gradient_accumulation can be set to True. Default False, which means no gradient accumulation.
accumulation_factor (int, optional): Specify the number of micro-batches to accumulate
enable_gradient_accumulation can be set to True. Default False, which means no gradient accumulation.
accumulation_factor (int, optional): Specify the number of micro-batches to accumulate
before applying the varUpdate. Default 1, which means disable the accumulation.
Returns:
None.
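A sketch of how these pipelining options are typically set together with the graph configuration (mirroring the ``IpuCompiledProgram`` example later in this diff; requires an IPU build of Paddle):
.. code-block:: python

    # required: ipu
    import paddle
    import paddle.static as static

    paddle.enable_static()
    ipu_strategy = static.IpuStrategy()
    ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
    ipu_strategy.set_pipelining_config(enable_pipelining=False,
                                       batches_per_step=1,
                                       enable_gradient_accumulation=False,
                                       accumulation_factor=1)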
......@@ -947,7 +947,7 @@ class IpuStrategy(object):
Args:
enable_fp16 (bool, optional): Enable FLOAT16 mode and transform FLOAT32 to FLOAT16. Default False, which means disable FLOAT16 mode.
Returns:
None.
......@@ -985,7 +985,7 @@ class IpuStrategy(object):
domain(str): domain name of custom op in popart.
version(int): version of custom op in popart.
Returns:
None.
......@@ -1021,7 +1021,7 @@ class IpuStrategy(object):
Args:
options(dict): dict of options.
Returns:
None.
......@@ -1051,7 +1051,7 @@ class IpuStrategy(object):
Args:
option(str): name of option.
Returns:
option value.
......@@ -1076,7 +1076,7 @@ class IpuStrategy(object):
Args:
pattern(string): the name of the pattern.
Returns:
None.
......@@ -1101,7 +1101,7 @@ class IpuStrategy(object):
Args:
pattern(string): the name of the pattern.
Returns:
None.
......@@ -1156,21 +1156,21 @@ class IpuCompiledProgram(object):
Args:
program(Program, optional): This parameter represents the :code:`Program`
to be executed. Default is None, which means the program will be set to
to be executed. Default is None, which means the program will be set to
the default program :code:`paddle.static.default_main_program()` .
scope(Scope, optional): The scope used to run this program, you can switch
it to different scope. Default is None, which means use the global
it to different scope. Default is None, which means use the global
scope :code:`paddle.static.global_scope()` .
ipu_strategy(IpuStrategy, optional): This argument is used to build the program with the
specified options, such as half computation, training or inference session, the number of IPUs, etc.
Default is None, which means build the program based on the default `ipu_strategy`.
Default is None, which means build the program based on the default `ipu_strategy`.
Returns:
IpuCompiledProgram
Example:
.. code-block:: python
# required: ipu
import paddle
......@@ -1181,12 +1181,12 @@ class IpuCompiledProgram(object):
a = static.data(name='data', shape=[None, 1], dtype='int32')
b = a + 1
main_prog = static.default_main_program()
ipu_strategy = static.IpuStrategy()
ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1)
ipu_strategy.set_precision_config(enable_fp16=False)
ipu_compiled_program = static.IpuCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy)
......@@ -1232,7 +1232,7 @@ class IpuCompiledProgram(object):
"""
This interface is used to compile the input Program to a program
to run the model on the ipu.
Args:
feed_list(list): This parameter represents the input Tensors of the model.
......@@ -1244,14 +1244,14 @@ class IpuCompiledProgram(object):
Example:
.. code-block:: python
# required: ipu
import paddle
import paddle.static as static
paddle.enable_static()
a = static.data(name='data', shape=[None, 1], dtype='int32')
b = a + 1
main_prog = static.default_main_program()
......@@ -1260,7 +1260,7 @@ class IpuCompiledProgram(object):
ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1)
ipu_strategy.set_precision_config(enable_fp16=False)
program = static.IpuCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy).compile([a.name], [b.name])
......
......@@ -49,7 +49,7 @@ def ctr_metric_bundle(input, label, ins_tag_weight=None):
label(Tensor): A 2D int Tensor indicating the label of the training
data. The height is batch size and width is always 1.
ins_tag_weight(Tensor): A 2D int Tensor indicating the ins_tag_weight of the training
data. 1 means real data, 0 means fake data.
data. 1 means real data, 0 means fake data.
A LoDTensor or Tensor with type float32,float64.
Returns:
......