提交 1a13420b 编写于 作者: L LiuChiaChi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into...

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-transformer-generate_square_subsequent_mask-api
......@@ -30,8 +30,10 @@ __global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers,
while (idy < K) {
int64_t id = ids[idy];
PADDLE_ENFORCE(id >= 0, "received id:", id);
PADDLE_ENFORCE(id < N, "received id:", id);
PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id);
PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N,
T *out = centers_diff + idy * D;
const T *x = X + idy * D;
const T *cent = centers + id * D;
......@@ -52,8 +54,9 @@ __global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids,
while (idy < K) {
int count = 1;
int64_t id = ids[idy];
PADDLE_ENFORCE(id >= 0, "received id:", id);
PADDLE_ENFORCE(id < N, "received id:", id);
PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id);
PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N,
for (int i = 0; i < K; i++) {
if (ids[i] == id) {
......@@ -69,8 +69,10 @@ template <typename T>
class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
"It must use CUDAPlace.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"CTCAlign operator CUDA kernel must use CUDAPlace "
"rather than CPUPlace."));
auto* input = ctx.Input<LoDTensor>("Input");
auto* output = ctx.Output<LoDTensor>("Output");
const int blank = ctx.Attr<int>("blank");
......@@ -72,8 +72,11 @@ class CTCAlignKernel : public framework::OpKernel<T> {
// check input dims and lod
input_dims[0], static_cast<int64_t>(input_lod[level].back()),
"The first dimension of Input(Input) should be equal to "
"the sum of all sequences' lengths.");
"The first dimension %d of CTCAlign operator Input(Input) should "
"be equal to "
"the sum of all sequences' lengths %d.",
input_dims[0], static_cast<int64_t>(input_lod[level].back())));
const size_t num_sequences = input_lod[level].size() - 1;
......@@ -42,21 +42,21 @@ class MVOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv");
auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Vec");
auto dim_vec = context->GetInputDim("Vec");
dim_x.size(), 2,
"The rank of input X should be 2, but is %d", dim_x.size()));
dim_y.size(), 1,
dim_vec.size(), 1,
"The rank of input Vec should be 1, but is %d", dim_y.size()));
PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true,
"The rank of input Vec should be 1, but is %d", dim_vec.size()));
PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0],
"The length of input X' second dim should equal the "
"length of input Vec,"
" but X[%d, %d], Vec[%d]",
dim_x[0], dim_x[1], dim_y[0]));
"X's second dimension is expected to be equal to "
"Vec's first dimension"
"but recieved X'shape = [%s], Vec's shape = [%s]",
dim_x, dim_vec));
framework::DDim dim_out = framework::make_ddim({dim_x[0]});
......@@ -19,7 +19,7 @@ namespace paddle {
namespace operators {
template <typename T>
__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout,
__global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout,
const T *vec, T *dx) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < m * n; idx += blockDim.x * gridDim.x) {
......@@ -52,33 +52,32 @@ class MVGradKernel<platform::CUDADeviceContext, T>
int m = dim_x[0];
int n = dim_x[1];
dx->Resize(framework::make_ddim({m * n}));
// get data ptr
const T *x_data = x->data<T>();
const T *vec_data = vec->data<T>();
const T *dout_data = dout->data<T>();
T *dx_data = dx->mutable_data<T>(context.GetPlace());
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
// calculate dx
auto stream = context.cuda_device_context().stream();
auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
if (dx) {
T *dx_data = dx->mutable_data<T>(context.GetPlace());
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
m, n, dout_data, vec_data, dx_data);
dx->Resize(framework::make_ddim({m, n}));
if (dvec) {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
// calculate dvec
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
} // namespace operators
......@@ -74,31 +74,31 @@ class MVGradKernel : public framework::OpKernel<T> {
int m = dim_x[0];
int n = dim_x[1];
dx->Resize(framework::make_ddim({m * n}));
// get data ptr
const T *x_data = x->data<T>();
const T *vec_data = vec->data<T>();
const T *dout_data = dout->data<T>();
if (dx) {
T *dx_data = dx->mutable_data<T>(context.GetPlace());
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
// calculate dx
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j)
for (int j = 0; j < n; ++j) {
dx_data[i * n + j] = dout_data[i] * vec_data[j];
if (dvec) {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
dx->Resize(framework::make_ddim({m, n}));
auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
// calculate dvec
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
} // namespace operators
......@@ -45,8 +45,10 @@ template <typename T>
class PoolCUDNNOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace.");
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
"CUDAPlace rather than CPUPlace."));
const Tensor *input = ctx.Input<Tensor>("X");
Tensor *output = ctx.Output<Tensor>("Out");
......@@ -175,8 +177,10 @@ template <typename T>
class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace.");
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
"CUDAPlace rather than CPUPlace."));
const Tensor *input = ctx.Input<Tensor>("X");
const Tensor *output = ctx.Input<Tensor>("Out");
......@@ -38,18 +38,22 @@ int PoolOutputSize(int input_size, int filter_size, int padding_1,
output_size, 0,
"ShapeError: the output size must be greater than 0. But received: "
"output_size = %d due to the settings of input_size(%d), padding(%d,%d), "
"the output size must be greater than 0. But received: "
"output_size = %d due to the settings of input_size(%d), "
"padding(%d,%d), "
"k_size(%d) and stride(%d). Please check again!",
output_size, input_size, padding_1, padding_2, filter_size, stride);
output_size, input_size, padding_1, padding_2, filter_size, stride));
return output_size;
void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
"X(Input) of Pooling should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
"Out(Output) of Pooling should not be null.");
ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of Pool operator is not found."));
ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of Pool operator is not found."));
std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
......@@ -65,28 +69,32 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
auto in_x_dims = ctx->GetInputDim("X");
in_x_dims.size() == 4 || in_x_dims.size() == 5, true,
"ShapeError: the input of Op(pool) should be 4-D or 5-D Tensor. But "
"the input of Op(pool) should be 4-D or 5-D Tensor. But "
"received: %u-D Tensor and it's shape is [%s].",
in_x_dims.size(), in_x_dims);
in_x_dims.size(), in_x_dims));
in_x_dims.size() - ksize.size(), 2U,
"ShapeError: the dimension of input minus the size of "
"the dimension of input minus the size of "
"Attr(ksize) must be euqal to 2 in Op(pool). "
"But received: the dimension of input minus the size "
"of Attr(ksize) is %d, the "
"input's dimension is %d, the shape of input "
"is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
ksize.size(), framework::make_ddim(ksize));
ksize.size(), framework::make_ddim(ksize)));
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
"ShapeError: the size of Attr(ksize) and Attr(strides) in "
ksize.size(), strides.size(),
"the size of Attr(ksize) and Attr(strides) in "
"Op(pool) must be equal. "
"But received: Attr(ksize)'s size is %d, Attr(strides)'s "
"size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
ksize.size(), strides.size(), framework::make_ddim(ksize),
// MKL-DNN Kernels are using NCHW order of dims description
// so we ignore data_format consideration for MKL-DNN kernel
......@@ -182,9 +190,12 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar(
void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
"Input(X) of Pool Gradoperator is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
"Input(X@GRAD) should not be null.");
"Input(X@GRAD) of Pool Gradoperator is not found."));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
......@@ -210,7 +221,8 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
"Float16 can only be used when CUDNN is used"));
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
......@@ -81,9 +81,11 @@ inline void UpdatePadding(std::vector<T>* paddings, const bool global_pooling,
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
} else {
data_dims.size() * 2, paddings->size(),
"Paddings size should be the same or twice as the pooling size.");
PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(),
"Paddings size %d should be the same or twice as the "
"pooling size %d.",
paddings->size(), data_dims.size() * 2));
// when padding_algorithm is "VALID" or "SAME"
......@@ -200,7 +202,10 @@ class PoolKernel : public framework::OpKernel<T> {
pool_process, exclusive, adaptive, out);
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
"Pool op only supports 2D and 3D input."));
......@@ -287,7 +292,10 @@ class PoolGradKernel : public framework::OpKernel<T> {
adaptive, in_x_grad);
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
"Pool op only supports 2D and 3D input."));
......@@ -46,8 +46,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
bool adaptive = ctx->Attrs().Get<bool>("adaptive");
PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
"Pooling intput should be 4-D or 5-D tensor.");
in_x_dims.size() == 4 || in_x_dims.size() == 5,
platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D "
"tensor but received %dD-Tensor",
if (ctx->Attrs().Get<bool>("global_pooling")) {
ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
......@@ -57,16 +60,21 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 2U,
in_x_dims.size() - ksize.size(), 2U,
"Input size and pooling size should be consistent."));
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
"The input size %d minus the kernel size %d should equal to 2.",
in_x_dims.size(), ksize.size()));
ksize.size(), strides.size(),
"Strides size and pooling size should be the same."));
"Strides size %d and pooling size %d should be the same.",
strides.size(), ksize.size()));
ksize.size(), paddings.size(),
"Paddings size and pooling size should be the same."));
"Paddings size %d and pooling size %d should be the same.",
paddings.size(), ksize.size()));
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
if (adaptive) {
......@@ -61,7 +61,10 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
"Pool op only supports 2D and 3D input."));
......@@ -106,7 +109,10 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
paddings, adaptive, in_x_grad);
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
"Pool op only supports 2D and 3D input."));
......@@ -176,22 +176,31 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
int height = in_dims[2];
int width = in_dims[3];
output_channels * pooled_height * pooled_width,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width");
input_channels, output_channels * pooled_height * pooled_width,
"The channels %d of input X should equal the product of "
"output_channels %d x pooled_height %d x pooled_width %d.",
input_channels, output_channels, pooled_height, pooled_width));
int rois_num = rois->dims()[0];
if (rois_num == 0) return;
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
rois_batch_size, batch_size,
"The rois_batch_size and input(X) batch_size must be the same.");
PADDLE_ENFORCE_EQ(rois_batch_size, batch_size,
"The batch size of input(ROIs) and input(X) must be "
"the same but received batch size of input(ROIs) and "
"input(X) is %d and %d respectively.",
rois_batch_size, batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num, rois_num_with_lod));
// set rois batch id
framework::Tensor rois_batch_id_list;
......@@ -160,9 +160,14 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
int rois_batch_size = rois_num_t->numel();
rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
rois_batch_size, batch_size));
std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace, rois_num_list.data(), gplace,
rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
......@@ -178,10 +183,19 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
int rois_batch_size = rois_lod.size() - 1;
rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
rois_batch_size, batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num, rois_num_with_lod));
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
......@@ -30,6 +30,7 @@ __all__ = [
fleet = Fleet()
_final_strategy = fleet._final_strategy
init = fleet.init
is_first_worker = fleet.is_first_worker
worker_index = fleet.worker_index
......@@ -1244,8 +1244,7 @@ class DistributedStrategy(object):
if getattr(self.strategy, f.name):
draws += border + "\n"
draws += h1_format.format(
"{} = True, please check {}_configs".format(
f.name, f.name))
"{}=True <-> {}_configs".format(f.name, f.name))
draws += line + "\n"
my_configs = getattr(self.strategy,
f.name + "_configs")
......@@ -119,6 +119,8 @@ class Fleet(object):
self.strategy_compiler = None
self._is_collective = False
self._runtime_handle = None
self._util = None
self._context = {}
def init(self, role_maker=None, is_collective=False):
......@@ -569,8 +571,9 @@ class Fleet(object):
if strategy == None:
strategy = DistributedStrategy()
self.user_defined_strategy = strategy
self.valid_strategy = None
self._user_defined_strategy = copy.deepcopy(strategy)
self._context = {}
return self
......@@ -909,6 +912,15 @@ class Fleet(object):
# imitate target optimizer retrieval
return self.user_defined_optimizer.clear_grad()
def _final_strategy(self):
if "valid_strategy" not in self._context:
"WARNING: You may need to call minimize function before this function is called"
return {}
return self._context["valid_strategy"]
def minimize(self,
......@@ -958,12 +970,15 @@ class Fleet(object):
# for more examples, please reference https://github.com/PaddlePaddle/FleetX
context = {}
context["user_defined_strategy"] = copy.deepcopy(
if paddle.fluid.framework.in_dygraph_mode():
# imitate target optimizer retrieval
target_opt = self.user_defined_optimizer
self._context = context
return target_opt.minimize(loss)
context = {}
# cache original feed forward program
self.origin_main_program = loss.block.program
context["origin_main_program"] = self.origin_main_program
......@@ -984,17 +999,19 @@ class Fleet(object):
context["user_defined_strategy"] = copy.copy(self.user_defined_strategy)
context["user_defined_strategy"] = copy.deepcopy(
copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)
# trigger the auto-parallel in very strict condition
# strategy = DistributedStrategy()
# strategy.auto = True
# optimizer = paddle.optimizer.SGD(learning_rate=0.1)
# optimizer = fleet.distributed_optimizer(optimizer, strategy)
if self.user_defined_strategy._is_strict_auto():
if copy_user_defined_strategy._is_strict_auto():
# turn on all the strategy for each optimizer
for opt in distributed_optimizer_list:
opt._enable_strategy(self.user_defined_strategy, context)
opt._enable_strategy(copy_user_defined_strategy, context)
valid_optimizer_list = []
valid_graph_optimizer_list = []
......@@ -1003,7 +1020,7 @@ class Fleet(object):
for opt in distributed_optimizer_list:
opt._set_basic_info(loss, self._role_maker,
if opt._can_apply() and not opt._is_graph_out():
elif opt._can_apply() and opt._is_graph_out():
......@@ -1014,13 +1031,15 @@ class Fleet(object):
meta_optimizer, graph_optimizer = \
loss, self._role_maker, self.user_defined_optimizer,
self.user_defined_strategy, valid_optimizer_list,
copy_user_defined_strategy, valid_optimizer_list,
valid_strategy = self.strategy_compiler._get_valid_strategy(
self.user_defined_strategy, can_not_apply_optimizer_list)
copy_user_defined_strategy, can_not_apply_optimizer_list)
context["valid_strategy"] = copy.deepcopy(valid_strategy)
context["valid_strategy"] = valid_strategy
self._context = context
self.valid_strategy = valid_strategy
......@@ -1291,17 +1291,17 @@ def append_backward(loss,
It will be automatically invoked by the optimizer's `minimize` function.
loss( :ref:`api_guide_Variable_en` ): The loss variable of the network.
parameter_list(list[Variable|str], optional): List of Parameters or Parameter.names
loss(Tensor): The loss Tensor of the network.
parameter_list(list[Tensor|str], optional): List of Parameters or Parameter.names
that need to be updated by optimizers.
If it is None, all parameters
will be updated.
Default: None.
no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with
no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All Tensors with
`stop_gradient=True` from all blocks will
be automatically added into this set.
If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
Default: None.
callbacks(list[callable object], optional): List of callback functions.
The callbacks are used for
......@@ -1312,70 +1312,73 @@ def append_backward(loss,
new gradient operator is added
into the program. The callable
object must have two input
parameters: 'block' and 'context'.
The 'block' is the :ref:`api_guide_Block_en` which
parameters: ``block`` and ``context`` .
The ``block`` is the :ref:`api_guide_Block_en` which
the new gradient operator will
be added to. The 'context' is a
be added to. The ``context`` is a
map, whose keys are gradient
variable names and values are
corresponding original :ref:`api_guide_Variable_en` .
In addition to this, the 'context'
Tensor names and values are
corresponding original :ref:`api_guide_tensor_en` .
In addition to this, the ``context``
has another special key-value pair:
the key is string '__current_op_desc__'
the key is string ``__current_op_desc__``
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
Default: None.
list of tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ): Pairs of parameter and its corresponding gradients.
The key is the parameter and the value is gradient variable.
list of tuple ( :ref:`api_guide_tensor_en` , :ref:`api_guide_tensor_en` ): Pairs of parameter and its corresponding gradients.
The key is the parameter and the value is gradient Tensor.
AssertionError: If `loss` is not an instance of Variable.
AssertionError: If ``loss`` is not an instance of Tensor.
.. code-block:: python
import paddle.fluid as fluid
import paddle
import paddle.nn.functional as F
x = fluid.data(name='x', shape=[None, 13], dtype='int64')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
x_emb = fluid.embedding(x, size=[100, 256])
y_predict = fluid.layers.fc(input=x_emb, size=1, act=None, name='my_fc')
loss = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(loss)
x = paddle.static.data(name='x', shape=[None, 13], dtype='int64')
y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
x_emb = paddle.static.nn.embedding(x, size=[100, 256])
y_predict = paddle.static.nn.fc(input=x_emb, size=1, act=None, name='my_fc')
loss = F.square_error_cost(input=y_predict, label=y)
avg_loss = paddle.mean(loss)
# Get all weights in main_program, not include bias.
all_weights = [param for param in fluid.default_main_program().block(0).all_parameters() if 'w_' in param.name]
all_weights = [param for param in paddle.static.default_main_program().block(0).all_parameters() if 'w_' in param.name]
all_weights_name = [w.name for w in all_weights]
# return all param_grads needed to be updated if parameter_list set default None.
p_g_list1 = fluid.backward.append_backward(loss=avg_loss)
p_g_list1 = paddle.static.append_backward(loss=avg_loss)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
# return the param_grads corresponding to parameter_list that can be list of param (Variable).
p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights)
# return the param_grads corresponding to parameter_list that can be list of param (Tensor).
p_g_list2 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# parameter_list can be list of param.name (str).
p_g_list3 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights_name)
p_g_list3 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights_name)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# no_grad_set can be set of Variables that means grad will be cut off from these Variables.
p_g_list4 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
# no_grad_set can be set of Tensors that means grad will be cut off from these Tensors.
p_g_list4 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
# output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
# no_grad_set can be set of Variable.name when the Variable is created inside layers and can't be specified explicitly.
p_g_list5 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
# no_grad_set can be set of Tensor.name when the Tensor is created inside layers and can't be specified explicitly.
p_g_list5 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# return [] because all param_grads are filtered by no_grad_set.
p_g_list6 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
check_type(loss, 'loss', framework.Variable,
if loss.op is None:
# the loss is from a cloned program. Find loss op manually.
......@@ -1387,7 +1390,7 @@ def append_backward(loss,
if callbacks is not None:
check_type(callbacks, 'callbacks', list,
program = loss.block.program
root_block = program.block(0)
......@@ -1727,21 +1730,21 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
Backpropagate the gradients of targets to inputs.
targets(Variable|list[Variable]): The target variables
inputs(Variable|list[Variable]): The input variables
target_gradients (Variable|list[Variable], optional): The gradient variables
targets(Tensor|list[Tensor]): The target Tensors
inputs(Tensor|list[Tensor]): The input Tensors
target_gradients (Tensor|list[Tensor], optional): The gradient Tensors
of targets, with the same shape as targets. If None, ones will
be created for them.
no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with
no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All Tensors with
`stop_gradient=True` from all blocks will
be automatically added into this set.
If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
Default: None.
(list[Variable]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable
(list[Tensor]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient Tensor
will be None
targets = _as_list(targets)
......@@ -1865,41 +1868,42 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
Backpropagate the gradients of targets to inputs.
targets (Variable|list[Variable]): The target variables.
inputs (Variable|list[Variable]): The input variables.
target_gradients (Variable|list[Variable], optional): The gradient variables
targets (Tensor|list[Tensor]): The target Tensors.
inputs (Tensor|list[Tensor]): The input Tensors.
target_gradients (Tensor|list[Tensor], optional): The gradient Tensor
of targets, with the same shape as targets. If None, ones will
be created for them.
no_grad_set (set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with `stop_gradient=True` from all blocks will
be automatically added into this set. If this parameter is not None, the Variables or Variable.names
no_grad_set (set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All Tensors with ``stop_gradient=True`` from all blocks will
be automatically added into this set. If this parameter is not None, the Tensors or Tensor.names
in this set will be added to the default set. Default: None.
(list[Variable]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable
(list[Tensor]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient Tensor
will be None.
.. code-block:: python
import paddle.fluid as fluid
import paddle
import paddle.nn.functional as F
x = fluid.data(name='x', shape=[None,2,8,8], dtype='float32')
x = paddle.static.data(name='x', shape=[None, 2, 8, 8], dtype='float32')
y = fluid.layers.conv2d(x, 4, 1, bias_attr=False)
y = fluid.layers.relu(y)
y = fluid.layers.conv2d(y, 4, 1, bias_attr=False)
y = fluid.layers.relu(y)
z = fluid.gradients([y], x)
y = paddle.static.nn.conv2d(x, 4, 1, bias_attr=False)
y = F.relu(y)
z = paddle.static.gradients([y], x)
print(z) # [var x@GRAD : fluid.VarType.LOD_TENSOR.shape(-1L, 2L, 8L, 8L).astype(VarType.FP32)]
check_type(targets, 'targets', (framework.Variable, list),
check_type(inputs, 'inputs', (framework.Variable, list),
check_type(target_gradients, 'target_gradients', (
framework.Variable, list, type(None)), 'fluid.backward.gradients')
framework.Variable, list, type(None)), 'paddle.static.gradients')
outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
return _as_list(outs)
......@@ -506,11 +506,12 @@ def name_scope(prefix=None):
:api_attr: Static Graph
Generate hierarchical name prefix for the operators.
Generate hierarchical name prefix for the operators in Static Graph.
This should only be used for debugging and visualization purposes.
Don't use it for serious analysis such as graph/program transformations.
Don't use it in dygraph mode, since it will cause a memory leak.
prefix(str, optional): prefix. Default is none.
......@@ -518,21 +519,22 @@ def name_scope(prefix=None):
.. code-block:: python
import paddle.fluid as fluid
with fluid.name_scope("s1"):
a = fluid.data(name='data', shape=[None, 1], dtype='int32')
import paddle
with paddle.static.name_scope("s1"):
a = paddle.data(name='data', shape=[None, 1], dtype='int32')
b = a + 1
with fluid.name_scope("s2"):
with paddle.static.name_scope("s2"):
c = b * 1
with fluid.name_scope("s3"):
with paddle.static.name_scope("s3"):
d = c / 1
with fluid.name_scope("s1"):
f = fluid.layers.pow(d, 2.0)
with fluid.name_scope("s4"):
with paddle.static.name_scope("s1"):
f = paddle.tensor.pow(d, 2.0)
with paddle.static.name_scope("s4"):
g = f - 1
# Op are created in the default main program.
for op in fluid.default_main_program().block(0).ops:
for op in paddle.static.default_main_program().block(0).ops:
# elementwise_add is created in /s1/
if op.type == 'elementwise_add':
assert op.desc.attr("op_namescope") == '/s1/'
......@@ -5396,13 +5398,13 @@ def program_guard(main_program, startup_program=None):
:api_attr: Static Graph
Change the global main program and startup program with `"with"` statement.
Layer functions in the Python `"with"` block will append operators and
variables to the new main programs.
Change the global main program and startup program with ``with`` statement.
Layer functions in the Python ``with`` block will append operators and
Tensors to the new main programs.
main_program(Program): New main program inside `"with"` statement.
startup_program(Program, optional): New startup program inside `"with"`
main_program(Program): New main program inside ``with`` statement.
startup_program(Program, optional): New startup program inside ``with``
statement. :code:`None` means not changing startup program,
default_startup_program is still used.
Default: None.
......@@ -5410,13 +5412,14 @@ def program_guard(main_program, startup_program=None):
.. code-block:: python
import paddle.fluid as fluid
import paddle
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
hidden = fluid.layers.fc(input=data, size=10, act='relu')
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program, startup_program):
data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
hidden = paddle.static.nn.fc(input=data, size=10, act='relu')
Notes: A temporary :code:`Program` can be used if the user does not need
to construct either the startup program or the main program.
......@@ -5424,20 +5427,22 @@ def program_guard(main_program, startup_program=None):
.. code-block:: python
import paddle.fluid as fluid
import paddle
main_program = fluid.Program()
main_program = paddle.static.Program()
# does not care about startup program. Just pass a temporary value.
with fluid.program_guard(main_program, fluid.Program()):
data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
with paddle.static.program_guard(main_program, paddle.static.Program()):
data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
from .data_feeder import check_type
check_type(main_program, 'main_program', Program, 'fluid.program_guard')
check_type(main_program, 'main_program', Program,
main_program = switch_main_program(main_program)
if startup_program is not None:
check_type(startup_program, 'startup_program', Program,
startup_program = switch_startup_program(startup_program)
......@@ -9287,8 +9287,8 @@ def pad2d(input,
than height-1. And the width dimension has the same condition.
input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format, which is a 4-D Tensor with data type float32.
paddings (Variable | List[int32]): The padding size. If padding is a List, it must
input (Tensor): The input image with [N, C, H, W] format or [N, H, W, C] format, which is a 4-D Tensor with data type float32.
paddings (Tensor | List[int32]): The padding size. If paddings is a List, it must
contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
Otherwise, it is a 1-D Tensor with shape [4]. Data type is int32.
Default is [0, 0, 0, 0].
......@@ -9304,10 +9304,7 @@ def pad2d(input,
name (str, optional) : The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name` .
Returns: a 4-D Tensor padded according to paddings and mode and data type is same as input.
Return Type: Variable
Returns: Tensor, a 4-D Tensor padded according to paddings and mode, with the same data type as input.
.. code-block:: text
......@@ -9340,9 +9337,33 @@ def pad2d(input,
Code Examples:
.. code-block:: python
import paddle.fluid as fluid
data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
result = fluid.layers.pad2d(input=data, paddings=[0, 1, 2, 3], mode='reflect')
import numpy as np
import paddle
import paddle.nn.functional as F
# example 1
x_shape = (1, 1, 3, 4)
x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1
tensor_x = paddle.to_tensor(x)
y = F.pad2d(tensor_x, paddings=[1, 2, 2, 1], pad_value=1, mode='constant')
# [[[[ 1. 1. 1. 1. 1. 1. 1.]
# [ 1. 1. 1. 2. 3. 4. 1.]
# [ 1. 1. 5. 6. 7. 8. 1.]
# [ 1. 1. 9. 10. 11. 12. 1.]
# [ 1. 1. 1. 1. 1. 1. 1.]
# [ 1. 1. 1. 1. 1. 1. 1.]]]]
# example 2
x_shape = (1, 1, 2, 3)
x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1
tensor_x = paddle.to_tensor(x)
y = F.pad2d(tensor_x, paddings=[1, 1, 1, 1], mode='reflect')
# [[[[5. 4. 5. 6. 5.]
# [2. 1. 2. 3. 2.]
# [5. 4. 5. 6. 5.]
# [2. 1. 2. 3. 2.]]]]
input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'],
......@@ -394,7 +394,8 @@ foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
py_test_modules(test_warpctc_op MODULES test_warpctc_op)
# disable test_warpctc_op
# py_test_modules(test_warpctc_op MODULES test_warpctc_op)
py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS})
py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS})
py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
......@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
......@@ -72,8 +72,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
......@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 800)
......@@ -18,6 +18,8 @@ import unittest
import paddle
import os
class TestFleetAMPOptimizer(unittest.TestCase):
def setUp(self):
......@@ -55,6 +57,8 @@ class TestFleetAMPOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
strategy = fleet._final_strategy()
ops = [op.type for op in avg_cost.block.ops]
self.assertIn('cast', ops)
self.assertIn('check_finite_and_unscale', ops)
......@@ -18,6 +18,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
class TestDistributedStrategyAuto(unittest.TestCase):
def setUp(self):
......@@ -167,6 +167,8 @@ class TestFleetDygraph(unittest.TestCase):
state_dict = adam.state_dict()
final_strategy = fleet._final_strategy()
class TestFleetBaseSingleRunCollective(unittest.TestCase):
def setUp(self):
......@@ -19,6 +19,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
class TestFleetLambMetaOptimizer(unittest.TestCase):
def setUp(self):
......@@ -19,6 +19,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
class TestFleetLarsMetaOptimizer(unittest.TestCase):
def setUp(self):
......@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.core as core
from paddle.static import program_guard, Program
from op_test import OpTest
......@@ -37,7 +38,7 @@ class TestMVOp(OpTest):
self.check_grad(['X', 'Vec'], 'Out')
def init_config(self):
self.x = np.random.random((5, 100)).astype("float64")
self.x = np.random.random((2, 100)).astype("float64")
self.vec = np.random.random((100)).astype("float64")
......@@ -57,17 +58,32 @@ class TestMVAPI(unittest.TestCase):
def test_static_graph(self):
for x_stop_gradient in [False, True]:
for vec_stop_gradient in [False, True]:
train_program = Program()
startup_program = Program()
self.input_x = np.random.rand(5, 100).astype("float64")
self.input_vec = np.random.rand(100).astype("float64")
data_x = paddle.static.data("x", shape=[5, 100], dtype="float64")
data_vec = paddle.static.data("vec", shape=[100], dtype="float64")
with program_guard(train_program, startup_program):
data_x = paddle.static.data(
"x", shape=[5, 100], dtype="float64")
data_vec = paddle.static.data(
"vec", shape=[100], dtype="float64")
data_x.stop_gradient = x_stop_gradient
data_vec.stop_gradient = vec_stop_gradient
result_vec = paddle.mv(data_x, data_vec)
self.place = paddle.CPUPlace()
exe = paddle.static.Executor(self.place)
res, = exe.run(feed={"x": self.input_x,
res, = exe.run(
feed={"x": self.input_x,
"vec": self.input_vec},
z_expected = np.array(np.dot(self.input_x, self.input_vec))
......@@ -30,7 +30,6 @@ from ...fluid.layers import nn, utils
from ...fluid.data_feeder import check_variable_and_dtype
from ...fluid.param_attr import ParamAttr
from ...fluid.layer_helper import LayerHelper
from .common import pad2d
def _is_list_or_tuple(input):
......@@ -14,24 +14,20 @@
__all__ = [
'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam',
'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer',
'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer',
'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer',
'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer',
'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer',
'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer',
'_LRScheduler', 'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR',
'PolynomialLR', 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR',
'LambdaLR', 'ReduceLROnPlateau', 'CosineAnnealingLR'
'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd',
'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer',
'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer',
'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR',
'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
'ReduceLROnPlateau', 'CosineAnnealingLR'
from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\
DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \
ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\
ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \
RecomputeOptimizer, LarsMomentumOptimizer
AdagradOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, \
FtrlOptimizer, AdadeltaOptimizer, ModelAverage, \
ExponentialMovingAverage, LookaheadOptimizer
from .optimizer import Optimizer
from .adam import Adam
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册