Commit dcc09dce authored by N nhzlx

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_ut_for_trt

......@@ -211,7 +211,7 @@ function(merge_static_libs TARGET_NAME)
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
#endif()
endforeach()
# Windows cmd returns an error in a clean env.
# COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
......@@ -255,7 +255,7 @@ function(cc_library TARGET_NAME)
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${cc_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
......@@ -298,11 +298,10 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
endif()
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(cc_test)
......@@ -366,11 +365,10 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
endif()
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(nv_test)
......@@ -558,26 +556,26 @@ function(paddle_protobuf_generate_cpp SRCS HDRS)
set(${HDRS})
if (MOBILE_INFERENCE)
set(EXTRA_FLAG "lite:")
else()
set(EXTRA_FLAG "")
endif()
foreach(FIL ${ARGN})
get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
get_filename_component(FIL_WE ${FIL} NAME_WE)
set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
list(APPEND ${SRCS} "${_protobuf_protoc_src}")
list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
add_custom_command(
OUTPUT "${_protobuf_protoc_src}"
"${_protobuf_protoc_hdr}"
COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-I${CMAKE_CURRENT_SOURCE_DIR}
--cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
DEPENDS ${ABS_FIL} protoc
......@@ -646,7 +644,7 @@ function(grpc_library TARGET_NAME)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
#FIXME(putcn): the following line is supposed to generate *.pb.h and cc, but
# somehow it didn't. Lines 602 to 604 patch this. Leaving this here
# for now to enable dist CI.
protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
......
......@@ -32,6 +32,15 @@ Normal
:members:
:noindex:
.. _api_fluid_initializer_TruncatedNormal:
TruncatedNormal
---------------
.. autoclass:: paddle.fluid.initializer.TruncatedNormal
:members:
:noindex:
.. _api_fluid_initializer_Xavier:
Xavier
......
......@@ -290,14 +290,6 @@ Recv
.. autofunction:: paddle.fluid.layers.Recv
:noindex:
.. _api_fluid_layers_open_recordio_file:
open_recordio_file
------------------
.. autofunction:: paddle.fluid.layers.open_recordio_file
:noindex:
.. _api_fluid_layers_open_files:
open_files
......
paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Program.copy_data_info_from ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.prune ArgSpec(args=['self', 'targets'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.rollback ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
......@@ -86,6 +79,7 @@ paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program
paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
paddle.fluid.initializer.TruncatedNormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0))
paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
......@@ -131,7 +125,7 @@ paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name
paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False))
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
......@@ -175,9 +169,9 @@ paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=Non
paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None)
......
......@@ -25,6 +25,10 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
in.place().which(), dst_place.which(),
"Currently, model parallelism is only supported between CPU and CUDA");
// NOTE(yy): TransDataDevice should wait for computation of input.
platform::DeviceContextPool::Instance().Get(in.place())->Wait();
platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
// FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and
// the enforced checks have been done in GetDeviceContext, so the
// `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program
......
......@@ -129,6 +129,9 @@ class GradOpDescMakerBase {
std::string ForwardOpType() const { return this->fwd_op_.Type(); }
protected:
const OpDesc& ForwardOp() const { return fwd_op_; }
private:
const OpDesc& fwd_op_;
const std::unordered_set<std::string>& no_grad_set_;
......
......@@ -183,28 +183,5 @@ void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
output->clear_blocks();
prune_impl(input, output, 0, -1, &dependent_vars);
}
void inference_optimize_impl(proto::ProgramDesc* input, int block_id) {
auto* op_field = input->mutable_blocks(block_id)->mutable_ops();
for (auto& op_desc : *op_field) {
for (auto& attr : *op_desc.mutable_attrs()) {
if (attr.name() == "is_test") {
attr.set_b(true);
break;
}
}
}
}
void InferenceOptimize(const proto::ProgramDesc& input,
proto::ProgramDesc* output) {
*output = input;
int num_blocks = output->blocks_size();
PADDLE_ENFORCE_GT(num_blocks, 0, "ProgramDesc must have at least one block");
for (int i = 0; i < num_blocks; ++i) {
inference_optimize_impl(output, i);
}
}
} // namespace framework
} // namespace paddle
......@@ -22,8 +22,5 @@ namespace framework {
void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output);
void InferenceOptimize(const proto::ProgramDesc& input,
proto::ProgramDesc* output);
} // namespace framework
} // namespace paddle
......@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
#include <numeric>
#include "paddle/fluid/framework/lod_rank_table.h"
......@@ -24,6 +25,50 @@ namespace operators {
using LoD = framework::LoD;
class ArrayToLoDFunctor;
template <typename DeviceContext>
struct ArrayToLoDFunctorImpl {
const ArrayToLoDFunctor *prev_functor_;
DeviceContext *dev_ctx_;
template <typename T>
void apply();
};
struct ArrayToLoDFunctor : public boost::static_visitor<void> {
std::vector<framework::Tensor> in;
mutable framework::Tensor *out;
template <typename Place>
void operator()(Place place) const {
auto &pool = platform::DeviceContextPool::Instance();
if (std::is_same<Place, platform::CPUPlace>::value) {
Apply(static_cast<platform::CPUDeviceContext *>(pool.Get(place)));
} else {
#ifdef PADDLE_WITH_CUDA
Apply(static_cast<platform::CUDADeviceContext *>(pool.Get(place)));
#else
PADDLE_THROW("Fluid is not compiled with CUDA");
#endif
}
}
template <typename DeviceContext>
void Apply(DeviceContext *dev_ctx) const {
ArrayToLoDFunctorImpl<DeviceContext> functor;
functor.dev_ctx_ = dev_ctx;
functor.prev_functor_ = this;
framework::VisitDataType(framework::ToDataType(out->type()), functor);
}
};
template <typename DeviceContext>
template <typename T>
void ArrayToLoDFunctorImpl<DeviceContext>::apply() {
math::ConcatFunctor<DeviceContext, T> func;
func(*dev_ctx_, prev_functor_->in, 0, prev_functor_->out);
}
class ArrayToLoDTensorOp : public framework::OperatorBase {
public:
ArrayToLoDTensorOp(const std::string &type,
......@@ -47,14 +92,18 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
int rank = x[0].dims().size();
platform::Place place = x[0].place();
std::type_index data_type = x[0].type();
framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
int64_t batch_size = x[0].dims()[0];
framework::DDim ins_dims = rank > 1
? framework::slice_ddim(x[0].dims(), 1, rank)
: framework::make_ddim({0});
for (size_t i = 1; i < x.size(); ++i) {
PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
auto ins_i_dims = rank > 1 ? framework::slice_ddim(x[i].dims(), 1, rank)
: framework::make_ddim({0});
PADDLE_ENFORCE_EQ(ins_i_dims, ins_dims,
"The dimension of the %zu'th element in LoDTensorArray "
"differs from previous ones.",
i);
PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
PADDLE_ENFORCE(x[i].place() == place,
"The place class of the %zu'th element in LoDTensorArray "
"differs from previous ones.",
i);
......@@ -82,13 +131,14 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
// Build LoDTensor `out`
framework::LoD *out_lod = out->mutable_lod();
out_lod->clear();
size_t out_offset = 0;
auto prefix_lod = rank_table.coarse_lod();
prefix_lod.emplace_back();
auto &cur_level_lod = prefix_lod.back();
cur_level_lod.push_back(0);
ArrayToLoDFunctor functor;
for (size_t idx : table_item_idx) {
cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
PADDLE_ENFORCE_LE(table_items[idx].length, x.size());
for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
x[x_idx].lod(), idx, idx + 1, 0);
......@@ -106,17 +156,11 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
if (len == 0) {
continue;
}
auto slice = out->Slice(out_offset, out_offset + len);
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::TensorCopy(x[x_idx].Slice(start_offset, end_offset), place,
dev_ctx, &slice);
out_offset += len;
functor.in.emplace_back(x[x_idx].Slice(start_offset, end_offset));
}
}
functor.out = out;
platform::VisitPlace(place, functor);
out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
}
};
......
......@@ -29,6 +29,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> output_size =
ctx->Attrs().Get<std::vector<int>>("output_size");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
......@@ -42,6 +44,10 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
if (output_size.size())
PADDLE_ENFORCE_EQ(output_size.size(), strides.size(),
"ConvTransposeOp output_size dimension and strides "
"dimension should be the same.");
PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
"ConvTransposeOp paddings dimension and strides "
"dimension should be the same.");
......@@ -55,8 +61,17 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
filter_extent);
auto infer_shape =
(in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + filter_extent;
if (output_size.size()) {
PADDLE_ENFORCE((output_size[i] >= infer_shape &&
output_size[i] < infer_shape + strides[i]),
"ConvTransposeOp output_size should be "
"in appropriate range.");
output_shape.push_back(output_size[i]);
} else {
output_shape.push_back(infer_shape);
}
}
ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
}
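For reference, a worked instance of the range check above (the numbers are illustrative): with in_dims[i + 2] = 7, strides[i] = 2, paddings[i] = 1, dilations[i] = 1, and filter_dims[i + 2] = 3,

.. math::

    filter\_extent = 1 \cdot (3 - 1) + 1 = 3, \qquad
    infer\_shape = (7 - 1) \cdot 2 - 2 \cdot 1 + 3 = 13

so output_size[i] must lie in [13, 13 + 2), i.e. it can be 13 or 14; any other value trips the PADDLE_ENFORCE.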
......@@ -103,6 +118,10 @@ void Conv2DTransposeOpMaker::Make() {
AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("output_size",
"(vector<int> default: []), the "
"size of the output tensor")
.SetDefault({});
AddAttr<int>("groups",
"(int default:1), the groups number of the convolution "
"transpose operator. ")
......@@ -192,7 +211,10 @@ void Conv3DTransposeOpMaker::Make() {
"Where N is batch size, C is "
"the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature.");
AddAttr<std::vector<int>>("output_size",
"(vector<int> default: []), the "
"size of the output tensor")
.SetDefault({});
AddAttr<std::vector<int>>(
"dilations",
"(vector<int> default:{1, 1, 1}), the "
......@@ -247,7 +269,7 @@ Parameters(strides, paddings) are three elements. These three elements represent
depth, height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
......
......@@ -86,10 +86,10 @@ class XeGradFunctor {
auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
for (size_t x_offset = sample_id * num_classes_;
x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
dx_[x_offset] =
(x_offset != x_is_true_offset || label_[sample_id] == ignore_index_)
? static_cast<T>(0)
: -dy_[sample_id] / x_[x_offset];
dx_[x_offset] = (x_offset != x_is_true_offset ||
label_[sample_id] == static_cast<int64_t>(ignore_index_))
? static_cast<T>(0)
: -dy_[sample_id] / x_[x_offset];
}
}
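A minimal NumPy sketch of what this functor computes per sample (the name xe_grad_ref is illustrative, not from the source): the gradient is -dy/x at the true-label position and zero elsewhere, and a whole row stays zero when its label equals ignore_index.

.. code-block:: python

    import numpy as np

    def xe_grad_ref(x, dy, label, ignore_index):
        # dx[i, j] = -dy[i] / x[i, j] only where j == label[i];
        # rows whose label equals ignore_index remain all-zero.
        dx = np.zeros_like(x)
        for i in range(x.shape[0]):
            if label[i] != ignore_index:
                dx[i, label[i]] = -dy[i] / x[i, label[i]]
        return dx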
......
......@@ -13,9 +13,45 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise_mul_op.h"
#include <string>
#include "paddle/fluid/operators/elementwise_op.h"
namespace paddle {
namespace operators {
class ElementwiseMulOpGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("elementwise_mul_grad");
op->SetInput("X", Input("X"));
op->SetInput("Y", Input("Y"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
return op;
}
};
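The gradient this maker wires up is the ordinary product rule, which is also why Out itself never needs to be an input: only X, Y, and the output gradient appear, exactly what Apply() registers. A NumPy sketch for same-shape inputs without broadcast (the name is illustrative):

.. code-block:: python

    import numpy as np

    def elementwise_mul_grad_ref(x, y, dout):
        # out = x * y  =>  dx = dout * y,  dy = dout * x
        return dout * y, dout * x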
class ElementwiseMulOpMaker : public ElementwiseOpMaker {
protected:
virtual std::string GetName() const { return "Mul"; }
virtual std::string GetEquation() const { return "Out = X \\\\odot Y"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y");
REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp,
ops::ElementwiseMulOpMaker, ops::ElementwiseOpInferVarType,
ops::ElementwiseMulOpGradDescMaker);
REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -93,8 +93,8 @@ class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> {
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* out = dout; // out is not necessary
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
......
......@@ -11,10 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <map>
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
......@@ -26,6 +29,61 @@ struct CopyRange {
size_t end;
};
struct LoDTensorToArrayFunctor;
template <typename DeviceContext>
struct LoDTensorToArrayFunctorImpl {
const LoDTensorToArrayFunctor *prev_functor_;
DeviceContext *dev_ctx_;
template <typename T>
void apply();
};
struct LoDTensorToArrayFunctor : public boost::static_visitor<void> {
std::vector<const framework::Tensor *> ref_inputs_;
mutable std::vector<framework::Tensor *> outputs_;
const framework::Tensor &input_;
explicit LoDTensorToArrayFunctor(const framework::Tensor &input)
: input_(input) {}
void AddOutput(framework::Tensor *t) {
outputs_.emplace_back(t);
ref_inputs_.emplace_back(t);
}
template <typename Place>
void operator()(Place place) const {
auto &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = pool.Get(place);
if (std::is_same<Place, platform::CPUPlace>::value) {
Apply(static_cast<platform::CPUDeviceContext *>(dev_ctx));
} else {
#ifdef PADDLE_WITH_CUDA
Apply(static_cast<platform::CUDADeviceContext *>(dev_ctx));
#else
PADDLE_THROW("Not compiled with cuda");
#endif
}
}
template <typename DeviceContext>
void Apply(DeviceContext *dev_ctx) const {
LoDTensorToArrayFunctorImpl<DeviceContext> func;
func.prev_functor_ = this;
func.dev_ctx_ = dev_ctx;
framework::VisitDataType(framework::ToDataType(input_.type()), func);
}
};
template <typename DeviceContext>
template <typename T>
void LoDTensorToArrayFunctorImpl<DeviceContext>::apply() {
math::ConcatGradFunctor<DeviceContext, T> func;
func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0,
&prev_functor_->outputs_);
}
class LoDTensorToArrayOp : public framework::OperatorBase {
public:
LoDTensorToArrayOp(const std::string &type,
......@@ -72,6 +130,11 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
}
}
auto &outputs = *const_cast<framework::Scope &>(scope)
.Var()
->GetMutable<std::map<size_t, framework::Tensor>>();
for (size_t i = 0; i < max_seq_len; ++i) {
auto &ranges = copy_ranges[i];
size_t height = std::accumulate(
......@@ -90,17 +153,16 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
// out[i][offset: offset+len] = x[each_range.begin: each_range.end]
auto slice = out[i].Slice(static_cast<int>(offset),
static_cast<int>(offset + len));
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::TensorCopy(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)),
x.place(), dev_ctx, &slice);
outputs.insert({each_range.begin, slice});
offset += len;
}
}
LoDTensorToArrayFunctor functor(x);
for (auto &out_pair : outputs) {
functor.AddOutput(&out_pair.second);
}
platform::VisitPlace(place, functor);
}
};
......
......@@ -59,7 +59,8 @@ class MatMulKernel : public framework::OpKernel<T> {
RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
auto mat_dim_b = math::CreateMatrixDescriptor(
ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y"));
blas.MatMul(x, mat_dim_a, y, mat_dim_b, T(1), out, T(0));
auto scale = static_cast<T>(context.Attr<float>("alpha"));
blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0));
}
};
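A NumPy sketch of the kernel's contract after this change, for the plain 2-D case (the name matmul_ref is illustrative), showing where alpha enters:

.. code-block:: python

    import numpy as np

    def matmul_ref(x, y, transpose_x=False, transpose_y=False, alpha=1.0):
        # mirrors blas.MatMul(x, ..., scale, out, 0): out = alpha * op(x) @ op(y)
        a = x.T if transpose_x else x
        b = y.T if transpose_y else y
        return alpha * (a @ b)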
......@@ -185,7 +186,8 @@ class MatMulGradKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0));
blas.MatMul(a, mat_dim_a, b, mat_dim_b,
static_cast<T>(context.Attr<float>("alpha")), out, T(0));
}
void CalcInputGrad(const framework::ExecutionContext &context,
......@@ -334,6 +336,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
R"DOC(If true, use the transpose of `Y`.
)DOC")
.SetDefault(false);
AddAttr<float>("alpha", "The scale of Out").SetDefault(1.0f);
AddComment(R"DOC(
MatMul Operator.
......
......@@ -156,12 +156,29 @@ class MulGradOp : public framework::OperatorWithKernel {
}
};
class MulOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
retv->SetType("mul_grad");
retv->SetInput("X", Input("X"));
retv->SetInput("Y", Input("Y"));
retv->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
retv->SetOutput(framework::GradVarName("X"), InputGrad("X"));
retv->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
retv->SetAttrMap(Attrs());
return retv;
}
};
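The grad op assembled above computes the usual matrix-product gradients. A NumPy sketch for the 2-D case (the name is illustrative):

.. code-block:: python

    import numpy as np

    def mul_grad_ref(x, y, dout):
        # out = x @ y  =>  dx = dout @ y.T,  dy = x.T @ dout
        return dout @ y.T, x.T @ dout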
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGradMaker);
REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -52,6 +52,12 @@ $$Out = scale*X$$
)DOC");
AddAttr<float>("scale", "The scaling factor of the scale operator.")
.SetDefault(1.0);
AddAttr<float>("bias", "The bias of the scale operator.").SetDefault(0.0);
AddAttr<bool>(
"bias_after_scale",
"Apply bias addition after or before scaling. It is useful for "
"numeric stability in some circumstances.")
.SetDefault(true);
}
};
......@@ -80,6 +86,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttr("scale", GetAttr("scale"));
grad_op->SetAttr("bias", 0.0f);
grad_op->SetAttr("bias_after_scale", true);
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
......
......@@ -34,6 +34,8 @@ class ScaleKernel : public framework::OpKernel<T> {
"in and out should have the same dim");
auto scale = static_cast<T>(ctx.Attr<float>("scale"));
auto bias = static_cast<T>(ctx.Attr<float>("bias"));
auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
auto& in_slr = in_var->Get<framework::SelectedRows>();
......@@ -45,7 +47,11 @@ class ScaleKernel : public framework::OpKernel<T> {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
eigen_out.device(dev) = scale * eigen_in;
if (bias_after_scale) {
eigen_out.device(dev) = scale * eigen_in + bias;
} else {
eigen_out.device(dev) = scale * (eigen_in + bias);
}
}
};
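A one-line NumPy reference for the two branches above (the name is illustrative). It also makes the grad maker's choice of bias=0.0f clear: d(scale * x + bias)/dx = scale regardless of bias.

.. code-block:: python

    import numpy as np

    def scale_ref(x, scale=1.0, bias=0.0, bias_after_scale=True):
        # bias_after_scale=True:  out = scale * x + bias
        # bias_after_scale=False: out = scale * (x + bias)
        return scale * x + bias if bias_after_scale else scale * (x + bias)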
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <limits>
#include <random>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
// reference: https://gist.github.com/lakshayg/d80172fe5ae3c5d2c2aedb53c250320e
template <typename T>
T Erfinv(T x) {
if (x < -1 || x > 1) {
return std::numeric_limits<T>::quiet_NaN();
} else if (x == 1.0) {
return std::numeric_limits<T>::infinity();
} else if (x == -1.0) {
return -std::numeric_limits<T>::infinity();
}
const T LN2 = 6.931471805599453094172321214581e-1;
const T A0 = 1.1975323115670912564578e0;
const T A1 = 4.7072688112383978012285e1;
const T A2 = 6.9706266534389598238465e2;
const T A3 = 4.8548868893843886794648e3;
const T A4 = 1.6235862515167575384252e4;
const T A5 = 2.3782041382114385731252e4;
const T A6 = 1.1819493347062294404278e4;
const T A7 = 8.8709406962545514830200e2;
const T B0 = 1.0000000000000000000e0;
const T B1 = 4.2313330701600911252e1;
const T B2 = 6.8718700749205790830e2;
const T B3 = 5.3941960214247511077e3;
const T B4 = 2.1213794301586595867e4;
const T B5 = 3.9307895800092710610e4;
const T B6 = 2.8729085735721942674e4;
const T B7 = 5.2264952788528545610e3;
const T C0 = 1.42343711074968357734e0;
const T C1 = 4.63033784615654529590e0;
const T C2 = 5.76949722146069140550e0;
const T C3 = 3.64784832476320460504e0;
const T C4 = 1.27045825245236838258e0;
const T C5 = 2.41780725177450611770e-1;
const T C6 = 2.27238449892691845833e-2;
const T C7 = 7.74545014278341407640e-4;
const T D0 = 1.4142135623730950488016887e0;
const T D1 = 2.9036514445419946173133295e0;
const T D2 = 2.3707661626024532365971225e0;
const T D3 = 9.7547832001787427186894837e-1;
const T D4 = 2.0945065210512749128288442e-1;
const T D5 = 2.1494160384252876777097297e-2;
const T D6 = 7.7441459065157709165577218e-4;
const T D7 = 1.4859850019840355905497876e-9;
const T E0 = 6.65790464350110377720e0;
const T E1 = 5.46378491116411436990e0;
const T E2 = 1.78482653991729133580e0;
const T E3 = 2.96560571828504891230e-1;
const T E4 = 2.65321895265761230930e-2;
const T E5 = 1.24266094738807843860e-3;
const T E6 = 2.71155556874348757815e-5;
const T E7 = 2.01033439929228813265e-7;
const T F0 = 1.414213562373095048801689e0;
const T F1 = 8.482908416595164588112026e-1;
const T F2 = 1.936480946950659106176712e-1;
const T F3 = 2.103693768272068968719679e-2;
const T F4 = 1.112800997078859844711555e-3;
const T F5 = 2.611088405080593625138020e-5;
const T F6 = 2.010321207683943062279931e-7;
const T F7 = 2.891024605872965461538222e-15;
T abs_x = std::abs(x);
if (abs_x <= 0.85) {
T r = 0.180625 - 0.25 * x * x;
T num =
(((((((A7 * r + A6) * r + A5) * r + A4) * r + A3) * r + A2) * r + A1) *
r +
A0);
T den =
(((((((B7 * r + B6) * r + B5) * r + B4) * r + B3) * r + B2) * r + B1) *
r +
B0);
return x * num / den;
}
T r = std::sqrt(LN2 - std::log(1.0 - abs_x));
T num, den;
if (r <= 5.0) {
r = r - 1.6;
num =
(((((((C7 * r + C6) * r + C5) * r + C4) * r + C3) * r + C2) * r + C1) *
r +
C0);
den =
(((((((D7 * r + D6) * r + D5) * r + D4) * r + D3) * r + D2) * r + D1) *
r +
D0);
} else {
r = r - 5.0;
num =
(((((((E7 * r + E6) * r + E5) * r + E4) * r + E3) * r + E2) * r + E1) *
r +
E0);
den =
(((((((F7 * r + F6) * r + F5) * r + F4) * r + F3) * r + F2) * r + F1) *
r +
F0);
}
if (x < 0) {
return -num / den;
} else {
return num / den;
}
}
template <typename T>
struct TruncatedNormal {
T mean, std;
T a_normal_cdf;
T b_normal_cdf;
TruncatedNormal(T mean, T std) : mean(mean), std(std) {
auto normal_cdf = [](T x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
a_normal_cdf = normal_cdf(-2.0);
b_normal_cdf = normal_cdf(2.0);
}
T operator()(T value) const {
auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean;
}
};
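The struct above performs inverse-CDF sampling of a normal restricted to two standard deviations: for u ~ U(0, 1),

.. math::

    p = \Phi(-2) + \big(\Phi(2) - \Phi(-2)\big)\,u, \qquad
    x = \mu + \sigma \sqrt{2}\,\operatorname{erf}^{-1}(2p - 1)

where Φ is the standard normal CDF, so every sample lands in [μ - 2σ, μ + 2σ].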
template <typename T>
class CPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine;
if (seed == 0) {
seed = std::random_device()();
}
engine.seed(seed);
std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
1.0);
TruncatedNormal<T> truncated_normal(mean, std);
int64_t size = tensor->numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = truncated_normal(dist(engine));
}
}
};
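A SciPy-based reference of the whole CPU kernel, as a sketch assuming SciPy is available (the name truncated_normal_ref is illustrative):

.. code-block:: python

    import numpy as np
    from scipy.special import erf, erfinv

    def truncated_normal_ref(shape, mean=0.0, std=1.0, seed=0):
        # Inverse-CDF sampling of N(mean, std) truncated to +/- 2 std,
        # mirroring CPUTruncatedGaussianRandomKernel above.
        rng = np.random.RandomState(seed if seed != 0 else None)
        a_cdf = 0.5 * (1.0 + erf(-2.0 / np.sqrt(2.0)))  # normal CDF at -2
        b_cdf = 0.5 * (1.0 + erf(+2.0 / np.sqrt(2.0)))  # normal CDF at +2
        u = rng.uniform(np.finfo(np.float32).tiny, 1.0, size=shape)
        p = a_cdf + (b_cdf - a_cdf) * u
        return np.sqrt(2.0) * erfinv(2.0 * p - 1.0) * std + mean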
class TruncatedGaussianRandomOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of TruncatedGaussianRandomOp should not be null.");
auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
std::vector<int64_t> out_dim;
out_dim.reserve(shape.size());
for (auto dim : shape) {
out_dim.push_back(static_cast<int64_t>(dim));
}
PADDLE_ENFORCE(shape.size() > 0UL,
"shape can be an int or an array; it must be set.");
ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context(), layout, library);
}
};
class TruncatedGaussianRandomOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddOutput("Out", "Output tensor of truncated gaussian random op.");
AddAttr<std::vector<int>>("shape",
"(vector<int>) "
"The dimension of random tensor.");
AddAttr<float>("mean",
"(float, default 0.0) "
"mean of random tensor.")
.SetDefault(.0f);
AddAttr<float>("std",
"(float, default 1.0) "
"std of random tensor.")
.SetDefault(1.0f);
AddAttr<int>("seed",
"(int, default 0) "
"Random seed of generator."
"0 means use system wide seed."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time.")
.SetDefault(0);
AddAttr<int>("dtype",
"(int, default 5(FP32)) "
"Output data type.")
.SetDefault(framework::proto::VarType::FP32);
AddComment(R"DOC(
TruncatedGaussianRandom Operator.
Used to initialize tensors with a truncated Gaussian random generator.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
ops::TruncatedGaussianRandomOp,
ops::TruncatedGaussianRandomOpMaker);
REGISTER_OP_CPU_KERNEL(truncated_gaussian_random,
ops::CPUTruncatedGaussianRandomKernel<float>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/random.h>
#include <thrust/transform.h>
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace operators {
template <typename T>
struct TruncatedNormal {
T mean, std;
T a_normal_cdf;
T b_normal_cdf;
unsigned int seed;
T numeric_min;
__host__ __device__ TruncatedNormal(T mean, T std, T numeric_min, int seed)
: mean(mean), std(std), seed(seed), numeric_min(numeric_min) {
a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed);
thrust::uniform_real_distribution<T> dist(numeric_min, 1);
rng.discard(n);
T value = dist(rng);
auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
}
};
template <typename T>
class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
if (seed == 0) {
std::random_device rd;
seed = rd();
}
T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
int64_t size = tensor->numel();
thrust::transform(
index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data),
TruncatedNormal<T>(mean, std, std::numeric_limits<T>::min(), seed));
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
truncated_gaussian_random,
paddle::operators::GPUTruncatedGaussianRandomKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/fluid/framework/executor.h"
......@@ -138,6 +138,10 @@ class WhileGradOp : public framework::OperatorBase {
auto inside_og_name = inside_og_names[i];
VLOG(8) << "Linking outside " << outside_og_name << " --> inside "
<< inside_og_name;
if (scope.FindVar(outside_og_name) == nullptr) {
continue;
}
auto &og_outside =
detail::Ref(scope.FindVar(outside_og_name),
"Cannot find Outside Gradient %s", outside_og_name);
......@@ -167,20 +171,46 @@ class WhileGradOp : public framework::OperatorBase {
PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0);
}
}
} else {
PADDLE_THROW("Currently only support LoDTensor and LoDTensorArray.");
}
}
executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false, true,
true);
auto &pg_names = Outputs(kXGRAD);
// Outputs(kXGRAD) contains the names of the gradients of parameters
// and inputs.
auto &pg_ig_names = Outputs(kXGRAD);
auto &p_names = Inputs(kX);
PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
if (pg_names[param_id] == framework::kEmptyVarName) {
PADDLE_ENFORCE_EQ(pg_ig_names.size(), p_names.size());
for (size_t param_id = 0; param_id < pg_ig_names.size(); ++param_id) {
if (pg_ig_names[param_id] == framework::kEmptyVarName) {
continue; // parameter doesn't have gradient
}
auto inside_grad_name = framework::GradVarName(p_names[param_id]);
// Some grad ops have inputs without gradients; for example, in
// lookup_table_grad_op the input(Idx) has no gradient.
auto pg_ig_var = cur_scope.FindVar(inside_grad_name);
PADDLE_ENFORCE(pg_ig_var != nullptr);
if (pg_ig_var->IsType<framework::LoDTensorArray>()) {
auto pg_ig_lod_t_arr =
pg_ig_var->GetMutable<framework::LoDTensorArray>();
bool empty = true;
for (auto &each : *pg_ig_lod_t_arr) {
if (each.numel() != 0) {
empty = false;
break;
}
}
if (empty) {
LOG(WARNING) << pg_ig_names[param_id]
<< " is not found in cur_scope.";
continue;
}
}
// // TODO(tonyyang-svail): Not sure we need the following
// // If does not compute gradient of that variable inside rnn,
// just
......@@ -194,6 +224,11 @@ class WhileGradOp : public framework::OperatorBase {
if (cur_scope_iter == step_scopes->rbegin()) {
auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name);
PADDLE_ENFORCE(var->IsType<framework::LoDTensorArray>() ||
var->IsType<LoDTensor>(),
"Currently the type of var only can be LoDTensorArray "
"or LoDTensor.");
if (var->IsType<LoDTensor>()) {
auto &inside_tensor = var->Get<framework::LoDTensor>();
framework::AttributeMap attrs;
......@@ -201,7 +236,7 @@ class WhileGradOp : public framework::OperatorBase {
attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
attrs["value"] = 0.0f;
auto var_name = pg_names[param_id];
auto var_name = pg_ig_names[param_id];
auto zero_op = framework::OpRegistry::CreateOp(
"fill_constant", framework::VariableNameMap{},
{{"Out", {var_name}}}, attrs);
......@@ -213,8 +248,8 @@ class WhileGradOp : public framework::OperatorBase {
}
auto new_inside_name = cur_scope.Rename(inside_grad_name);
auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {pg_names[param_id]}}},
"sum", {{"X", {pg_ig_names[param_id], new_inside_name}}},
{{"Out", {pg_ig_names[param_id]}}},
framework::AttributeMap{{"use_mkldnn", {false}}});
sum_op->Run(cur_scope, dev_place);
cur_scope.Rename(new_inside_name, inside_grad_name);
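In effect, each backward step either zero-fills the outside gradient (first reversed step, via the fill_constant op) or adds the step's inside gradient to the running total (via the sum op). A NumPy sketch of that accumulation (the name is illustrative):

.. code-block:: python

    import numpy as np

    def accumulate_while_grads(step_grads):
        # fill_constant initializes the running gradient to zeros;
        # one sum op per step scope then adds into it.
        total = np.zeros_like(step_grads[0])
        for g in step_grads:
            total = total + g
        return total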
......@@ -281,6 +316,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
parent_block->FindVarRecursive(input_name) != nullptr)) {
continue;
}
output_grads.insert(input_name);
}
for (auto &output_name : op->OutputArgumentNames()) {
......@@ -309,13 +345,13 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
auto p_names = op_desc.Input(kX);
auto pg_names = op_desc.Output(framework::GradVarName(kX));
auto pg_ig_names = op_desc.Output(framework::GradVarName(kX));
for (size_t i = 0; i < p_names.size(); ++i) {
auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
auto *g_var = block->FindVarRecursive(pg_names[i]);
auto *g_var = block->FindVarRecursive(pg_ig_names[i]);
if (g_var != nullptr) { // Gradient could be @EMPTY@
VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i]
VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
<< " type: " << p_var.GetType();
g_var->SetType(p_var.GetType());
g_var->SetDataType(p_var.GetDataType());
......@@ -333,21 +369,21 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
ctx->HasInputs(framework::GradVarName(kOutputs));
auto p_names = ctx->Inputs(kX);
auto pg_names = ctx->Outputs(kXGRAD);
auto pg_ig_names = ctx->Outputs(kXGRAD);
auto var_types = ctx->GetInputsVarType(kX);
std::vector<std::string> names_to_set;
std::vector<framework::DDim> dims_to_set;
for (size_t i = 0; i < p_names.size(); ++i) {
if (pg_names[i] == framework::kEmptyVarName) {
if (pg_ig_names[i] == framework::kEmptyVarName) {
continue;
}
auto dims = ctx->GetInputsElementDim(kX, i);
if (var_types[i] == framework::proto::VarType::LOD_TENSOR) {
names_to_set.push_back(pg_names[i]);
names_to_set.push_back(pg_ig_names[i]);
dims_to_set.push_back(dims);
} else if (var_types[i] == framework::proto::VarType::LOD_TENSOR_ARRAY) {
// not sure how to set the dim of LOD_TENSOR_ARRAY
names_to_set.push_back(pg_names[i]);
names_to_set.push_back(pg_ig_names[i]);
dims_to_set.push_back(dims);
}
}
......
......@@ -396,11 +396,6 @@ All parameter, weight, gradient are variables in Paddle.
Prune(*prog_with_targets.Proto(), &pruned_desc);
return new ProgramDesc(pruned_desc);
});
m.def("inference_optimize", [](ProgramDesc &origin) {
proto::ProgramDesc pruned_desc;
InferenceOptimize(*(origin.Proto()), &pruned_desc);
return new ProgramDesc(pruned_desc);
});
m.def("empty_var_name",
[]() { return std::string(framework::kEmptyVarName); });
m.def("grad_var_suffix",
......
......@@ -347,7 +347,7 @@ def _append_backward_ops_(block,
# If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr_id("sub_block"))
grad_sub_block = program.create_block()
grad_sub_block = program._create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op)
if cb is not None:
......@@ -361,7 +361,7 @@ def _append_backward_ops_(block,
_append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
no_grad_dict, grad_to_var, callbacks)
program.rollback()
program._rollback()
grad_sub_block_list.append(grad_sub_block.desc)
# Getting op's corresponding grad_op
......
......@@ -331,7 +331,7 @@ def append_gradient_clip_ops(param_grads):
for p, g in param_grads:
if g is None:
continue
with p.block.program.optimized_guard([p, g]):
with p.block.program._optimized_guard([p, g]):
clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
if clip_attr is None:
clip_attr = NullGradientClipAttr()
......@@ -346,7 +346,7 @@ def append_gradient_clip_ops(param_grads):
for p, g in param_grads:
if g is None:
continue
with p.block.program.optimized_guard([p, g]):
with p.block.program._optimized_guard([p, g]):
res.append(clip_attr._create_operators(param=p, grad=g))
return res
......
......@@ -126,7 +126,7 @@ class SelectCase(object):
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
self.block = self.main_program._create_block()
def construct_op(self):
main_program = self.helper.main_program
......@@ -187,7 +187,7 @@ class SelectCase(object):
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
self.main_program._rollback()
if exc_type is not None:
return False # re-raise exception
return True
......
......@@ -935,7 +935,7 @@ class Block(object):
Notes:
The constructor of Block should not be invoked directly. Please
use `Program.create_block()` to create a block.
use `Program._create_block()` to create a block.
Examples:
.. code-block:: python
......@@ -1483,7 +1483,7 @@ class Program(object):
self._op_role_var = [var_name]
@contextlib.contextmanager
def optimized_guard(self, param_and_grads):
def _optimized_guard(self, param_and_grads):
"""
A with guard to set :code:`Optimization` :code:`OpRole` and
:code:`OpRoleVar` automatically.
......@@ -1496,7 +1496,7 @@ class Program(object):
Examples:
>>> p, g = backward(...)
>>> with program.optimized_guard([p,g]):
>>> with program._optimized_guard([p,g]):
>>> p = p - 0.001 * g
"""
OpRole = core.op_proto_and_checker_maker.OpRole
......@@ -1554,7 +1554,7 @@ class Program(object):
res_str = _debug_string_(proto, throw_on_error)
return res_str
def get_desc(self):
def _get_desc(self):
"""
Get the C++ side of `ProgramDesc` object pointer. The C++ object is
exposed by :code:`pybind`.
......@@ -1647,7 +1647,7 @@ class Program(object):
The two code snippets above will generate same programs.
"""
if for_test:
p = self.inference_optimize(export_for_deployment=False)
p = self._inference_optimize(export_for_deployment=False)
else:
p = Program()
p.current_block_idx = self.current_block_idx
......@@ -1663,10 +1663,10 @@ class Program(object):
p._sync_with_cpp()
p._copy_param_info_from(self)
p.copy_data_info_from(self)
p._copy_data_info_from(self)
return p
def prune(self, targets):
def _prune(self, targets):
"""
Prune operators and variables which are not needed to generate
:code:`targets`.
......@@ -1717,7 +1717,7 @@ class Program(object):
res._sync_with_cpp()
return res
def inference_optimize(self, export_for_deployment=True):
def _inference_optimize(self, export_for_deployment=True):
"""
This method will create a new program and do following adjustments on it:
1. Remove all reader variables and their creator ops if they exist.
......@@ -1738,8 +1738,6 @@ class Program(object):
Returns:
Program: The new program.
"""
# this is an alternative implement before
# core.inference_optimize being fixed.
res = Program()
res.desc = core.ProgramDesc(self.desc)
......@@ -1841,7 +1839,7 @@ class Program(object):
"""
return self.blocks[self.current_block_idx]
def create_block(self, parent_idx=None):
def _create_block(self, parent_idx=None):
"""
Create a new block with the :code:`parent_idx` and change the current block
to new block.
......@@ -1860,7 +1858,7 @@ class Program(object):
self.blocks.append(Block(self, self.current_block_idx))
return self.current_block()
def rollback(self):
def _rollback(self):
"""
Exit a code block, i.e., roll back to the parent block.
Returns:
......@@ -1906,7 +1904,7 @@ class Program(object):
"program, with represent the same topology")
self.global_block()._copy_param_info_from(other.global_block())
def copy_data_info_from(self, other):
def _copy_data_info_from(self, other):
"""
Copy the information of data variables from other program.
......
......@@ -20,10 +20,10 @@ import contextlib
from .core import VarDesc
__all__ = [
'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA',
'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer',
'UniformInitializer', 'NormalInitializer', 'XavierInitializer',
'BilinearInitializer', 'MSRAInitializer'
'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
'MSRA', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer',
'UniformInitializer', 'NormalInitializer', 'TruncatedNormalInitializer',
'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer'
]
_force_init_on_cpu_ = False
......@@ -33,6 +33,8 @@ def force_init_on_cpu():
"""
The flag of whether to force variable initialization on the CPU.
Returns:
Examples:
.. code-block:: python
......@@ -272,6 +274,60 @@ class NormalInitializer(Initializer):
return op
class TruncatedNormalInitializer(Initializer):
"""Implements the Random TruncatedNormal(Gaussian) distribution initializer
Args:
loc (float): mean of the normal distribution
scale (float): standard deviation of the normal distribution
seed (int): random seed
Examples:
.. code-block:: python
fc = fluid.layers.fc(input=x, size=10,
param_attr=fluid.initializer.TruncatedNormal(loc=0.0, scale=2.0))
"""
def __init__(self, loc=0.0, scale=1.0, seed=0):
assert loc is not None
assert scale is not None
assert seed is not None
super(TruncatedNormalInitializer, self).__init__()
self._mean = loc
self._std_dev = scale
self._seed = seed
def __call__(self, var, block):
"""Add truncated normal distribution initialization ops for a variable
Args:
var: Variable that needs to be initialized
block: The block in which initialization ops
should be added
Returns:
the initialization op
"""
assert isinstance(var, framework.Variable)
assert isinstance(block, framework.Block)
# Initialization Ops should be prepended and not appended
if self._seed == 0:
self._seed = block.program.random_seed
op = block._prepend_op(
type="truncated_gaussian_random",
outputs={"Out": var},
attrs={
"shape": var.shape,
"dtype": int(var.dtype),
"mean": self._mean,
"std": self._std_dev,
"seed": self._seed
})
var.op = op
return op
class XavierInitializer(Initializer):
"""
This class implements the Xavier weight initializer from the paper
......@@ -583,6 +639,7 @@ class BilinearInitializer(Initializer):
Constant = ConstantInitializer
Uniform = UniformInitializer
Normal = NormalInitializer
TruncatedNormal = TruncatedNormalInitializer
Xavier = XavierInitializer
MSRA = MSRAInitializer
Bilinear = BilinearInitializer
......@@ -515,8 +515,8 @@ def get_inference_program(target_vars, main_program=None):
vars.extend(var.metrics)
else:
vars.append(var)
pruned_program = main_program.prune(targets=vars)
inference_program = pruned_program.inference_optimize()
pruned_program = main_program._prune(targets=vars)
inference_program = pruned_program._inference_optimize()
return inference_program
......@@ -644,8 +644,8 @@ def save_inference_model(dirname,
global_block._remove_op(i)
copy_program.desc.flush()
pruned_program = copy_program.prune(targets=target_vars)
inference_program = pruned_program.inference_optimize(
pruned_program = copy_program._prune(targets=target_vars)
inference_program = pruned_program._inference_optimize(
export_for_deployment=export_for_deployment)
fetch_var_names = [v.name for v in target_vars]
......
......@@ -217,10 +217,10 @@ class BlockGuard(object):
self.main_program = main_program
def __enter__(self):
self.main_program.create_block()
self.main_program._create_block()
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
self.main_program._rollback()
if exc_type is not None:
return False # re-raise exception
return True
......
......@@ -723,11 +723,10 @@ def ssd_loss(location,
target_label.stop_gradient = True
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
# 3. Mining hard examples
actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2])
actual_shape.stop_gradient = True
conf_loss = nn.reshape(
x=conf_loss,
shape=(num, num_prior),
actual_shape=ops.slice(
conf_shape, axes=[0], starts=[0], ends=[2]))
x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)
conf_loss.stop_gradient = True
neg_indices = helper.create_tmp_variable(dtype='int32')
dtype = matched_indices.dtype
......@@ -796,11 +795,7 @@ def ssd_loss(location,
# 5.3 Compute overall weighted loss.
loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
# reshape to [N, Np], N is the batch size and Np is the prior box number.
loss = nn.reshape(
x=loss,
shape=(num, num_prior),
actual_shape=ops.slice(
conf_shape, axes=[0], starts=[0], ends=[2]))
loss = nn.reshape(x=loss, shape=(num, num_prior), actual_shape=actual_shape)
loss = nn.reduce_sum(loss, dim=1, keep_dim=True)
if normalize:
normalizer = nn.reduce_sum(target_loc_weight)
......
......@@ -29,9 +29,8 @@ from ..layer_helper import LayerHelper
from ..unique_name import generate as unique_name
__all__ = [
'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
'double_buffer', 'random_data_generator', 'py_reader', 'Preprocessor',
'load'
'data', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
'random_data_generator', 'py_reader', 'Preprocessor', 'load'
]
......@@ -1008,9 +1007,9 @@ class Preprocessor(object):
@contextlib.contextmanager
def block(self):
self.status = Preprocessor.IN_SUB_BLOCK
self.sub_block = self.main_prog.create_block()
self.sub_block = self.main_prog._create_block()
yield
self.main_prog.rollback()
self.main_prog._rollback()
self.status = Preprocessor.AFTER_SUB_BLOCK
if not self._is_completed():
raise RuntimeError(
......
......@@ -113,6 +113,7 @@ __all__ = [
'pad2d',
'unstack',
'sequence_enumerate',
'expand',
'sequence_concat',
]
......@@ -2342,16 +2343,20 @@ def conv2d_transpose(input,
.. math::
H_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of the filter. It is as same as the output
image channel.
output_size(int|tuple|None): The output image size. If output size is a
tuple, it must contain two integers, (image_H, image_W). This
parameter only works when filter_size is None.
            tuple, it must contain two integers, (image_H, image_W). If None,
            output_size is computed from filter_size, padding, and stride.
            If output_size and filter_size are specified at the same time,
            they should satisfy the formula above.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square. None if use output size to
......@@ -2429,7 +2434,13 @@ def conv2d_transpose(input,
else:
filter_size = utils.convert_to_list(filter_size, 2,
'conv2d_transpose.filter_size')
if output_size is None:
output_size = []
    elif isinstance(output_size, (list, tuple, int)):
        output_size = utils.convert_to_list(output_size, 2, 'output_size')
    else:
        raise ValueError("output_size should be an int, list, or tuple")
padding = utils.convert_to_list(padding, 2, 'padding')
groups = 1 if groups is None else groups
filter_shape = [input_channel, num_filters // groups] + filter_size
img_filter = helper.create_parameter(
......@@ -2442,6 +2453,7 @@ def conv2d_transpose(input,
'Filter': [img_filter]},
outputs={'Output': pre_bias},
attrs={
'output_size': output_size,
'strides': stride,
'paddings': padding,
'dilations': dilation,
......@@ -3487,7 +3499,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
return out
def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
"""
Applies matrix multiplication to two tensors.
......@@ -3521,6 +3533,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
y (Variable): The input variable which is a Tensor or LoDTensor.
transpose_x (bool): Whether to transpose :math:`x` before multiplication.
transpose_y (bool): Whether to transpose :math:`y` before multiplication.
alpha (float): The scale of output. Default 1.0.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
......@@ -3588,8 +3601,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
inputs={'X': x,
'Y': y},
outputs={'Out': out},
attrs={'transpose_X': transpose_x,
'transpose_Y': transpose_y})
attrs={
'transpose_X': transpose_x,
'transpose_Y': transpose_y,
'alpha': alpha,
})
return out
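A minimal sketch of the new `alpha` scale (shapes illustrative): the output is scaled by `alpha` inside the single matmul op rather than by a separate scale op.

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32')
y = fluid.layers.data(name='y', shape=[3, 4], dtype='float32')
out = fluid.layers.matmul(x, y, alpha=2.0)  # same result as scaling x @ y by 2.0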
......@@ -6107,3 +6123,53 @@ def unstack(x, axis=0, num=None):
attrs={'axis': axis,
'num': num})
return outs
def expand(x, expand_times, name=None):
"""Expand operator tiles the input by given times number. You should set times
number for each dimension by providing attribute 'expand_times'. The rank of X
should be in [1, 6]. Please note that size of 'expand_times' must be the same
with X's rank. Following is a using case:
.. code-block:: text
Input(X) is a 3-D tensor with shape [2, 3, 1]:
[
[[1], [2], [3]],
[[4], [5], [6]]
]
Attr(expand_times): [1, 2, 2]
Output(Out) is a 3-D tensor with shape [2, 6, 2]:
[
[[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
[[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
]
Args:
x (Variable): A tensor with rank in [1, 6].
expand_times (list|tuple): Expand times number for each dimension.
Returns:
        Variable: The expanded variable, which is a LoDTensor. After expanding, the size of each dimension of Output(Out) equals the size of the corresponding dimension of Input(X) multiplied by the corresponding value in expand_times.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10], dtype='float32')
out = fluid.layers.expand(x=x, expand_times=[1, 2, 2])
"""
helper = LayerHelper('expand', input=x, **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='expand',
inputs={'X': x},
outputs={'Out': out},
attrs={'expand_times': expand_times})
return out
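For intuition, the worked example in the docstring above matches numpy's tile semantics (a quick cross-check, not part of the API):

import numpy as np

x = np.array([[[1], [2], [3]], [[4], [5], [6]]])  # shape [2, 3, 1]
out = np.tile(x, (1, 2, 2))                       # shape [2, 6, 2], as above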
......@@ -236,7 +236,7 @@ class Optimizer(object):
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
with param_and_grad[0].block.program.optimized_guard(
with param_and_grad[0].block.program._optimized_guard(
param_and_grad), name_scope("optimizer"):
if param_and_grad[0].trainable is True:
optimize_op = self._append_optimize_op(loss.block,
......@@ -580,7 +580,7 @@ class AdamOptimizer(Optimizer):
for param, grad in param_and_grads:
if grad is None:
continue
with param.block.program.optimized_guard([param, grad]):
with param.block.program._optimized_guard([param, grad]):
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param)
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
......@@ -709,7 +709,7 @@ class AdamaxOptimizer(Optimizer):
for param, grad in parameters_and_grads:
if grad is None:
continue
with param.block.program.optimized_guard([param, grad]):
with param.block.program._optimized_guard([param, grad]):
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param)
main_block.append_op(
......@@ -1198,7 +1198,7 @@ class ModelAverage(Optimizer):
for param, grad in self.params_grads:
if grad is None:
continue
with param.block.program.optimized_guard([param, grad]):
with param.block.program._optimized_guard([param, grad]):
self._append_average_accumulate_op(param)
self.apply_program = Program()
......
......@@ -47,7 +47,7 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
if grad is None:
params_and_grads.append((param, grad))
continue
with param.block.program.optimized_guard([param, grad]):
with param.block.program._optimized_guard([param, grad]):
regularization_term = None
if param.regularizer is not None:
# Add variable for regularization term in grad block
......
......@@ -35,6 +35,10 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
d_bolck_w = dilations[1] * (f_w - 1) + 1
out_h = (in_h - 1) * stride[0] + d_bolck_h
out_w = (in_w - 1) * stride[1] + d_bolck_w
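    # With an explicit output_size, size the padded buffer so that the final
    # crop of `pad` pixels from each border yields exactly output_size.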
if 'output_size' in attrs:
output_size = attrs['output_size']
out_h = output_size[0] + 2 * pad[0]
out_w = output_size[1] + 2 * pad[1]
out = np.zeros((in_n, out_c, out_h, out_w))
......@@ -65,6 +69,7 @@ class TestConv2dTransposeOp(OpTest):
def setUp(self):
# init as conv transpose
self.use_cudnn = False
self.output_size = None
self.init_op_type()
self.init_test_case()
......@@ -80,6 +85,8 @@ class TestConv2dTransposeOp(OpTest):
'use_cudnn': self.use_cudnn,
'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter
}
if self.output_size is not None:
self.attrs['output_size'] = self.output_size
output = conv2dtranspose_forward_naive(input_, filter_,
self.attrs).astype('float32')
......@@ -192,6 +199,18 @@ class TestWithDilation(TestConv2dTransposeOp):
self.filter_size = [f_c, 6, 3, 3]
class TestWithEvenUpsample(TestConv2dTransposeOp):
def init_test_case(self):
self.pad = [2, 2]
self.stride = [2, 2]
self.groups = 1
self.dilations = [1, 1]
self.output_size = [14, 14]
self.input_size = [2, 3, 7, 7] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 5, 5]
# ------------ test_cudnn ------------
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
......@@ -265,6 +284,15 @@ class TestDepthwiseConvTranspose(TestConv2dTransposeOp):
self.op_type = "depthwise_conv2d_transpose"
# ------------ test_cudnn ------------
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNWithEvenUpsample(TestWithEvenUpsample):
def init_op_type(self):
self.use_cudnn = True
self.op_type = "conv2d_transpose"
# Please don't remove the following code.
# Currently, CI uses cuDNN v5.0, which does not support dilated convolutions.
# class TestCUDNNWithDilation(TestWithDilation):
......
......@@ -144,6 +144,142 @@ class TestDynRNN(unittest.TestCase):
# loss should be small after 100 mini-batch
self.assertLess(val[0], loss_0[0])
    # this unit test exercises a two-layer nested dyn_rnn.
def test_train_nested_dyn_rnn(self):
word_dict = [i for i in range(30)]
def fake_reader():
seq_len, label = [[2, 2]], [0, 1]
data = []
for ele in seq_len:
for j in ele:
data.append([numpy.random.randint(30) \
for _ in range(j)])
while True:
yield data, label
train_data = paddle.batch(fake_reader, batch_size=2)
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
sentence = fluid.layers.data(
name='word', shape=[1], dtype='int64', lod_level=2)
label = fluid.layers.data(
name='label', shape=[1], dtype='float32', lod_level=1)
rnn = fluid.layers.DynamicRNN()
with rnn.block():
in_ = rnn.step_input(sentence)
sent_emb = fluid.layers.embedding(
input=in_, size=[len(word_dict), 32], dtype='float32')
out_ = fluid.layers.fc(input=sent_emb, size=100, act='tanh')
rnn1 = fluid.layers.DynamicRNN()
with rnn1.block():
in_1 = rnn1.step_input(out_)
out_1 = fluid.layers.fc(input=[in_1], size=100, act='tanh')
rnn1.output(out_1)
last = fluid.layers.sequence_last_step(input=rnn1())
rnn.output(last)
last = rnn()
logits = fluid.layers.fc(input=last, size=1, act=None)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(
x=logits, label=label)
loss = fluid.layers.mean(loss)
sgd = fluid.optimizer.SGD(1e-3)
#sgd = fluid.optimizer.Adam(1e-3)
sgd.minimize(loss=loss)
cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu)
exe.run(startup_program)
feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
data = next(train_data())
val = exe.run(main_program, feed=feeder.feed(data),
fetch_list=[loss])[0]
for _ in range(100):
val = exe.run(main_program,
feed=feeder.feed(data),
fetch_list=[loss])[0]
print(val)
    # this unit test exercises a two-layer nested dyn_rnn.
def test_train_nested_dyn_rnn2(self):
word_dict = [i for i in range(30)]
def fake_reader():
seq_len, label = [[2, 2]], [0, 1]
data = []
for ele in seq_len:
for j in ele:
data.append([numpy.random.randint(30) \
for _ in range(j)])
while True:
yield data, label
train_data = paddle.batch(fake_reader, batch_size=2)
hidden_size = 32
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
sentence = fluid.layers.data(
name='word', shape=[1], dtype='int64', lod_level=2)
label = fluid.layers.data(
name='label', shape=[1], dtype='float32', lod_level=1)
rnn = fluid.layers.DynamicRNN()
with rnn.block():
in_ = rnn.step_input(sentence)
sent_emb = fluid.layers.embedding(
input=in_,
size=[len(word_dict), hidden_size],
dtype='float32')
input_forward_proj = fluid.layers.fc(input=sent_emb,
size=hidden_size * 4,
act=None,
bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm(
input=input_forward_proj,
size=hidden_size * 4,
use_peepholes=False)
rnn1 = fluid.layers.DynamicRNN()
with rnn1.block():
in_1 = rnn1.step_input(forward)
out_1 = fluid.layers.fc(input=[in_1], size=100, act='tanh')
rnn1.output(out_1)
last = fluid.layers.sequence_last_step(input=rnn1())
rnn.output(last)
last = rnn()
logits = fluid.layers.fc(input=last, size=1, act=None)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(
x=logits, label=label)
loss = fluid.layers.mean(loss)
sgd = fluid.optimizer.SGD(1e-3)
#sgd = fluid.optimizer.Adam(1e-3)
sgd.minimize(loss=loss)
cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu)
exe.run(startup_program)
feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
data = next(train_data())
val = exe.run(main_program, feed=feeder.feed(data),
fetch_list=[loss])[0]
for _ in range(100):
val = exe.run(main_program,
feed=feeder.feed(data),
fetch_list=[loss])[0]
if __name__ == '__main__':
unittest.main()
......@@ -565,6 +565,13 @@ class TestBook(unittest.TestCase):
out = layers.cross_entropy(x, label, False, 4)
self.assertIsNotNone(out)
def test_expand(self):
program = Program()
with program_guard(program):
x = layers.data(name="input", shape=[10], dtype='int32')
out = layers.expand(x, [1, 2])
print(str(program))
if __name__ == '__main__':
unittest.main()
......@@ -15,6 +15,7 @@
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.optimizer as optimizer
from paddle.fluid.framework import Program, program_guard
......@@ -67,5 +68,34 @@ class TestMemoryTranspiler2(unittest.TestCase):
print(str(result_program))
class TestMemoryTranspiler3(unittest.TestCase):
def setUp(self):
program = Program()
with program_guard(program, startup_program=Program()):
word = fluid.layers.data(name='word', shape=[1], dtype='int64')
emb = [
fluid.layers.embedding(
word, size=[65536, 256], param_attr='emb') for _ in range(6)
]
left = emb.pop(0)
while len(emb) != 0:
right = emb.pop(0)
left = fluid.layers.concat([left, right])
emb = fluid.layers.mean(left)
fluid.backward.append_backward(emb)
self.program = program
def test_cascade_reuse(self):
block = self.program.block(0)
# variable reuse in programdesc
        # TODO(dzhwinter): confirm cascade strategy. disable temporarily
self.assertTrue("concat_4.tmp_0@GRAD" in block.vars)
# self.assertTrue("concat_3.tmp_0@GRAD" not in block.vars)
# self.assertTrue("concat_2.tmp_0@GRAD" not in block.vars)
# self.assertTrue("concat_1.tmp_0@GRAD" not in block.vars)
# self.assertTrue("concat_0.tmp_0@GRAD" not in block.vars)
if __name__ == "__main__":
unittest.main()
......@@ -19,6 +19,7 @@ import unittest
import paddle.fluid as fluid
import paddle
import paddle.dataset.mnist as mnist
from paddle.fluid.layers.io import open_recordio_file
class TestMultipleReader(unittest.TestCase):
......@@ -41,7 +42,7 @@ class TestMultipleReader(unittest.TestCase):
def test_main(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.open_recordio_file(
data_file = open_recordio_file(
filename='./mnist.recordio',
shapes=[(-1, 784), (-1, 1)],
lod_levels=[0, 0],
......
......@@ -26,7 +26,7 @@ main_program = default_startup_program()
class TestOperator(unittest.TestCase):
def test_error_type(self):
block = main_program.create_block()
block = main_program._create_block()
try:
block.append_op()
            self.fail()
......
......@@ -20,6 +20,7 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.dataset.mnist as mnist
from paddle.fluid.layers.io import open_recordio_file
class TestPreprocessor(unittest.TestCase):
......@@ -43,7 +44,7 @@ class TestPreprocessor(unittest.TestCase):
img_expected_res = []
lbl_expected_res = []
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.io.open_recordio_file(
data_file = open_recordio_file(
'./mnist_for_preprocessor_test.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
......@@ -64,7 +65,7 @@ class TestPreprocessor(unittest.TestCase):
img_actual_res = []
lbl_actual_res = []
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.io.open_recordio_file(
data_file = open_recordio_file(
'./mnist_for_preprocessor_test.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
......
......@@ -28,25 +28,25 @@ class TestProgram(unittest.TestCase):
self.assertEqual(-1, b.parent_idx)
self.assertEqual(0, b.idx)
b = main_program.create_block()
b = main_program._create_block()
self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx)
b = main_program.create_block()
b = main_program._create_block()
self.assertEqual(2, b.idx)
self.assertEqual(1, b.parent_idx)
main_program.rollback()
main_program._rollback()
b = main_program.current_block()
self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx)
b = main_program.create_block()
b = main_program._create_block()
self.assertEqual(3, b.idx)
self.assertEqual(1, b.parent_idx)
main_program.rollback()
main_program._rollback()
b = main_program.current_block()
self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx)
......@@ -120,8 +120,8 @@ class TestProgram(unittest.TestCase):
main_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
net()
no_read_program = main_program.inference_optimize()
keep_read_program = main_program.inference_optimize(
no_read_program = main_program._inference_optimize()
keep_read_program = main_program._inference_optimize(
export_for_deployment=False)
no_read_ops = no_read_program.global_block().ops
keep_read_ops = keep_read_program.global_block().ops
......
......@@ -19,6 +19,7 @@ import unittest
import paddle.fluid as fluid
import paddle
import paddle.dataset.mnist as mnist
from paddle.fluid.layers.io import open_recordio_file
class TestRecordIO(unittest.TestCase):
......@@ -40,7 +41,7 @@ class TestRecordIO(unittest.TestCase):
def test_main(self, decorator_callback=None):
# use new program
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.open_recordio_file(
data_file = open_recordio_file(
'./mnist.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
class TestTrunctedGaussianRandomOp(unittest.TestCase):
def setUp(self):
self.op_type = "truncated_gaussian_random"
self.inputs = {}
self.attrs = {
"shape": [10000],
"mean": .0,
"std": 1.,
"seed": 10,
}
self.outputs = ["Out"]
def test_cpu(self):
self.gaussian_random_test(place=fluid.CPUPlace())
def test_gpu(self):
if core.is_compiled_with_cuda():
self.gaussian_random_test(place=fluid.CUDAPlace(0))
def gaussian_random_test(self, place):
program = fluid.Program()
block = program.global_block()
vout = block.create_var(name="Out")
op = block.append_op(
type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
fetch_list = []
for var_name in self.outputs:
fetch_list.append(block.var(var_name))
exe = Executor(place)
outs = exe.run(program, fetch_list=fetch_list)
tensor = outs[0]
self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
self.assertAlmostEqual(numpy.var(tensor), 0.773, delta=0.1)
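The expected variance 0.773 matches a standard normal truncated at two standard deviations (assuming that is the op's truncation range); a quick scipy cross-check:

from scipy.stats import norm

Z = norm.cdf(2.0) - norm.cdf(-2.0)    # probability mass kept inside [-2, 2]
var = 1.0 - 4.0 * norm.pdf(2.0) / Z   # ~0.7737, by the truncated-normal formula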
if __name__ == "__main__":
unittest.main()
......@@ -19,6 +19,7 @@ import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layers.io import open_recordio_file
pos_enc_param_names = (
"src_pos_enc_table",
......@@ -405,7 +406,7 @@ def transformer(
src_pad_idx,
trg_pad_idx,
pos_pad_idx, ):
file_obj = fluid.layers.open_recordio_file(
file_obj = open_recordio_file(
filename='/tmp/wmt16.recordio',
shapes=[
[batch_size * max_length, 1],
......
......@@ -580,7 +580,7 @@ class DistributeTranspiler(object):
assert isinstance(origin_block, Block)
        # we add the new sub block to the new program to follow the block
        # hierarchy of the original blocks
new_sub_block = program.create_block(lr_block.idx)
new_sub_block = program._create_block(lr_block.idx)
# clone vars
for var in origin_block.vars:
......@@ -600,7 +600,7 @@ class DistributeTranspiler(object):
# record optimize blocks and we can run them on pserver parallel
optimize_blocks = []
if len(lr_ops) > 0:
lr_decay_block = pserver_program.create_block(
lr_decay_block = pserver_program._create_block(
pserver_program.num_blocks - 1)
optimize_blocks.append(lr_decay_block)
for _, op in enumerate(lr_ops):
......@@ -613,7 +613,7 @@ class DistributeTranspiler(object):
grad_to_block_id = []
pre_block_idx = pserver_program.num_blocks - 1
for idx, opt_op in enumerate(opt_op_on_pserver):
per_opt_block = pserver_program.create_block(pre_block_idx)
per_opt_block = pserver_program._create_block(pre_block_idx)
optimize_blocks.append(per_opt_block)
# append grad merging ops before clip and weight decay
            # cases may look like:
......@@ -636,7 +636,7 @@ class DistributeTranspiler(object):
grad_to_block_id = list(set(grad_to_block_id))
# append global ops
if global_ops:
opt_state_block = pserver_program.create_block(
opt_state_block = pserver_program._create_block(
pserver_program.num_blocks - 1)
optimize_blocks.append(opt_state_block)
for glb_op in global_ops:
......@@ -1073,7 +1073,7 @@ class DistributeTranspiler(object):
table_var = pserver_program.global_block().vars[self.table_name]
prefetch_var_name_to_block_id = []
for index in range(len(self.all_prefetch_input_vars)):
prefetch_block = pserver_program.create_block(optimize_block.idx)
prefetch_block = pserver_program._create_block(optimize_block.idx)
trainer_ids = self.all_prefetch_input_vars[index][pserver_index]
pserver_ids = pserver_program.global_block().create_var(
name=trainer_ids.name,
......@@ -1131,7 +1131,7 @@ class DistributeTranspiler(object):
if 'Param' in op.input_names and op.input("Param")[0] ==
self.table_name
][0]
table_opt_block = pserver_program.create_block(pre_block_idx)
table_opt_block = pserver_program._create_block(pre_block_idx)
if self.sync_mode:
# create grad vars in pserver program
......@@ -1194,7 +1194,7 @@ class DistributeTranspiler(object):
persistable=True,
type=core.VarDesc.VarType.RAW)
checkpoint_save_block = pserver_program.create_block(pre_block_idx)
checkpoint_save_block = pserver_program._create_block(pre_block_idx)
        # this 'file_path' is not used when saving the lookup table variable
checkpoint_save_block.append_op(
type='save',
......
......@@ -56,6 +56,7 @@ class ControlFlowGraph(object):
self._live_in = defaultdict(set)
self._live_out = defaultdict(set)
self._skip_opt = skip_opt
self.pool = []
def _add_connections(self, connections):
"""Populates _successors and _presuccessors for two neighbor nodes."""
......@@ -77,6 +78,7 @@ class ControlFlowGraph(object):
for i in range(self.op_size):
self._uses[i].update(self._ops[i].input_arg_names())
self._defs[i].update(self._ops[i].output_arg_names())
self._live_in[i] = self._uses[i]
def _update_graph(self, old_name, new_name, begin_idx=0):
for i in range(begin_idx, self.op_size):
......@@ -88,39 +90,39 @@ class ControlFlowGraph(object):
self._defs[i].add(new_name)
if old_name in self._live_in[i]:
self._live_in[i].remove(old_name)
self._live_out[i].add(new_name)
self._live_in[i].add(new_name)
if old_name in self._live_out[i]:
self._live_out[i].remove(old_name)
self._live_out[i].add(new_name)
def _reach_fixed_point(self, live_in, live_out):
"""Check if the liveness set has stablized."""
if len(live_in) != len(self._live_in):
return False
if len(live_out) != len(self._live_out):
return False
for i in range(self.op_size):
if (live_in[i] != self._live_in[i] or
live_out[i] != self._live_out[i]):
return False
return True
def _dataflow_analyze(self):
self._build_graph()
live_in = defaultdict(set)
live_out = defaultdict(set)
        # Repeatedly apply liveness updates until the algorithm stabilizes
        # on a complete set of live input and output vars.
while True:
for i in reversed(list(range(self.op_size))):
live_in[i] = set(self._live_in[i])
live_out[i] = set(self._live_out[i])
for s in self._successors[i]:
self._live_out[i] |= self._live_in[s]
self._live_in[i] = self._uses[i] | (
self._live_out[i] - self._defs[i])
if self._reach_fixed_point(live_in, live_out):
break
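        # Worklist formulation: visit ops in reverse order and re-enqueue an
        # op's predecessors whenever its live-in set changes.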
worklist = list(range(len(self._ops) - 1, -1, -1))
while worklist:
i = worklist.pop(0)
live_in[i] = set(self._live_in[i])
for s in self._successors[i]:
self._live_out[i] |= self._live_in[s]
self._live_in[i] = self._uses[i] | (
self._live_out[i] - self._defs[i])
if live_in[i] != self._live_in[i]:
for d in self._presuccessors[i]:
worklist.append(d)
def _fill_pool(self, i, is_forward):
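        # Vars live into op i but dead on exit can be recycled: record each as
        # a (name, shape) pair in the reuse pool, skipping duplicates.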
block_desc = self._ops[i].block()
in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
can_optimize = [
x for x in in_diff
if self._check_var_validity(block_desc, x, is_forward)
]
if can_optimize:
for var_name in can_optimize:
cache = (var_name, self._find_var(block_desc, var_name,
is_forward).shape())
if cache not in self.pool:
self.pool.append(cache)
def _get_diff(self, a, b):
u = a & b
......@@ -211,7 +213,6 @@ class ControlFlowGraph(object):
# update skip set to meet users' demand
if skip_opt_set:
self._skip_opt.update(skip_opt_set)
self.pool = []
for i in range(self.op_size):
op = self._ops[i]
if op.type() in SUB_BLOCK_OPS:
......@@ -234,16 +235,24 @@ class ControlFlowGraph(object):
for index, cache_pair in enumerate(self.pool):
cache_var = cache_pair[0]
cache_shape = cache_pair[1]
if not compare_shape(x_shape, cache_shape, level):
continue
if not self._has_var(block_desc, cache_var, is_forward):
if PRINT_LOG:
print("cache %s not exists!" %
(cpt.to_text(cache_var)))
continue
if x == cache_var:
if PRINT_LOG:
print("x : ", cpt.to_text(x), " cache : ",
cpt.to_text(cache_var), " is same var!")
break
x_dtype = self._find_var(block_desc, x,
is_forward).dtype()
cache_dtype = self._find_var(block_desc, cache_var,
is_forward).dtype()
if not compare_shape(x_shape, cache_shape, level):
continue
# TODO(qijun): actually, we should compare
# dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
if x_dtype != cache_dtype:
......@@ -256,8 +265,6 @@ class ControlFlowGraph(object):
"var shape is %s ") % (index, x, cache_var,
str(cache_shape)))
self.pool.pop(index)
if x == cache_var:
break
# Rename the var to the cache var already with
# memory allocated in order to reuse the memory.
_rename_arg_(self._ops, x, cache_var, begin_idx=i)
......@@ -266,16 +273,7 @@ class ControlFlowGraph(object):
is_forward)
self._update_graph(x, cache_var, begin_idx=i)
break
in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
can_optimize = [
x for x in in_diff
if self._check_var_validity(block_desc, x, is_forward)
]
if can_optimize:
for var_name in can_optimize:
self.pool.append((var_name, self._find_var(
block_desc, var_name, is_forward).shape()))
self._fill_pool(i, is_forward)
def _process_sub_block_pair(pdesc, sub_block_pair):
......@@ -357,7 +355,7 @@ def _get_cfgs(input_program):
:return: A list of ControlFlowGraph, each corresponds to a block.
"""
ops_list = []
pdesc = input_program.get_desc()
pdesc = input_program._get_desc()
block_desc = pdesc.block(0)
op_size = block_desc.op_size()
......@@ -383,10 +381,13 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
    Note: it does not support a subblock nested inside a subblock.
:param input_program: Input Program
:param print_log: whether to print debug log.
:param level: If level=0, reuse if the shape is completely equal, o
:return:
Args:
        input_program(Program): Input Program
        skip_opt_set(set): vars that will be skipped during memory optimization
        print_log(bool): whether to print debug log.
        level(int): If level=0, reuse a cached var only when shapes are
            exactly equal; if level=1, reuse when the total size fits.
Returns:
None
"""
if level != 0 and level != 1:
raise ValueError("only support opt_level 0 or 1.")
......@@ -407,6 +408,9 @@ def release_memory(input_program, skip_opt_set=None):
Args:
input_program(Program): The program will be inserted :code:`delete_op`.
        skip_opt_set(set): vars that will be skipped during memory optimization
Returns:
None
"""
cfgs = _get_cfgs(input_program)
for cfg in cfgs:
......
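A minimal end-to-end sketch of the public entry point (the toy program is illustrative):

import paddle.fluid as fluid

prog = fluid.Program()
with fluid.program_guard(prog, fluid.Program()):
    x = fluid.layers.data(name='x', shape=[8], dtype='float32')
    loss = fluid.layers.mean(fluid.layers.fc(input=x, size=4))
    fluid.backward.append_backward(loss)
# rewrite prog in place so the memory of dead vars is reused
fluid.memory_optimize(prog, skip_opt_set=None, print_log=False, level=0)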
......@@ -2,7 +2,7 @@ requests==2.9.2
numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version
protobuf==3.1
recordio>=0.1.0
matplotlib
matplotlib==2.2.3 # TODO: let the python3 paddlepaddle package use the latest matplotlib
rarfile
scipy>=0.19.0
Pillow
......