提交 13509da6 编写于 作者: Y Yibing Liu

Merge upstream to branch wrap_squeezes

......@@ -113,6 +113,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
......@@ -148,6 +149,7 @@ paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=No
paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None))
paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
......@@ -166,6 +168,7 @@ paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], vara
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
......@@ -380,7 +383,7 @@ paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, a
paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool
paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None
paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None
paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
......
......@@ -64,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() {
RegType(size_t, proto::VarType::SIZE_T);
RegType(int16_t, proto::VarType::INT16);
RegType(uint8_t, proto::VarType::UINT8);
RegType(int8_t, proto::VarType::INT8);
#undef RegType
return retv;
......
......@@ -54,6 +54,9 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
case proto::VarType::INT16:
visitor.template operator()<int16_t>();
break;
case proto::VarType::INT8:
visitor.template operator()<int8_t>();
break;
default:
PADDLE_THROW("Not supported %d", type);
}
......
......@@ -754,17 +754,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
node->Op()->Type());
CreateComputationalOp(result, node, op_dev_id);
if (node->Op()->Type() == "concat") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
"fetch_barrier");
}
void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
for (ir::Node *input : node->inputs) {
VarHandle *var = nullptr;
for (int place_offset = 0; place_offset < num_places; ++place_offset) {
auto &var_holders = result->Get<GraphVars>(kGraphVars)[place_offset];
auto &var_holder = var_holders[input->Name()];
if (!var_holder.empty()) {
var = var_holder.rbegin()->get();
op_handle->AddInput(var);
}
}
}
}
// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
ir::Node *node) const {
// FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode
// put them into transpiler.
int op_dev_id = -1;
if (node->Op()->Type() == "send") {
// TODO(paddle-dev): getting the first var is not safe.
......@@ -799,8 +808,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
}
auto recv_param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
// FIXME(typhoonzero): assume each recv op output one param
// Use the same place as send.
if (recv_param_grad.size() == 2U) {
op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
VLOG(10) << "recv param " << recv_param_grad[0]
......@@ -814,34 +821,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
.emplace(varname, op_dev_id);
}
} else {
// send_barrier and fetch_barrier op can be scheduled on device 0
// send_barrier, fetch_barrier will run on place 0;
op_dev_id = 0;
}
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
node->Op()->Type());
result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
node->Op()->Type(), places_[op_dev_id]));
// TODO(panyx0718): This might not be needed anymore.
if (node->Op()->Type() == "send_barrier") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "send");
} else if (node->Op()->Type() == "recv") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
"send_barrier");
} else if (node->Op()->Type() == "fetch_barrier") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "recv");
} else if (node->Op()->Type() == "send") {
// do nothing
if (node->Op()->Type() == "send") {
CreateOpHandleIOs(result, node, op_dev_id);
} else {
PADDLE_THROW(
"rpc op should be in ["
"send, send_barrier. recv, fetch_barrier]");
}
// send_barrier, recv, fetch_barrier's inputs are deps var, get them from
// all places
auto p = places_[op_dev_id];
auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
CreateOpHandleIOs(result, node, op_dev_id);
SetOpInputsAllPlaces(result, node, places_.size());
for (ir::Node *output : node->outputs) {
int outvar_dev_id = op_dev_id;
if (node->Op()->Type() == "fetch_barrier") {
outvar_dev_id = GetVarDeviceID(*result, output->Name());
PADDLE_ENFORCE_NE(outvar_dev_id, -1);
}
p = places_[outvar_dev_id];
ir::Node *new_node = nullptr;
if (output->Var()) {
new_node = result->CreateVarNode(output->Var());
} else {
new_node =
result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
}
CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
}
}
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
......
......@@ -107,6 +107,7 @@ message VarType {
// Tensor<size_t> is used in C++.
SIZE_T = 19;
UINT8 = 20;
INT8 = 21;
// Other types that may need additional descriptions
LOD_TENSOR = 7;
......
......@@ -132,63 +132,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
}
}
std::vector<ir::Node *> send_ops;
ir::Node *send_bar = nullptr;
std::vector<ir::Node *> recv_ops;
ir::Node *fetch_bar = nullptr;
for (ir::Node *node : Nodes()) {
if (node->Name() == "send") {
send_ops.push_back(node);
} else if (node->Name() == "send_barrier") {
PADDLE_ENFORCE(!send_bar, "only has one send barrier");
send_bar = node;
} else if (node->Name() == "recv") {
recv_ops.push_back(node);
} else if (node->Name() == "fetch_barrier") {
PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier");
fetch_bar = node;
}
}
if (send_bar) {
for (ir::Node *send : send_ops) {
ir::Node *dep_var = CreateControlDepVar();
send->outputs.push_back(dep_var);
dep_var->inputs.push_back(send);
send_bar->inputs.push_back(dep_var);
dep_var->outputs.push_back(send_bar);
}
for (ir::Node *recv : recv_ops) {
ir::Node *dep_var = CreateControlDepVar();
recv->inputs.push_back(dep_var);
dep_var->outputs.push_back(recv);
send_bar->outputs.push_back(dep_var);
dep_var->inputs.push_back(send_bar);
}
}
if (fetch_bar) {
for (ir::Node *recv : recv_ops) {
ir::Node *dep_var = CreateControlDepVar();
recv->outputs.push_back(dep_var);
dep_var->inputs.push_back(recv);
fetch_bar->inputs.push_back(dep_var);
dep_var->outputs.push_back(fetch_bar);
}
}
std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
for (ir::Node *node : Nodes()) {
if (IsDistTrainOp(node, send_vars, recv_vars)) {
if (fetch_bar && node->Name() == "concat") {
ir::Node *dep_var = CreateControlDepVar();
fetch_bar->outputs.push_back(dep_var);
dep_var->inputs.push_back(fetch_bar);
node->inputs.push_back(dep_var);
dep_var->outputs.push_back(node);
}
}
}
/**
* We should handle write after read(WAR) and write after write(WAW) here.
* Because some of the operators of the program can be executed parallelly.
......
......@@ -40,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
"When calling this method, the Tensor's numel must be "
"equal or larger than zero. "
"Please check Tensor::Resize has been called first.");
size_t size = requested_size ? requested_size : numel() * SizeOfType(type);
size_t size = numel() * SizeOfType(type);
if (requested_size) {
PADDLE_ENFORCE_GE(requested_size, size);
size = requested_size;
}
/* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
......
......@@ -72,7 +72,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
"depthwise_conv2d", "batch_norm"});
"depthwise_conv2d", "batch_norm", "concat"});
if (!node->IsFunction()) return false;
const auto* func = static_cast<const Function*>(node);
......
......@@ -32,6 +32,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true;
VLOG(3) << "Predictor::init()";
FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
FLAGS_tensorrt_workspace_size = config_.workspace_size;
......@@ -161,3 +162,4 @@ USE_TRT_CONVERTER(fc);
USE_TRT_CONVERTER(pool2d);
USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER(batch_norm);
USE_TRT_CONVERTER(concat);
......@@ -37,6 +37,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3;
config1.device = 0;
config1.max_batch_size = 10;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
......
......@@ -23,9 +23,11 @@ include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
if (NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
endif(NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3")
......
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
......@@ -18,12 +18,12 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
*/
class ConcatOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
std::vector<nvinfer1::ITensor*> itensors;
for (auto& input_name : op_desc.Input("X")) {
itensors.push_back(engine_->GetITensor(input_name));
}
int axis = boost::get<int>(op_desc.GetAttr("axis"));
PADDLE_ENFORCE(axis > 0,
"The axis attr of Concat op should be large than 0 for trt");
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
itensors.size());
axis = axis - 1; // Remove batch dim
layer->setAxis(axis);
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter);
......@@ -79,6 +79,14 @@ class OpConverter {
it =
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
}
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
}
if (op_desc.Type() == "depthwise_conv2d") {
it = Registry<OpConverter>::Lookup("conv2d");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
}
if (!it) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(concat_op, test) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1));
validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1));
validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1));
validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("concat");
desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
desc.SetOutput("Out", {"concat_out"});
int axis = 1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(5);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(concat);
......@@ -11,12 +11,18 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES
#include "paddle/fluid/memory/detail/system_allocator.h"
#include <stdlib.h> // for malloc and free
#ifdef _WIN32
#include <malloc.h>
#include <windows.h> // VirtualLock/VirtualUnlock
#else
#include <sys/mman.h> // for mlock and munlock
#include <algorithm> // for std::max
#endif
#include <stdlib.h> // for malloc and free
#include <algorithm> // for std::max
#include "gflags/gflags.h"
#include "paddle/fluid/platform/assert.h"
......@@ -35,31 +41,42 @@ namespace paddle {
namespace memory {
namespace detail {
void* CPUAllocator::Alloc(size_t* index, size_t size) {
// According to http://www.cplusplus.com/reference/cstdlib/malloc/,
// malloc might not return nullptr if size is zero, but the returned
// pointer shall not be dereferenced -- so we make it nullptr.
if (size <= 0) return nullptr;
*index = 0; // unlock memory
void* AlignedMalloc(size_t size) {
void* p = nullptr;
size_t alignment = 32ul;
#ifdef PADDLE_WITH_MKLDNN
// refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
// memory alignment
PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!",
size);
alignment = 4096ul;
#endif
#ifdef _WIN32
p = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!",
PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!",
size);
#endif
PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
return p;
}
void* CPUAllocator::Alloc(size_t* index, size_t size) {
// According to http://www.cplusplus.com/reference/cstdlib/malloc/,
// malloc might not return nullptr if size is zero, but the returned
// pointer shall not be dereferenced -- so we make it nullptr.
if (size <= 0) return nullptr;
*index = 0; // unlock memory
void* p = AlignedMalloc(size);
if (p != nullptr) {
if (FLAGS_use_pinned_memory) {
*index = 1;
#ifdef _WIN32
VirtualLock(p, size);
#else
mlock(p, size); // lock memory
#endif
}
}
......@@ -68,7 +85,11 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
void CPUAllocator::Free(void* p, size_t size, size_t index) {
if (p != nullptr && index == 1) {
#ifdef _WIN32
VirtualUnlock(p, size);
#else
munlock(p, size);
#endif
}
free(p);
}
......
......@@ -291,6 +291,8 @@ op_library(unsqueeze_op DEPS reshape_op)
op_library(squeeze_op DEPS reshape_op)
op_library(extract_rows_op DEPS memory)
op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
......
......@@ -60,6 +60,20 @@ class AucKernel : public framework::OpKernel<T> {
const T* inference_data = predict->data<T>();
const auto* label_data = label->data<int64_t>();
// check if states are inited.
auto* tp_in = ctx.Input<Tensor>("TP");
auto* fp_in = ctx.Input<Tensor>("FP");
auto* tn_in = ctx.Input<Tensor>("TN");
auto* fn_in = ctx.Input<Tensor>("FN");
PADDLE_ENFORCE(tp_in->IsInitialized(), "true_positive is not inited!");
PADDLE_ENFORCE(fp_in->IsInitialized(), "false_negative is not inited!");
PADDLE_ENFORCE(tn_in->IsInitialized(), "true_negative is not inited!");
PADDLE_ENFORCE(fn_in->IsInitialized(), "false_positive is not inited!");
PADDLE_ENFORCE_EQ(tp_in->numel(), num_thresholds, "");
PADDLE_ENFORCE_EQ(fp_in->numel(), num_thresholds, "");
PADDLE_ENFORCE_EQ(tn_in->numel(), num_thresholds, "");
PADDLE_ENFORCE_EQ(fn_in->numel(), num_thresholds, "");
auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
......
......@@ -18,15 +18,32 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct DequantizeFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
auto in_e = framework::EigenVector<T>::Flatten(*in);
const T* scale_factor = scale->data<T>();
auto out_e = framework::EigenVector<T>::Flatten(*out);
auto& dev = *dev_ctx.eigen_device();
out_e.device(dev) = (scale_factor[0] / max_range) * in_e;
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
public:
FakeDequantizeMaxAbsOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
FakeDequantizeMaxAbsOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FakeDequantizeMaxAbsOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
......@@ -42,21 +59,17 @@ class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X",
"(Tensor) The input with float-32/64 type is the "
"low precision tensor.");
AddInput("Scale", "(float) The scale in quantization stage.");
AddOutput("Out",
"(Tensor) The output is the dequantized high "
"precision tensor.");
AddAttr<int>("num_bits",
"(int) `num_bits` is the quantization level bits, "
"such as 2, 5, 8.");
AddAttr<float>("scale",
"(float) The maximum absolute value of low precision tensor."
"It is usually calculated by the fake_quantize_max_abs_op.");
AddAttr<float>("max_range", "(float) The max range in quantization stage.");
AddComment(R"DOC(
FakeDequantizeMaxAbsOp operator.
This calculation is an opposite operation of FakeQuantizeMaxAbsOp:
$$Out = \frac{scale*X}{2^{num_bits} - 1}$$
$$Out = \frac{scale*X}{ max_range }$$
)DOC");
}
......
......@@ -14,6 +14,42 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num,
T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) {
out[idx] = in[idx] * scale[0] / max_range;
}
}
template <typename T>
struct DequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
const T* in_data = in->data<T>();
const T* scale_factor = scale->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = in->numel();
int block = 512;
int grid = (num + block - 1) / block;
KeDequantize<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, out_data);
}
};
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
......
......@@ -19,22 +19,29 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
struct DequantizeFunctor {
void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
const framework::Tensor* scale, T max_range,
framework::Tensor* out);
};
template <typename DeviceContext, typename T>
class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
auto* in = ctx.Input<framework::Tensor>("X");
auto* scale = ctx.Input<framework::Tensor>("Scale");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(in->place());
int num_bits = ctx.Attr<int>("num_bits");
T scale = static_cast<T>(ctx.Attr<float>("scale"));
int range = std::pow(2, num_bits) - 1;
float max_range = ctx.Attr<float>("max_range");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
eigen_out.device(dev) = (scale / range) * eigen_in;
DequantizeFunctor<DeviceContext, T>()(dev_ctx, in, scale,
static_cast<T>(max_range), out);
}
};
......
......@@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase {
class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC(
SendBarrier operator
......
......@@ -15,10 +15,14 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_bool(seq_mode, true, "Use sequence mode");
namespace paddle {
namespace operators {
......@@ -98,7 +102,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
ctx->ShareLoD("X", "Hidden");
ctx->ShareLoD("X", "Cell");
int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
int xx_width;
if (FLAGS_seq_mode) {
xx_width = wx_dims[1];
} else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
}
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX");
}
......@@ -205,10 +214,138 @@ inline void ReorderInitState(const DeviceContext& ctx,
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
template <typename DeviceContext, typename T>
template <typename T>
class FuisonLSTMKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* h0 = ctx.Input<Tensor>("H0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
auto* bias = ctx.Input<Tensor>("Bias");
auto* xx = ctx.Output<LoDTensor>("XX");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
auto* cell_out = ctx.Output<LoDTensor>("Cell");
bool is_reverse = ctx.Attr<bool>("is_reverse");
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
}
auto x_lod = x->lod();
auto x_dims = x->dims(); // T x M
auto wh_dims = wh->dims(); // D x 4D
const int total_T = x_dims[0];
const int N = x_lod[0].size() - 1; // batch size
const int M = x_dims[1]; // x frame size
const int D = wh_dims[0];
const int D2 = D * 2;
const int D3 = D * 3;
const int D4 = wh_dims[1];
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : NULL;
const T* c0_data = c0 ? c0->data<T>() : NULL;
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
xx_data, bias->data<T>());
int xx_offset = D4;
int gate_offset = D;
if (is_reverse) {
const int offset = (total_T - 1) * D;
xx_data = xx_data + offset * 4;
hidden_out_data = hidden_out_data + offset;
cell_out_data = cell_out_data + offset;
xx_offset = -D4;
gate_offset = -D;
}
auto move_step = [&]() {
xx_data = xx_data + xx_offset;
hidden_out_data = hidden_out_data + gate_offset;
cell_out_data = cell_out_data + gate_offset;
};
for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i;
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
const T* prev_cell_data = NULL;
const T* prev_hidden_data = NULL;
int tstart = 0;
if (h0_data) {
prev_hidden_data = h0_data + bid * D;
prev_cell_data = c0_data + bid * D;
} else {
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// cell out= input*tilde
blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
tstart = 1;
move_step();
}
for (int step = tstart; step < seq_len; ++step) {
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D4, static_cast<T>(1), xx_data,
D4);
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// a = forget * prev_cell
blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2);
// b = input * tilde
blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
// cell out= a+b
blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
move_step();
}
}
}
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
......@@ -339,6 +476,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
// restore the output cell state in LoDTensor from the batch cell
to_seq(dev_ctx, batch_cell, cell_out);
}
void Compute(const framework::ExecutionContext& ctx) const override {
if (FLAGS_seq_mode) {
SeqCompute(ctx);
} else {
BatchCompute(ctx);
}
}
};
} // namespace operators
......@@ -348,7 +492,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(
fusion_lstm,
ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, float>,
ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel<float>,
ops::FuisonLSTMKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
void FusionSeqExpandConcatFCOp::InferShape(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_GT(
ctx->Inputs("X").size(), 1UL,
"Inputs(X) of FusionSeqExpandConcatFCOp should larger than 1.");
PADDLE_ENFORCE(
ctx->HasInput("FCWeight"),
"Input(FCWeight) of FusionSeqExpandConcatFCOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FusionSeqExpandConcatFCOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("FCOut"),
"Output(FCOut) of FusionSeqExpandConcatFCOp should not be null.");
auto ins_dims = ctx->GetInputsDim("X");
auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2.");
const int D = w_dims[1];
int sum = ins_dims[0][1];
for (size_t i = 1; i < ins_dims.size(); ++i) {
sum += ins_dims[i][1];
}
PADDLE_ENFORCE_EQ(sum, w_dims[0],
"FC height should be sum of all inputs width.");
if (ctx->HasInput("FCBias")) {
auto b_dims = ctx->GetInputDim("FCBias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(FCBias)'s rank must be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1, "FCBias shapes must be 1 * %d.", D);
PADDLE_ENFORCE_EQ(b_dims[1], D, "FCBias shapes must be 1 * %d.", D);
}
ctx->SetOutputDim("Out", {ins_dims[0][0], D});
// fcout should be reshape when run since can not get lod in infershape
// explicit share the ref lod
ctx->ShareLoD("X", "Out", 0);
}
framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::ToDataType(ctx.MultiInput<LoDTensor>("X")[0]->type()),
ctx.device_context());
}
void FusionSeqExpandConcatFCOpMaker::Make() {
AddInput("X",
"(LoDTensor) input LodDTensors, the first one must be have ref lod "
"for sequence expand, and the rest input should have same lod.")
.AsDuplicable();
AddInput("FCWeight", "(Tensor) the weights of fc.");
AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable();
AddOutput("Out", "(LoDTensor) Output LodTensor.");
AddOutput(
"FCOut",
"(Tensor) the intermediate tensor to keep the result of fc."
"Shape is (N x D), where N is the batch size, D is the output dim of fc")
.AsIntermediate();
AddAttr<std::string>("fc_activation",
"(string, default: identity)"
"The activation for the result of fc."
"`identity` by default.")
.SetDefault("identity")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddComment(R"DOC(
Fusion Sequence expand + concat + fc Operator.
All below conditions should be meet:
The ref_level of seq_expand should be 0.
The ref lod of seq_expand level is the first input of concat.
The other inputs should have same lod and same batch size of ref lod.
The seq len of other inputs should be 1.
The concat axis should be 1.
)DOC");
}
template <typename T>
class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto ins = ctx.MultiInput<LoDTensor>("X");
auto* w = ctx.Input<Tensor>("FCWeight");
auto* b = ctx.Input<Tensor>("FCBias");
auto* out = ctx.Output<LoDTensor>("Out");
auto* fc_out = ctx.Output<Tensor>("FCOut");
auto* ref_in = ins[0];
auto ref_lod = ref_in->lod();
auto in1_lod = ins[1]->lod();
auto ref_dims = ref_in->dims(); // T x M0
auto in1_dims = ins[1]->dims(); // N x M1
auto w_dims = w->dims();
const int N = ref_lod[0].size() - 1;
const int total_T = ref_dims[0];
const int M0 = ref_dims[1];
const int M1 = in1_dims[1];
const int D = w_dims[1];
// some check and fcout should be reshape here
// since infershape can not get lod info
PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1.");
PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1.");
PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N,
"Batch size of all inputs should be equal.");
PADDLE_ENFORCE_EQ(in1_lod[0][N], N,
"Seq_length of other inputs should be 1.");
PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size.");
for (size_t i = 2; i < ins.size(); ++i) {
PADDLE_ENFORCE_EQ(ins[i]->dims()[0], N,
"All other inputs height should be equal");
PADDLE_ENFORCE_EQ(ins[i]->lod(), in1_lod,
"All other inputs should have same lod");
}
fc_out->Resize({N, D});
std::function<void(const int, const T*, T*)> fc_act;
auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
fc_act = act_functor(fc_act_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
fc_act = act_functor(fc_act_str);
}
const T* ref_in_data = ref_in->data<T>();
const T* in1_data = ins[1]->data<T>();
const T* w_data = w->data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D, M0, ref_in_data, w_data,
out_data, b ? b->data<T>() : NULL);
w_data = w_data + M0 * D;
// first write on
blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data);
w_data = w_data + M1 * D;
for (size_t i = 2; i < ins.size(); ++i) {
// add on
const T* in_data = ins[i]->data<T>();
const int K = ins[i]->dims()[1];
blas.GEMM(CblasNoTrans, CblasNoTrans, N, D, K, static_cast<T>(1), in_data,
K, w_data, D, static_cast<T>(1), fc_out_data, D);
w_data = w_data + K * D;
}
T* cur_out_data = out_data;
for (int i = 0; i < N; ++i) {
int seq_len = ref_lod[0][i + 1] - ref_lod[0][i];
T* src = fc_out_data + i * D;
for (int step = 0; step < seq_len; ++step) {
blas.VADD(D, cur_out_data, src, cur_out_data);
cur_out_data = cur_out_data + D;
}
}
fc_act(total_T * D, out_data, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp,
ops::FusionSeqExpandConcatFCOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc,
ops::FusionSeqExpandConcatFCOpKernel<float>,
ops::FusionSeqExpandConcatFCOpKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusionSeqExpandConcatFCOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
......@@ -177,6 +177,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
out_row, out_col, output->data<T>());
}
// Wait() must be called because `inputs_data` may be destructed before
// kernel ends
context.Wait();
}
};
......@@ -252,6 +255,9 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
input.data<T>(), in_row, in_col, dev_outs_col_data,
static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
}
// Wait() must be called because `outputs_data` may be destructed before
// kernel ends
context.Wait();
}
};
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <sys/time.h>
#include <cmath>
#include <cstring>
#include <random>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
......
......@@ -41,7 +41,8 @@ template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
......
......@@ -33,10 +33,11 @@ template struct SetConstant<platform::CUDADeviceContext, int>;
template struct SetConstant<platform::CUDADeviceContext, int64_t>;
template struct SetConstant<platform::CUDADeviceContext, bool>;
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<platform::CUDADeviceContext, double, RANK>; \
template struct Transpose<platform::CUDADeviceContext, float16, RANK>;
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<platform::CUDADeviceContext, double, RANK>; \
template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;
DEFINE_GPU_TRANS(1);
DEFINE_GPU_TRANS(2);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D>
void PadFunction(const framework::ExecutionContext& context,
const std::vector<int>& pads, const framework::Tensor& src,
T pad_value, framework::Tensor* out) {
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = pads[i * 2];
paddings[i].second = pads[i * 2 + 1];
}
auto src_tensor = EigenTensor<T, D>::From(src);
auto out_tensor = EigenTensor<T, D>::From(*out);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
out_tensor.device(place) = src_tensor.pad(paddings, pad_value);
}
template <typename DeviceContext, typename T, size_t D>
void PadGradFunction(const framework::ExecutionContext& context,
const std::vector<int>& pads, const framework::Tensor& src,
framework::Tensor* d_out) {
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = -pads[i * 2];
paddings[i].second = -pads[i * 2 + 1];
}
auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
auto src_tensor = EigenTensor<T, D>::From(src);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
d_out_tensor.device(place) = src_tensor.pad(paddings, 0);
}
template <typename DeviceContext, typename T>
void PaddingFunctor(int rank, const framework::ExecutionContext& context,
const std::vector<int>& pads, T pad_value,
const framework::Tensor& src, framework::Tensor* out) {
switch (rank) {
case 1:
PadFunction<DeviceContext, T, 1>(context, pads, src, pad_value, out);
break;
case 2:
PadFunction<DeviceContext, T, 2>(context, pads, src, pad_value, out);
break;
case 3:
PadFunction<DeviceContext, T, 3>(context, pads, src, pad_value, out);
break;
case 4:
PadFunction<DeviceContext, T, 4>(context, pads, src, pad_value, out);
break;
case 5:
PadFunction<DeviceContext, T, 5>(context, pads, src, pad_value, out);
break;
case 6:
PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
}
}
template <typename DeviceContext, typename T>
void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
const std::vector<int>& pads,
const framework::Tensor& src, framework::Tensor* out) {
switch (rank) {
case 1:
PadGradFunction<DeviceContext, T, 1>(context, pads, src, out);
break;
case 2:
PadGradFunction<DeviceContext, T, 2>(context, pads, src, out);
break;
case 3:
PadGradFunction<DeviceContext, T, 3>(context, pads, src, out);
break;
case 4:
PadGradFunction<DeviceContext, T, 4>(context, pads, src, out);
break;
case 5:
PadGradFunction<DeviceContext, T, 5>(context, pads, src, out);
break;
case 6:
PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
}
}
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -18,65 +18,86 @@ namespace paddle {
namespace operators {
namespace math {
template <typename T>
void CopyValidData(framework::Tensor* dst_tensor,
const framework::Tensor* src_tensor,
const framework::Vector<size_t>& seq_offsets,
int pad_seq_len, int step_width, bool norm_by_len,
CopyType type, PadLayout layout) {
int seq_num = seq_offsets.size() - 1;
const T* src_data = src_tensor->data<T>();
T* dst_data = dst_tensor->data<T>();
int seq_cpy_gap = step_width;
int pad_cpy_gap =
layout == kBatchLengthWidth ? step_width : seq_num * step_width;
for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) {
int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
PADDLE_ENFORCE_GE(
pad_seq_len, valid_seq_len,
"The padded sequence length can not be less than its original length.");
int seq_data_offset = seq_offsets[seq_idx] * step_width;
int pad_data_offset = layout == kBatchLengthWidth
? seq_idx * pad_seq_len * step_width
: seq_idx * step_width;
float scale = 1.0f / static_cast<float>(valid_seq_len);
for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
const T* src =
src_data + (type == kSeqToPad ? seq_data_offset : pad_data_offset);
T* dst =
dst_data + (type == kSeqToPad ? pad_data_offset : seq_data_offset);
memcpy(dst, src, step_width * sizeof(T));
if (norm_by_len) {
for (int i = 0; i < step_width; ++i) {
*(dst + i) *= scale;
}
}
seq_data_offset += seq_cpy_gap;
pad_data_offset += pad_cpy_gap;
}
}
}
template <typename T>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor* padding,
bool norm_by_times) {
auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The LoD of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims();
PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding->dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequence_length, num_sequences, sequence_width].");
const int64_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be the "
"maximum length of all sequences in LoDTensor seq.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be the "
"number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
const T* seq_data = seq.data<T>();
T* padding_data = padding->data<T>();
for (int64_t i = 0; i < max_sequence_length; ++i) {
for (int64_t j = 0; j < num_sequences; ++j) {
int64_t start_pos = abs_offset_lod[level][j];
int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos;
if (i < sequence_length) {
// i > 0 => sequence_length > 0
T scale =
norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
for (int64_t k = 0; k < sequence_width; ++k) {
padding_data[(i * num_sequences + j) * sequence_width + k] =
seq_data[(start_pos + i) * sequence_width + k] * scale;
}
} else {
memset(padding_data + (i * num_sequences + j) * sequence_width, 0,
sequence_width * sizeof(T));
}
const framework::LoDTensor& seq_tensor,
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
if (pad_seq_len == -1) {
pad_seq_len = MaximumSequenceLength(seq_offsets);
}
int step_width = seq_tensor.numel() / seq_tensor_dims[0];
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
step_width, layout);
PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
"The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width'.");
// fill padding value
T* pad_data = pad_tensor->data<T>();
const T* pad_value_data = pad_value.data<T>();
if (pad_value.numel() == 1) {
for (int i = 0; i < pad_tensor->numel(); ++i) {
pad_data[i] = *pad_value_data;
}
} else {
for (int i = 0; i < pad_tensor->numel(); i += step_width) {
memcpy(pad_data + i, pad_value_data, step_width * sizeof(T));
}
}
CopyValidData<T>(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len,
step_width, norm_by_times, kSeqToPad, layout);
}
};
......@@ -84,62 +105,35 @@ template <typename T>
class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
framework::LoDTensor* seq, const framework::Tensor& padding,
bool norm_by_times) {
auto lod = seq->lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The LoD of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq->dims();
PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding.dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequnece_length, num_sequences, sequence_width].");
const int64_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be "
"the maximum length of all sequences in LoDTensor seq.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq->numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
const T* padding_data = padding.data<T>();
T* seq_data = seq->data<T>();
for (int64_t i = 0; i < num_sequences; ++i) {
int64_t start_pos = abs_offset_lod[level][i];
int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
for (int64_t j = 0; j < sequence_length; ++j) {
// sequence_width > j > 0
T scale =
norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
for (int64_t k = 0; k < sequence_width; ++k) {
seq_data[(start_pos + j) * sequence_width + k] =
padding_data[(j * num_sequences + i) * sequence_width + k] *
scale;
}
}
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
const auto& pad_tensor_dims = pad_tensor.dims();
if (pad_seq_len == -1) {
pad_seq_len = MaximumSequenceLength(seq_offsets);
}
int step_width = seq_tensor->numel() / seq_tensor_dims[0];
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
step_width, layout);
CopyValidData<T>(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len,
step_width, norm_by_times, kPadToSeq, layout);
}
};
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
......
......@@ -19,41 +19,32 @@ namespace paddle {
namespace operators {
namespace math {
template <typename T, bool NormByTimes, bool Padding>
__global__ void SequencePaddingKernel(T* padding, T* sequence,
const size_t* sequence_start_positions,
const size_t sequence_width,
const size_t max_sequence_length,
const size_t num_sequences) {
size_t padding_idx = blockIdx.y;
size_t start_pos = sequence_start_positions[padding_idx];
size_t sequence_length =
sequence_start_positions[padding_idx + 1] - start_pos;
size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y;
size_t padding_base_idx =
(sequence_idx * num_sequences + padding_idx) * sequence_width;
size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width;
if (sequence_idx < sequence_length) {
T scale = NormByTimes ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
if (Padding) {
/* sequence -> padding */
for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i];
}
} else {
/* padding -> sequence */
for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i];
}
template <typename T, CopyType Type>
__global__ void SequencePaddingKernel(
T* dst, const T* src, const T* pad_value, bool is_constant_pad,
const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len,
const size_t step_width, bool norm_by_len, const PadLayout layout) {
size_t seq_idx = blockIdx.y;
size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y;
size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width;
size_t pad_data_offset = layout == kBatchLengthWidth
? (seq_idx * pad_seq_len + step_idx) * step_width
: (step_idx * seq_num + seq_idx) * step_width;
T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset);
const T* src_data =
src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);
if (step_idx < seq_len) {
float scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
dst_data[i] = scale * src_data[i];
}
} else if (sequence_idx < max_sequence_length) {
if (Padding) {
/* sequence -> padding */
for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
padding[padding_base_idx + i] = 0;
}
} else if (step_idx < pad_seq_len && Type == kSeqToPad) {
for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i];
}
}
}
......@@ -62,74 +53,59 @@ template <typename T>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor* padding,
bool norm_by_times) {
auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The lod of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims();
PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding->dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequence_length, num_sequences, sequence_width].");
int64_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be the "
"maximum length of all sequences in LoDTensor seq.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be the "
"number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
TensorCopy(seq, context.GetPlace(), context, padding);
padding->Resize(padding_dims);
const framework::LoDTensor& seq_tensor,
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
int max_seq_len = MaximumSequenceLength(seq_offsets);
if (pad_seq_len == -1) {
pad_seq_len = max_seq_len;
}
PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len,
"The pad_seq_len must be equal to or greater than the "
"original max sequence length.");
int step_width = seq_tensor.numel() / seq_tensor_dims[0];
int seq_num = seq_offsets.size() - 1;
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
step_width, layout);
PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
"The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width'.");
if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor);
pad_tensor->Resize(pad_tensor_dims);
return;
}
const int64_t kBlockSize = 512;
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = num_sequences;
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* seq_data = seq.data<T>();
T* padding_data = padding->data<T>();
if (norm_by_times) {
SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data),
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
} else {
SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data),
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
}
const T* seq_data = seq_tensor.data<T>();
T* pad_data = pad_tensor->data<T>();
const T* pad_value_data = pad_value.data<T>();
SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
step_width, norm_by_times, layout);
}
};
......@@ -137,79 +113,62 @@ template <typename T>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
framework::LoDTensor* seq, const framework::Tensor& padding,
bool norm_by_times) {
auto lod = seq->lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The lod of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq->dims();
PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding.dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequnece_length, num_sequences, sequence_width].");
int64_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be "
"the maximum length of all sequences in LoDTensor seq.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq->numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
TensorCopy(padding, context.GetPlace(), context, seq);
seq->Resize(seq_dims);
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
const auto& pad_tensor_dims = pad_tensor.dims();
int max_seq_len = MaximumSequenceLength(seq_offsets);
if (pad_seq_len == -1) {
pad_seq_len = max_seq_len;
}
int step_width = seq_tensor->numel() / seq_tensor_dims[0];
int seq_num = seq_offsets.size() - 1;
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
step_width, layout);
if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor);
seq_tensor->Resize(seq_tensor_dims);
return;
}
const int64_t kBlockSize = 512;
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = num_sequences;
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* padding_data = padding.data<T>();
T* seq_data = seq->data<T>();
if (norm_by_times) {
SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data,
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
} else {
SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data,
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
}
const T* pad_data = pad_tensor.data<T>();
T* seq_data = seq_tensor->data<T>();
SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
seq_data, pad_data, nullptr, false,
seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
step_width, norm_by_times, layout);
}
};
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -22,17 +23,33 @@ namespace paddle {
namespace operators {
namespace math {
inline static size_t MaximumSequenceLength(const framework::LoD& lod,
const size_t level) {
const size_t num_sequences = lod[level].size() - 1;
size_t max_sequence_length = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
for (size_t i = 0; i < num_sequences; ++i) {
max_sequence_length =
std::max(max_sequence_length,
abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]);
enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
enum CopyType { kSeqToPad, kPadToSeq };
inline static size_t MaximumSequenceLength(
const framework::Vector<size_t>& seq_offset) {
size_t seq_num = seq_offset.size() - 1;
size_t max_seq_len = 0;
for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
}
return max_sequence_length;
return max_seq_len;
}
inline static void CheckDims(const framework::DDim& seq_tensor_dims,
const framework::DDim& pad_tensor_dims,
const framework::Vector<size_t>& seq_offset,
int64_t padded_seq_len, int64_t step_width,
const PadLayout& layout) {
PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
"Value of 1st dimension of the sequence tensor should be "
"equal to sum of lengths of all sequences.");
PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
seq_tensor_dims.size() == pad_tensor_dims.size(),
"pad_tensor's rank should be 1 greater than seq_tensor's "
"rank, or be equal with it.");
}
/*
......@@ -64,15 +81,22 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod,
template <typename DeviceContext, typename T>
class PaddingLoDTensorFunctor {
public:
void operator()(const DeviceContext& context, const framework::LoDTensor& seq,
framework::Tensor* padding, bool norm_by_times);
void operator()(const DeviceContext& context,
const framework::LoDTensor& seq_tensor,
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth);
};
template <typename DeviceContext, typename T>
class UnpaddingLoDTensorFunctor {
public:
void operator()(const DeviceContext& context, framework::LoDTensor* seq,
const framework::Tensor& padding, bool norm_by_times);
void operator()(const DeviceContext& context,
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth);
};
} // namespace math
......
......@@ -23,7 +23,9 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
paddle::framework::LoDTensor cpu_seq_back;
paddle::framework::LoDTensor seq;
paddle::framework::LoDTensor seq_back;
paddle::framework::Tensor padding;
paddle::framework::LoDTensor padding;
paddle::framework::LoDTensor cpu_pad_value;
paddle::framework::LoDTensor pad_value;
const size_t level = lod.size() - 1;
auto seq_dims =
......@@ -46,20 +48,33 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
}
const size_t max_sequence_length =
paddle::operators::math::MaximumSequenceLength(lod, level);
paddle::operators::math::MaximumSequenceLength(lod[level]);
const size_t num_sequences = lod[level].size() - 1;
auto padding_dims =
paddle::framework::make_ddim({static_cast<int64_t>(max_sequence_length),
static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)});
padding.mutable_data<T>(padding_dims, *place);
T* pad_value_data =
cpu_pad_value.mutable_data<T>({1}, paddle::platform::CPUPlace());
*pad_value_data = static_cast<T>(0);
if (paddle::platform::is_cpu_place(*place)) {
pad_value = cpu_pad_value;
} else {
TensorCopySync(cpu_pad_value, *place, &pad_value);
}
paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
*context, seq, &padding, false);
*context, seq, &padding, pad_value, -1, 0, false,
paddle::operators::math::kLengthBatchWidth);
seq_back.set_lod(lod);
seq_back.mutable_data<T>(seq_dims, *place);
paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
*context, &seq_back, padding, false);
*context, padding, &seq_back, -1, 0, false,
paddle::operators::math::kLengthBatchWidth);
if (paddle::platform::is_cpu_place(*place)) {
cpu_seq_back = seq_back;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pad_constant_like_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class PadConstantLikeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of PadConstantLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"),
"Input(Y) of PadConstantLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of PadConstantLikeOp should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto y_dim = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(),
"The dimention of X and Y should be the same.");
for (int i = 0; i < x_dim.size(); ++i) {
PADDLE_ENFORCE_GE(x_dim[i], y_dim[i]);
}
ctx->SetOutputDim("Out", x_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
ctx.device_context());
}
};
class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input of pad_constant_like op. "
"The input should be a k-D tensor(k > 0 and k < 7)");
AddInput("Y",
"The input of pad_constant_like op. "
"The input should be a k-D tensor(k > 0 and k < 7)");
AddOutput("Out",
"The output of pad_constant_like op. "
"A tensor with the same shape as X.");
AddAttr<float>("pad_value",
"(float, default 0.0) "
"The value to fill the padded areas.")
.SetDefault(0.0f);
AddComment(R"DOC(
PadConstantLikeOp Operator.
Pad input(Y) with a pad_value, the number of values padded to the edges of each
axis is specified by the difference of the shape of X and Y.
((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for
each axis.
The input should be a k-D tensor(k > 0 and k < 7). As an example:
case1:
Given:
X = [[1, 2],
[3, 4],
[1, 2],
[3, 4]]],
X.shape = (4, 2)
Y = [[5, 6],
[7, 8]],
Y.shape = (2, 2)
And
pad_value = 0,
Return:
Out = [[5, 6],
[7, 8],
[0, 0],
[0, 0]]
Out.shape = (4, 2)
case2:
Given:
X = [[[[ 0, 1, 2],
[ 3, 4, 5]],
[[ 6, 7, 8],
[ 9, 10, 11]],
[[12, 13, 14],
[15, 16, 17]]],
[[[18, 19, 20],
[21, 22, 23]],
[[24, 25, 26],
[27, 28, 29]],
[[30, 31, 32],
[33, 34, 35]]]]
X.shape = (2, 3, 2, 3)
Y = [[[[35, 36, 37]],
[[38, 39, 40]],
[[41, 42, 43]]]]
Y.shape = (1, 3, 1, 3)
And
pad_value = -1,
Return:
Out = [[[[35, 36, 37],
[-1, -1, -1]],
[[38, 39, 40],
[-1, -1, -1]],
[[41, 42, 43],
[-1, -1, -1]]],
[[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]]]]
Out.shape = (2, 3, 2, 3)
)DOC");
}
};
class PadConstantLikeOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto y_dim = ctx->GetInputDim("Y");
auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(),
"The dimention of X and Y should be the same.");
auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dim);
ctx->ShareLoD("Y", /*->*/ y_grad_name);
for (int i = 0; i < y_dim.size(); ++i) {
PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i]);
}
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
ctx.device_context());
}
};
class PadConstantLikeOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *bind = new framework::OpDesc();
bind->SetType("pad_constant_like_grad");
bind->SetInput("Y", Input("Y"));
bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
bind->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
bind->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(bind);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOp,
ops::PadConstantLikeOpMaker, ops::PadConstantLikeOpGradMaker);
REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad);
REGISTER_OP_CPU_KERNEL(
pad_constant_like,
ops::PadConstantLikeKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadConstantLikeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
pad_constant_like_grad,
ops::PadConstantLikeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadConstantLikeGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/pad_constant_like_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
pad_constant_like,
ops::PadConstantLikeKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadConstantLikeKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
pad_constant_like_grad,
ops::PadConstantLikeGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadConstantLikeGradKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/padding.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class PadConstantLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto in_x = context.Input<framework::Tensor>("X");
auto in_y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
if (in_x->dims() == in_y->dims()) {
// TensorCopy(in_y, context.GetPlace(), context, out);
out->ShareDataWith(*in_y);
return;
}
T pad_value = context.Attr<T>("pad_value");
out->mutable_data<T>(context.GetPlace());
int rank = context.Input<framework::Tensor>("X")->dims().size();
std::vector<int> pads(rank * 2, 0);
for (int j = 0; j < rank; ++j) {
pads[j * 2] = 0;
pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
}
math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value,
*in_y, out);
}
};
template <typename DeviceContext, typename T>
class PadConstantLikeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto in_y = context.Input<framework::Tensor>("Y");
auto in_dout =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_y = context.Output<framework::Tensor>(framework::GradVarName("Y"));
if (d_y == nullptr) {
return;
}
if (in_dout->dims() == in_y->dims()) {
// TensorCopy(in_dout, context.GetPlace(), context, d_y);
d_y->ShareDataWith(*in_dout);
return;
}
d_y->mutable_data<T>(context.GetPlace());
int rank = in_dout->dims().size();
std::vector<int> pads(static_cast<size_t>(rank) * 2, 0);
for (int j = 0; j < rank; ++j) {
pads[j * 2] = 0;
pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]);
}
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *in_dout,
d_y);
}
};
} // namespace operators
} // namespace paddle
......@@ -18,117 +18,44 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/padding.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D>
void PadFunction(const framework::ExecutionContext& context) {
auto pads = context.Attr<std::vector<int>>("paddings");
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = pads[i * 2];
paddings[i].second = pads[i * 2 + 1];
}
T pad_value = context.Attr<T>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto x_tensor = EigenTensor<T, D>::From(*x);
auto out_tensor = EigenTensor<T, D>::From(*out);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
out_tensor.device(place) = x_tensor.pad(paddings, pad_value);
}
template <typename DeviceContext, typename T>
class PadKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
int rank = context.Input<Tensor>("X")->dims().size();
switch (rank) {
case 1:
PadFunction<DeviceContext, T, 1>(context);
break;
case 2:
PadFunction<DeviceContext, T, 2>(context);
break;
case 3:
PadFunction<DeviceContext, T, 3>(context);
break;
case 4:
PadFunction<DeviceContext, T, 4>(context);
break;
case 5:
PadFunction<DeviceContext, T, 5>(context);
break;
case 6:
PadFunction<DeviceContext, T, 6>(context);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
}
auto pads = context.Attr<std::vector<int>>("paddings");
T pad_value = context.Attr<T>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
int rank = x->dims().size();
math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value, *x,
out);
}
};
template <typename DeviceContext, typename T, size_t D>
void PadGradFunction(const framework::ExecutionContext& context) {
auto pads = context.Attr<std::vector<int>>("paddings");
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = -pads[i * 2];
paddings[i].second = -pads[i * 2 + 1];
}
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
if (d_x != nullptr) {
d_x->mutable_data<T>(context.GetPlace());
auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0);
}
}
template <typename DeviceContext, typename T>
class PadGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
size_t rank =
context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
switch (rank) {
case 1:
PadGradFunction<DeviceContext, T, 1>(context);
break;
case 2:
PadGradFunction<DeviceContext, T, 2>(context);
break;
case 3:
PadGradFunction<DeviceContext, T, 3>(context);
break;
case 4:
PadGradFunction<DeviceContext, T, 4>(context);
break;
case 5:
PadGradFunction<DeviceContext, T, 5>(context);
break;
case 6:
PadGradFunction<DeviceContext, T, 6>(context);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
auto pads = context.Attr<std::vector<int>>("paddings");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
if (d_x == nullptr) {
return;
}
d_x->mutable_data<T>(context.GetPlace());
int rank = d_out->dims().size();
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *d_out,
d_x);
}
};
......
......@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include <string>
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
......@@ -52,6 +55,21 @@ $$Out = scale*X$$
}
};
class ScaleOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
auto &in_var_name = op_desc.Input("X").front();
auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name));
auto out_var_name = op_desc.Output("Out").front();
auto *out_var = block->FindVarRecursive(out_var_name);
out_var->SetType(in_var.GetType());
out_var->SetDataType(in_var.GetDataType());
}
};
class ScaleGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
......@@ -71,7 +89,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker);
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker,
ops::ScaleOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(
scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
......
......@@ -22,17 +22,29 @@ namespace operators {
template <typename DeviceContext, typename T>
class ScaleKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& context) const {
auto* tensor = context.Output<framework::Tensor>("Out");
auto* in = context.Input<framework::Tensor>("X");
tensor->mutable_data<T>(in->place());
virtual void Compute(const framework::ExecutionContext& ctx) const {
auto* in_var = ctx.InputVar("X");
auto* in = ctx.Input<framework::Tensor>("X");
auto scale = static_cast<T>(context.Attr<float>("scale"));
auto* out_var = ctx.OutputVar("Out");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(in->place());
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
"in and out should have the same dim");
auto scale = static_cast<T>(ctx.Attr<float>("scale"));
if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
auto& in_slr = in_var->Get<framework::SelectedRows>();
auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
out_slr->set_rows(in_slr.rows());
out_slr->set_height(in_slr.height());
}
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto& dev =
*context.template device_context<DeviceContext>().eigen_device();
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
eigen_out.device(dev) = scale * eigen_in;
}
};
......
......@@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase {
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Any) Dummy inputs, used for control dependency")
.AsDuplicable();
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC(
SendBarrier operator
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h"
namespace paddle {
namespace operators {
class SequencePadOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequencePadOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PadValue"),
"Input(PadValue) of SequencePadOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequencePadOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(x_dims.size(), 2,
"The rank of Input(x) can't be less than 2.");
auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size());
auto pad_value_dims = ctx->GetInputDim("PadValue");
PADDLE_ENFORCE(pad_value_dims == framework::make_ddim({1}) ||
pad_value_dims == time_step_dims,
"The Input(PadValue) must be a scalar or a tensor whose "
"shape equals to time steps in sequences");
int out_dim_0 = -1;
int out_dim_1 = -1;
if (ctx->IsRuntime()) {
// run time
framework::Variable* x_var =
boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
const auto& x_lod = x_var->Get<LoDTensor>().lod();
PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info.");
const auto& x_lod_0 = x_lod[0];
PADDLE_ENFORCE_GE(x_lod_0.size(), 2,
"The Input(X)'s lod info is corrupted.");
PADDLE_ENFORCE_EQ(
x_dims[0], static_cast<int64_t>(x_lod_0.back()),
"The Input(X)'s lod info mismatches the actual tensor shape.");
int seq_num = x_lod_0.size() - 1;
int max_seq_len = math::MaximumSequenceLength(x_lod_0);
int padded_length = ctx->Attrs().Get<int>("padded_length");
if (padded_length == -1) {
padded_length = max_seq_len;
}
PADDLE_ENFORCE_GE(padded_length, max_seq_len,
"The Attr(padded_length) must be -1 or an int greater "
"than the length of the longest original sequence.");
out_dim_0 = seq_num;
out_dim_1 = padded_length;
} else {
// compile time
framework::VarDesc* x_desc =
boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]);
PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1);
}
std::vector<int> out_dims_vec{out_dim_0, out_dim_1};
auto time_step_dims_vec = framework::vectorize2int(time_step_dims);
out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(),
time_step_dims_vec.end());
ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
}
};
class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(LoDTensor, default LoDTensor<float>) Input variable which "
"should contain lod information.");
AddInput("PadValue",
"(LoDTensor), this Tensor holds values that will be fill into "
"padded steps. It can be a scalar or a tensor whose shape equals "
"to time steps in sequences. If it's a scalar, it will be "
"automatically broadcasted to the shape of time step.");
AddOutput(
"Out",
"(LoDTensor) The output vairable, which contains padded sequences.");
AddAttr<int>(
"padded_length",
"The length of padded sequences. It can be setted to -1 or "
"any positive int. When it is -1, all sequences will be padded up to "
"the length of the longest one among them; when it a certain positive "
"value, it must be greater than the length of the longest original "
"sequence.")
.SetDefault(-1);
AddComment(R"DOC(
Sequence Pad Operator
This operator pads sequences in a same batch to a consistent length.
The length is specified by attribute 'padded_length'. New elements,
whose values are specified by input 'PadValue', will be appended to
the end of each sequence, to make their final lengths consistent.
Following are cases to better explain how this works:
Case 1:
Given a 1-level LoDTensor input(X):
X.lod = [[0, 2, 5]]
X.data = [a, b, c, d, e]
and Input(PadValue):
PadValue.data = [0]
and attribite 'padded_length' = 4,
then we get LoDTensor:
Out.data = [[a, b, 0, 0],
[c, d, e, 0]]
Case 2:
Given a 1-level LoDTensor input(X):
X.lod = [[0, 2, 5]]
X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
and Input(PadValue):
PadValue.data = [0]
and attribite 'padded_length' = -1, which mean using the length
of longest input sequence(3 in this case),
then we get LoDTensor:
Out.data = [[[a1, a2], [b1, b2], [0, 0]],
[[c1, c2], [d1, d2], [e1, e2]]]
Case 3:
Given a 1-level LoDTensor input(X):
X.lod = [[0, 2, 5]]
X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
and Input(PadValue):
PadValue.data = [p1, p2]
and attribite 'padded_length' = -1, which mean using the length
of longest input sequence(3 in this case),
then we get LoDTensor:
Out.data = [[[a1, a2], [b1, b2], [p1, p2]],
[[c1, c2], [d1, d2], [e1, e2]]]
)DOC");
}
};
class SequencePadGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequencePadGradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) of SequencePadGradOp should not be null.");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp);
REGISTER_OP_CPU_KERNEL(
sequence_pad,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
sequence_pad_grad,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
sequence_pad,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
sequence_pad_grad,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using LoD = framework::LoD;
template <typename DeviceContext, typename T>
class SequencePadOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* x = ctx.Input<LoDTensor>("X");
auto* out = ctx.Output<LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto* pad_value = ctx.Input<LoDTensor>("PadValue");
int padded_length = ctx.Attr<int>("padded_length");
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *x, out, *pad_value,
padded_length, 0, false, math::kBatchLengthWidth);
}
};
template <typename DeviceContext, typename T>
class SequencePadGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
if (d_x) {
const auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
d_x->mutable_data<T>(ctx.GetPlace());
int padded_length = ctx.Attr<int>("padded_length");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *d_out, d_x,
padded_length, 0, false, math::kBatchLengthWidth);
}
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/unstack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
USE_OP(stack);
REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker,
ops::UnStackOpInferShape, ops::UnStackGradOpDescMaker);
REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp,
ops::UnStackOpGradInferShape);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class UnStackOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist.");
int axis = ctx->Attrs().Get<int>("axis");
int num = ctx->Attrs().Get<int>("num");
auto x_dim = ctx->GetInputDim("X");
int rank = x_dim.size();
PADDLE_ENFORCE(axis >= -rank && axis < rank,
"Attr(axis) must be inside [-rank, rank), where rank = %d",
rank);
if (axis < 0) axis += rank;
PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast<size_t>(num),
"Number of Outputs(Y) is wrong");
if (x_dim[axis] > 0) {
PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong");
}
auto vec = framework::vectorize2int(x_dim);
vec.erase(vec.begin() + axis);
ctx->SetOutputsDim("Y", std::vector<framework::DDim>( // NOLINT
x_dim[axis], framework::make_ddim(vec)));
}
};
class UnStackOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input of unstack op.");
AddOutput("Y", "The output of unstack op.").AsDuplicable();
AddAttr<int>("axis", "The axis along which Input(X) should be unstacked.")
.SetDefault(0);
AddAttr<int>("num", "The number of outputs(Y).").GreaterThan(0);
AddComment(R"DOC(
UnStack Operator.
UnStack Input(X) into several tensors along Attr(axis).
)DOC");
}
};
class UnStackOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto stack_grad_op = framework::OpRegistry::CreateOp(
"stack_grad", {{framework::GradVarName("Y"), {Input("X")}}},
{{framework::GradVarName("X"), Outputs("Y")}}, Attrs());
stack_grad_op->Run(scope, place);
}
};
class UnStackOpGradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0,
"Number of Inputs(Y@Grad) must be larger than 0");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad) must exist.");
auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y"));
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0],
"Dims of all Inputs(Y@Grad) must be the same");
}
int axis = ctx->Attrs().Get<int>("axis");
int rank = input_dims[0].size();
PADDLE_ENFORCE(
axis >= -(rank + 1) && axis < rank + 1,
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank);
if (axis < 0) axis += (rank + 1);
auto vec = framework::vectorize2int(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), framework::make_ddim(vec));
}
};
class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("unstack_grad");
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
class UnStackGradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto stack_op = framework::OpRegistry::CreateOp(
"stack", {{"X", Inputs(framework::GradVarName("Y"))}},
{{"Y", {Output(framework::GradVarName("X"))}}}, Attrs());
stack_op->Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
......@@ -153,17 +153,29 @@ class WarpCTCKernel : public framework::OpKernel<T> {
framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
// warpctc needs sequences data stored in transposed padding format
Tensor warpctc_logits;
LoDTensor warpctc_logits;
const size_t max_sequence_length =
math::MaximumSequenceLength(logits_lod, level);
math::MaximumSequenceLength(logits_lod[level]);
auto warpctc_logits_dims =
framework::make_ddim({static_cast<int64_t>(max_sequence_length),
static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)});
warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
LoDTensor cpu_pad_value;
T* pad_value_data =
cpu_pad_value.mutable_data<T>({1}, platform::CPUPlace());
*pad_value_data = static_cast<T>(0);
LoDTensor pad_value;
if (platform::is_cpu_place(ctx.GetPlace())) {
pad_value = cpu_pad_value;
} else {
TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value);
}
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *logits, &warpctc_logits,
false);
pad_value, -1, 0, false /* norm_by_times */, math::kLengthBatchWidth);
const T* warpctc_logits_data = warpctc_logits.data<T>();
std::vector<int> warpctc_label_lengths(num_sequences);
......@@ -209,15 +221,15 @@ template <typename DeviceContext, typename T>
class WarpCTCGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* warpctc_grad = ctx.Input<Tensor>("WarpCTCGrad");
auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
logits_grad->mutable_data<T>(ctx.GetPlace());
bool norm_by_times = ctx.Attr<bool>("norm_by_times");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), logits_grad,
*warpctc_grad, norm_by_times);
ctx.template device_context<DeviceContext>(), *warpctc_grad,
logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
const T* loss_grad_data = loss_grad->data<T>();
math::ScaleLoDTensorFunctor<DeviceContext, T>()(
......
if (NOT WIN32)
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
......@@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32)
if(WITH_GPU)
nv_library(enforce SRCS enforce.cc)
......@@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if (NOT WIN32)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
endif(NOT WIN32)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
......
......@@ -22,9 +22,13 @@ limitations under the License. */
#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/types.h>
#elif defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h>
#else
#include <unistd.h>
#endif
#endif // _WIN32
#include <algorithm>
#include "gflags/gflags.h"
......@@ -32,16 +36,20 @@ limitations under the License. */
DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
#if !defined(_WIN32)
DEFINE_uint64(initial_cpu_memory_in_mb,
#ifdef PADDLE_WITH_MKLDNN
/* Aligned with mozga-intel, MKLDNN need at least 5000 MB
* to obtain the best performance*/
5000,
5000ul,
#else
500,
500ul,
#endif
"Initial CPU memory for PaddlePaddle, in MD unit.");
#else
DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
"Initial CPU memory for PaddlePaddle, in MD unit.");
#endif // !defined(_WIN32)
DEFINE_double(
fraction_of_cuda_pinned_memory_to_use, 0.5,
......@@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() {
size_t len = sizeof(size);
if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
return 0L;
#elif defined(_WIN32)
MEMORYSTATUSEX sMeminfo;
sMeminfo.dwLength = sizeof(sMeminfo);
GlobalMemoryStatusEx(&sMeminfo);
return sMeminfo.ullTotalPhys;
#else
int64_t pages = sysconf(_SC_PHYS_PAGES);
int64_t page_size = sysconf(_SC_PAGE_SIZE);
......
......@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if !defined(_WIN32)
#include <sys/time.h>
#else
#include <windows.h>
#endif // !_WIN32
#include <time.h>
#include <chrono> // NOLINT
#include <string>
......@@ -27,12 +32,15 @@ namespace platform {
///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////
#if !defined(_WIN32)
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
#else
inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
#endif // !_WIN32
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
......
......@@ -16,7 +16,9 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
if (NOT WIN32)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
endif(NOT WIN32)
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif()
......
......@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include <dlfcn.h>
#include <memory>
#include <mutex> // NOLINT
#include <string>
......@@ -23,6 +21,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, "
......
......@@ -18,6 +18,11 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#endif
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
......@@ -117,7 +122,12 @@ struct EOFException : public std::exception {
// always forces branch prediction of true.
// This generates faster binary code. __builtin_expect is since C++11.
// For more details, please check https://stackoverflow.com/a/43870188/724872.
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// there is no equivalent intrinsics in msvc.
#define UNLIKELY(condition) (condition == 0)
#endif
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
......@@ -230,6 +240,7 @@ inline void throw_on_error(T e) {
throw_on_error(e, "");
}
#if !defined(_WIN32)
#define PADDLE_THROW(...) \
do { \
throw ::paddle::platform::EnforceNotMet( \
......@@ -248,15 +259,28 @@ inline void throw_on_error(T e) {
__FILE__, __LINE__); \
} \
} while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \
} while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__)
#endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32
// disable enforce, caused by the varardic macro exception error
#define PADDLE_THROW(x) \
do { \
throw std::make_exception_ptr( \
std::runtime_error("Windows disable the enforce.")); \
} while (false)
#define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32
/*
* Some enforce helpers here, usage:
* int a = 1;
......
......@@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
#if !defined(_WIN32)
struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
......@@ -94,6 +95,15 @@ struct RecordBlock {
std::string name_;
uint64_t start_ns_;
};
#else
// windows do not support profiler temporarily.
struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
};
struct RecordBlock {
explicit RecordBlock(int block_id) {}
};
#endif
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
......@@ -234,6 +234,7 @@ void BindVarDsec(pybind11::module *m) {
pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
.value("BOOL", pd::proto::VarType::BOOL)
.value("UINT8", pd::proto::VarType::UINT8)
.value("INT8", pd::proto::VarType::INT8)
.value("INT16", pd::proto::VarType::INT16)
.value("INT32", pd::proto::VarType::INT32)
.value("INT64", pd::proto::VarType::INT64)
......
......@@ -130,6 +130,7 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCPUTensorSetFromArray<bool>)
.def("set", PyCPUTensorSetFromArray<uint16_t>)
.def("set", PyCPUTensorSetFromArray<uint8_t>)
.def("set", PyCPUTensorSetFromArray<int8_t>)
#ifdef PADDLE_WITH_CUDA
.def("set", PyCUDATensorSetFromArray<float>)
.def("set", PyCUDATensorSetFromArray<int>)
......@@ -138,6 +139,7 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCUDATensorSetFromArray<bool>)
.def("set", PyCUDATensorSetFromArray<uint16_t>)
.def("set", PyCUDATensorSetFromArray<uint8_t>)
.def("set", PyCUDATensorSetFromArray<int8_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<float>)
.def("set", PyCUDAPinnedTensorSetFromArray<int>)
.def("set", PyCUDAPinnedTensorSetFromArray<double>)
......@@ -145,6 +147,7 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCUDAPinnedTensorSetFromArray<bool>)
.def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<int8_t>)
#endif
.def("shape", [](Tensor &self) { return vectorize(self.dims()); })
.def("_set_float_element", TensorSetElement<float>)
......
......@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
auto buffer_info =
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
uint8_t, platform::float16>()(tensor);
uint8_t, int8_t, platform::float16>()(tensor);
return buffer_info;
}
......
......@@ -335,12 +335,18 @@ function assert_api_not_changed() {
fi
python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
deactivate
}
function assert_api_spec_approvals() {
if [ -z ${BRANCH} ]; then
BRANCH="develop"
fi
API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true`
API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/API.spec" || true`
echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# TODO: curl -H 'Authorization: token ${TOKEN}'
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews | \
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then
......@@ -622,11 +628,12 @@ function main() {
cicheck)
cmake_gen ${PYTHON_ABI:-""}
build
assert_api_not_changed ${PYTHON_ABI:-""}
run_test
gen_capi_package
gen_fluid_inference_lib
test_fluid_inference_lib
assert_api_not_changed ${PYTHON_ABI:-""}
assert_api_spec_approvals
;;
*)
print_usage
......
......@@ -24,6 +24,7 @@ set and test set into paddle reader creators.
from __future__ import print_function
import numpy as np
import zipfile
import paddle.dataset.common
import re
......@@ -150,12 +151,12 @@ def __initialize_meta_info__():
def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
fn = __initialize_meta_info__()
rand = random.Random(x=rand_seed)
np.random.seed(rand_seed)
with zipfile.ZipFile(file=fn) as package:
with package.open('ml-1m/ratings.dat') as rating:
for line in rating:
line = cpt.to_text(line, encoding='latin')
if (rand.random() < test_ratio) == is_test:
if (np.random.random() < test_ratio) == is_test:
uid, mov_id, rating, _ = line.strip().split("::")
uid = int(uid)
mov_id = int(mov_id)
......
......@@ -95,6 +95,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
return core.VarDesc.VarType.INT16
elif dtype == np.uint8:
return core.VarDesc.VarType.UINT8
elif dtype == np.int8:
return core.VarDesc.VarType.INT8
else:
raise ValueError("Not supported numpy dtype %s" % dtype)
......
......@@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True):
rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
})
if sync:
helper.append_op(type="send_barrier", attrs={"endpoints": endpoints})
helper.append_op(
type="send_barrier",
inputs={"X": dummy_output},
outputs={"Out": []},
attrs={"endpoints": endpoints})
def Recv(endpoints, get_vars, dummy_input=None, sync=True):
......@@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True):
attrs={"endpoints": endpoints,
"epmap": epmap})
if sync:
helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints})
helper.append_op(
type="fetch_barrier",
outputs={"Out": get_vars},
attrs={"endpoints": endpoints})
return get_vars
......
......@@ -119,10 +119,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
helper = LayerHelper("auc", **locals())
auc_out = helper.create_tmp_variable(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches.
tp = helper.create_global_variable(persistable=True, dtype='int64')
tn = helper.create_global_variable(persistable=True, dtype='int64')
fp = helper.create_global_variable(persistable=True, dtype='int64')
fn = helper.create_global_variable(persistable=True, dtype='int64')
tp = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds])
tn = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds])
fp = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds])
fn = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds])
for var in [tp, tn, fp, fn]:
helper.set_variable_initializer(
var, Constant(
......
......@@ -17,6 +17,7 @@ All layers just related to the neural network.
from __future__ import print_function
import numpy as np
from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
......@@ -24,7 +25,6 @@ from ..param_attr import ParamAttr
from .layer_function_generator import autodoc, templatedoc
from .tensor import concat
from . import utils
import random
from .. import unique_name
from functools import reduce
......@@ -54,6 +54,7 @@ __all__ = [
'conv2d_transpose',
'conv3d_transpose',
'sequence_expand',
'sequence_pad',
'lstm_unit',
'reduce_sum',
'reduce_mean',
......@@ -89,6 +90,7 @@ __all__ = [
'lod_reset',
'lrn',
'pad',
'pad_constant_like',
'label_smooth',
'roi_pool',
'dice_loss',
......@@ -107,6 +109,7 @@ __all__ = [
'flatten',
'sequence_mask',
'stack',
'unstack',
]
......@@ -2657,6 +2660,51 @@ def sequence_expand(x, y, ref_level=-1, name=None):
return tmp
@templatedoc()
def sequence_pad(x, pad_value, maxlen=None):
"""
${comment}
Args:
x(Variable): Input variable which should contain lod information.
pad_value(Variable): The Variable that holds values that will be fill
into padded steps. It can be a scalar or a tensor whose shape
equals to time steps in sequences. If it's a scalar, it will be
automatically broadcasted to the shape of time step.
maxlen(int, default None): The length of padded sequences. It can be
None or any positive int. When it is None, all sequences will be
padded up to the length of the longest one among them; when it a
certain positive value, it must be greater than the length of the
longest original sequence."
Returns:
Variable: The padded sequence batch. All sequences has the same length.
Examples:
.. code-block:: python
import numpy
x = fluid.layers.data(name='y', shape=[10, 5],
dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=numpy.array([0]))
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
"""
helper = LayerHelper('sequence_pad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
if maxlen is None:
maxlen = -1
helper.append_op(
type='sequence_pad',
inputs={'X': x,
'PadValue': pad_value},
outputs={'Out': out},
attrs={'padded_length': maxlen})
return out
def beam_search(pre_ids,
pre_scores,
ids,
......@@ -4793,6 +4841,86 @@ def pad(x, paddings, pad_value=0., name=None):
return out
def pad_constant_like(x, y, pad_value=0., name=None):
"""
Pad input(Y) with :attr:`pad_value`, the number of values padded to
the edges of each axis is specified by the difference of the shape
of X and Y. ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n))
unique pad widths for each axis. The input should be a k-D
tensor(k > 0 and k < 7).
See below for an example.
.. code-block:: text
Given:
X = [[[[ 0, 1, 2],
[ 3, 4, 5]],
[[ 6, 7, 8],
[ 9, 10, 11]],
[[12, 13, 14],
[15, 16, 17]]],
[[[18, 19, 20],
[21, 22, 23]],
[[24, 25, 26],
[27, 28, 29]],
[[30, 31, 32],
[33, 34, 35]]]]
X.shape = (2, 3, 2, 3)
Y = [[[[35, 36, 37]],
[[38, 39, 40]],
[[41, 42, 43]]]]
Y.shape = (1, 3, 1, 3)
And
pad_value = -1,
Return:
Out = [[[[35, 36, 37],
[-1, -1, -1]],
[[38, 39, 40],
[-1, -1, -1]],
[[41, 42, 43],
[-1, -1, -1]]],
[[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]]]]
Out.shape = (2, 3, 2, 3)
Args:
x (Variable): The input tensor variable.
y (Variable): The input tensor variable.
pad_value (float): The constant value used to pad.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The padded tensor variable.
Examples:
.. code-block:: python
# x is a rank 4 tensor variable, x.shape = (2, 3, 2, 3)
# y is a rank 4 tensor variable, y.shape = (1, 3, 1, 3)
out = fluid.layers.pad_constant_like(x=x, y=y, pad_value=0.)
# out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3]
"""
helper = LayerHelper('pad_constant_like', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='pad_constant_like',
inputs={'X': x,
'Y': y},
outputs={'Out': out},
attrs={'pad_value': float(pad_value)})
return out
def label_smooth(label,
prior_dist=None,
epsilon=0.1,
......@@ -5187,7 +5315,7 @@ def random_crop(x, shape, seed=None):
dtype = x.dtype
out = helper.create_tmp_variable(dtype)
if seed is None:
seed = random.randint(-65536, 65535)
seed = np.random.randint(-65536, 65536)
op_attrs = {"shape": shape}
if isinstance(seed, int):
op_attrs["startup_seed"] = seed
......@@ -5389,7 +5517,7 @@ def crop(x, shape=None, offsets=None, name=None):
helper = LayerHelper('crop', **locals())
if not (isinstance(shape, list) or isinstance(shape, tuple) or \
isinstance(shape, Variable)):
isinstance(shape, Variable)):
raise ValueError("The shape should be a list, tuple or Variable.")
if offsets is None:
......@@ -5501,7 +5629,7 @@ def prelu(x, mode, param_attr=None, name=None):
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
......@@ -5615,23 +5743,23 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the
:code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
.. math::
y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n))
Args:
x (Variable): Input tensor of sequence_mask layer,
x (Variable): Input tensor of sequence_mask layer,
whose elements are integers less than :code:`maxlen`.
maxlen (int|None): Maximum length of the sequence. If :code:`maxlen`
is None, it would be replace with :math:`max(x)`.
dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
Returns:
Variable: The output sequence mask.
"""
helper = LayerHelper('sequence_mask', **locals())
......@@ -5656,23 +5784,23 @@ def stack(x, axis=0):
**Stack Layer**
This layer stacks all of the input :code:`x` along axis.
Input :code:`x` can be a single variable, a :code:`list` of variables,
or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or
:code:`tuple`, the shapes of all these variables must be the same.
Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`,
the shape of the output variable would be
:math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`.
Input :code:`x` can be a single variable, a :code:`list` of variables,
or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or
:code:`tuple`, the shapes of all these variables must be the same.
Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`,
the shape of the output variable would be
:math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`.
If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
If :code:`axis` is None, it would be replaced with 0.
If :code:`axis` is None, it would be replaced with 0.
Args:
x (Variable|list(Variable)|tuple(Variable)): Input variables.
x (Variable|list(Variable)|tuple(Variable)): Input variables.
axis (int|None): The axis along which all inputs are stacked.
Returns:
Variable: The stacked variable.
"""
helper = LayerHelper('stack', **locals())
......@@ -5686,3 +5814,44 @@ def stack(x, axis=0):
type='stack', inputs={'X': x}, outputs={'Y': out},
attrs={'axis': axis})
return out
def unstack(x, axis=0, num=None):
"""
**UnStack Layer**
This layer unstacks input :code:`x` into several tensors along axis.
If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`.
If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`,
and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is
raised.
Args:
x (Variable): Input variable.
axis (int): The axis along which the input is unstacked.
num (int|None): The number of output variables.
Returns:
list(Variable): The unstacked variables.
"""
helper = LayerHelper('unstack', **locals())
if num is None:
if axis is None or x.shape[axis] <= 0:
raise ValueError('unknown unstack number')
else:
num = x.shape[axis]
outs = []
for _ in num:
outs.append(helper.create_tmp_variable(x.dtype))
helper.append_op(
type='unstack',
inputs={'X': [x]},
outputs={'Y': outs},
attrs={'axis': axis,
'num': num})
return outs
......@@ -46,10 +46,12 @@ class Optimizer(object):
def __init__(self,
learning_rate,
regularization=None,
LARS_weight_decay=0.0):
LARS_weight_decay=0.0,
name=None):
if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, framework.Variable):
raise TypeError("learning rate should be float or Variable")
self._name = name
self.regularization = regularization
self._learning_rate = learning_rate
# the learning rate type should be inferenced from loss
......@@ -153,6 +155,8 @@ class Optimizer(object):
dtype: data type of the accumulator variable
fill_value: value to initialize the accumulator variable
"""
if self._name is not None:
name = self._name + "_" + name
if (name in self._accumulators and
param.name in self._accumulators[name]):
raise Exception("Accumulator {} already exists for parameter {}".
......@@ -181,6 +185,8 @@ class Optimizer(object):
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
if (name not in self._accumulators or
param.name not in self._accumulators[name]):
raise Exception("Accumulator {} does not exist for parameter {}".
......
......@@ -134,7 +134,7 @@ class SE_ResNeXt():
size=class_dim,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.2)))
initializer=fluid.initializer.Constant(value=0.05)))
return out
def shortcut(self, input, ch_out, stride):
......@@ -184,7 +184,7 @@ class SE_ResNeXt():
act=None,
# avoid pserver CPU init differs from GPU
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.2)),
initializer=fluid.initializer.Constant(value=0.05)),
bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act)
......@@ -192,13 +192,19 @@ class SE_ResNeXt():
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(input=pool,
size=num_channels // reduction_ratio,
act='relu')
squeeze = fluid.layers.fc(
input=pool,
size=num_channels // reduction_ratio,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)),
act='relu')
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid')
excitation = fluid.layers.fc(
input=squeeze,
size=num_channels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)),
act='sigmoid')
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
......
......@@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
dtype='float32',
is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(
name='shared_w', initializer=fluid.initializer.Constant()))
name='shared_w',
initializer=fluid.initializer.Constant(value=0.1)))
embed_second = fluid.layers.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(
name='shared_w', initializer=fluid.initializer.Constant()))
name='shared_w',
initializer=fluid.initializer.Constant(value=0.1)))
embed_third = fluid.layers.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(
name='shared_w', initializer=fluid.initializer.Constant()))
name='shared_w',
initializer=fluid.initializer.Constant(value=0.1)))
embed_forth = fluid.layers.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(
name='shared_w', initializer=fluid.initializer.Constant()))
name='shared_w',
initializer=fluid.initializer.Constant(value=0.1)))
concat_embed = fluid.layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth],
......@@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
size=HIDDEN_SIZE,
act='sigmoid',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant()))
initializer=fluid.initializer.Constant(value=0.1)))
predict_word = fluid.layers.fc(
input=hidden1,
size=dict_size,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant()))
initializer=fluid.initializer.Constant(value=0.1)))
cost = fluid.layers.cross_entropy(
input=predict_word, label=words[4])
avg_cost = fluid.layers.mean(cost)
......
......@@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase):
main.global_block().append_op(
type="fetch_barrier",
inputs={},
outputs={},
outputs={"Out": []},
attrs={
"endpoints": ["127.0.0.1:{0}".format(port)],
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
......
......@@ -22,7 +22,7 @@ class TestDistSeResneXt2x2(TestDistBase):
self._sync_mode = True
def test_se_resnext(self):
self.check_with_place("dist_word2vec.py", delta=1e-7)
self.check_with_place("dist_word2vec.py", delta=1e-4)
class TestDistSeResneXt2x2Async(TestDistBase):
......
......@@ -20,41 +20,50 @@ import math
from op_test import OpTest
def quantize_max_abs(x, num_bits):
range = math.pow(2, num_bits) - 1
def quantize_max_abs(x, max_range):
scale = np.max(np.abs(x).flatten())
y = np.round(x / scale * range)
y = np.round(x / scale * max_range)
return y, scale
def dequantize_max_abs(x, num_bits, scale):
range = math.pow(2, num_bits) - 1
y = (scale / range) * x
def dequantize_max_abs(x, scale, max_range):
y = (scale / max_range) * x
return y
class TestFakeDequantizeMaxAbsOp(OpTest):
def set_args(self):
self.num_bits = 8
self.max_range = math.pow(2, self.num_bits - 1) - 1
self.data_type = "float32"
def setUp(self):
self.set_args()
self.op_type = "fake_dequantize_max_abs"
x = np.random.randn(31, 65).astype("float32")
yq, scale = quantize_max_abs(x, self.num_bits)
ydq = dequantize_max_abs(yq, self.num_bits, scale)
x = np.random.randn(31, 65).astype(self.data_type)
yq, scale = quantize_max_abs(x, self.max_range)
ydq = dequantize_max_abs(yq, scale, self.max_range)
self.inputs = {'X': yq}
self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)}
self.inputs = {'X': yq, 'Scale': np.array(scale).astype(self.data_type)}
self.attrs = {'max_range': self.max_range}
self.outputs = {'Out': ydq}
def test_check_output(self):
self.check_output()
class TestFakeDequantizeMaxAbsOp5Bits(OpTest):
class TestFakeDequantizeMaxAbsOpDouble(TestFakeDequantizeMaxAbsOp):
def set_args(self):
self.num_bits = 8
self.max_range = math.pow(2, self.num_bits - 1) - 1
self.data_type = "float64"
class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp):
def set_args(self):
self.num_bits = 5
self.max_range = math.pow(2, self.num_bits - 1) - 1
self.data_type = "float32"
if __name__ == "__main__":
......
......@@ -43,13 +43,13 @@ def fusion_lstm(
act_cell, act_cand)
class TestLstmOp(OpTest):
def set_argument(self):
self.lod = [[2, 3, 2]]
class TestFusionLSTMOp(OpTest):
def set_conf(self):
pass
def setUp(self):
self.op_type = 'fusion_lstm'
self.lod = [[2, 3, 2]]
self.lod = [[2, 3, 5, 4]]
self.M = 8
self.D = 16
self.has_initial_state = False
......@@ -58,33 +58,33 @@ class TestLstmOp(OpTest):
self.act_cell = 'tanh'
self.act_cand = 'tanh'
self.use_peepholes = False
self.set_argument()
self.set_conf()
T = sum(self.lod[0])
bs = len(self.lod[0])
x = np.random.normal(size=(T, self.M)).astype('float64')
x = np.random.normal(size=(T, self.M)).astype('float32')
if self.has_initial_state:
h0 = np.random.normal(size=(bs, self.D)).astype('float64')
c0 = np.random.normal(size=(bs, self.D)).astype('float64')
h0 = np.random.normal(size=(bs, self.D)).astype('float32')
c0 = np.random.normal(size=(bs, self.D)).astype('float32')
else:
h0 = np.zeros((bs, self.D)).astype('float64')
c0 = np.zeros((bs, self.D)).astype('float64')
h0 = np.zeros((bs, self.D)).astype('float32')
c0 = np.zeros((bs, self.D)).astype('float32')
wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32')
if self.use_peepholes:
b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
b = np.random.normal(size=(1, 7 * self.D)).astype('float32')
else:
b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
b = np.random.normal(size=(1, 4 * self.D)).astype('float32')
w_b = np.copy(b[:, 0:4 * self.D])
w_c = b[:, 4 * self.D:] if self.use_peepholes else None
# this is the weight of fc
wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64')
wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32')
# this is the bias of fc
# and it should be manually added into the bias of this fusion LSTM
bx = np.random.normal(size=(1, 4 * self.D)).astype('float64')
bx = np.random.normal(size=(1, 4 * self.D)).astype('float32')
b[0, 0:4 * self.D] += bx[0, :]
h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
self.is_reverse, ACTIVATION[self.act_gate],
......@@ -114,35 +114,45 @@ class TestLstmOp(OpTest):
}
def test_check_output(self):
self.check_output(atol=1e-8)
self.check_output()
class TestLstmOpInitReverse(TestLstmOp):
def set_argument(self):
class TestFusionLSTMOpInit(TestFusionLSTMOp):
def set_conf(self):
self.has_initial_state = True
class TestFusionLSTMOpReverse(TestFusionLSTMOp):
def set_conf(self):
self.is_reverse = True
class TestFusionLSTMOpInitReverse(TestFusionLSTMOp):
def set_conf(self):
self.has_initial_state = True
self.is_reverse = True
class TestLstmOpMD1(TestLstmOp):
def set_argument(self):
class TestFusionLSTMOpMD1(TestFusionLSTMOp):
def set_conf(self):
self.M = 36
self.D = 8
class TestLstmOpMD2(TestLstmOp):
def set_argument(self):
class TestFusionLSTMOpMD2(TestFusionLSTMOp):
def set_conf(self):
self.M = 8
self.D = 8
class TestLstmOpMD3(TestLstmOp):
def set_argument(self):
class TestFusionLSTMOpMD3(TestFusionLSTMOp):
def set_conf(self):
self.M = 15
self.D = 3
class TestLstmOpBS1(TestLstmOp):
def set_argument(self):
class TestFusionLSTMOpBS1(TestFusionLSTMOp):
def set_conf(self):
self.lod = [[3]]
self.D = 16
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from test_fusion_lstm_op import fc, ACTIVATION
def fusion_seqexpand_concat_fc(xs, lod, w, b, fc_act):
T = sum(lod[0])
N = len(lod[0])
num_inputs = len(xs)
D = w.shape[1]
expanded_inputs = [xs[0]]
for i in range(num_inputs - 1):
x = xs[i + 1]
assert x.shape[0] == N
expanded = np.repeat(x, lod[0], axis=0)
assert expanded.shape[0] == T
assert expanded.shape[1] == x.shape[1]
expanded_inputs.append(expanded)
fc_input = np.concatenate(expanded_inputs, axis=1)
assert fc_input.shape[0] == T
assert fc_input.shape[1] == w.shape[0]
fc_out = fc(fc_input, w, b)
fc_out = fc_act(fc_out)
assert fc_out.shape[0] == T
assert fc_out.shape[1] == D
return fc_out
class TestFusionSeqExpandConcatFCOp(OpTest):
def set_conf(self):
pass
def setUp(self):
self.op_type = 'fusion_seqexpand_concat_fc'
self.lod = [[3, 5, 8, 2]]
self.inputs_M = [15, 10, 10]
self.D = 20
self.with_bias = True
self.fc_act = 'relu'
self.set_conf()
T = sum(self.lod[0])
bs = len(self.lod[0])
num_inputs = len(self.inputs_M)
x0 = np.random.normal(size=(T, self.inputs_M[0])).astype('float32')
xs = [x0]
for i in range(num_inputs - 1):
xi = np.random.normal(size=(bs,
self.inputs_M[i + 1])).astype('float32')
xs.append(xi)
# fc weight and bias
w = np.random.normal(size=(sum(self.inputs_M),
self.D)).astype('float32')
b = np.random.normal(size=(
1, self.D)).astype('float32') if self.with_bias else np.zeros(
(1, self.D)).astype('float32')
out = fusion_seqexpand_concat_fc(xs, self.lod, w, b,
ACTIVATION[self.fc_act])
self.inputs = {'X': [('x0', (x0, self.lod))], 'FCWeight': w}
normal_lod = [[1] * bs]
for i in range(num_inputs - 1):
self.inputs['X'].append(('x%d' % (i + 1), (xs[i + 1], normal_lod)))
if self.with_bias:
self.inputs['FCBias'] = b
self.outputs = {'Out': (out, self.lod)}
self.attrs = {'fc_activation': self.fc_act}
def test_check_output(self):
self.check_output()
class TestFusionSECFCOpNonBias(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.with_bias = False
class TestFusionSECFCOpNonAct(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.fc_act = 'identity'
class TestFusionSECFCOpMD1(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.inputs_M = [3, 4, 2, 1, 5]
self.D = 8
class TestFusionSECFCOpMD2(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[5, 6]]
self.inputs_M = [1, 1]
class TestFusionSECFCOpBS1_1(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[1]]
self.inputs_M = [3, 4, 2]
class TestFusionSECFCOpBS1_2(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[1]]
self.inputs_M = [3, 4]
class TestFusionSECFCOpBS1_3(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[5]]
self.inputs_M = [6, 3]
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestPadOp(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = "pad_constant_like"
self.inputs = {
'X': np.random.random(self.x_shape).astype("float32"),
'Y': np.random.random(self.y_shape).astype("float32")
}
self.attrs = {}
self.attrs['pad_value'] = self.pad_value
self.outputs = {
'Out': np.pad(self.inputs['Y'],
self.paddings,
mode='constant',
constant_values=self.pad_value)
}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['Y'], 'Out', max_relative_error=0.006)
def initTestCase(self):
self.x_shape = (16, 16)
self.y_shape = (3, 16)
self.pad_value = 0.1
self.paddings = [(0, 13), (0, 0)]
class TestCase1(TestPadOp):
def initTestCase(self):
self.x_shape = (4, 3, 4, 4)
self.y_shape = (2, 3, 4, 4)
self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)]
self.pad_value = 0.5
class TestCase2(TestPadOp):
def initTestCase(self):
self.x_shape = (4, 3, 4, 4)
self.y_shape = (2, 3, 2, 4)
self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)]
self.pad_value = 0.5
if __name__ == '__main__':
unittest.main()
......@@ -17,6 +17,8 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestScaleOp(OpTest):
......@@ -33,5 +35,57 @@ class TestScaleOp(OpTest):
self.check_grad(['X'], 'Out')
class TestScaleOpSelectedRows(unittest.TestCase):
def check_with_place(self, place, in_name, out_name):
scope = core.Scope()
# create and initialize Grad Variable
in_height = 10
in_rows = [0, 4, 7]
in_row_numel = 12
scale = 2.0
in_selected_rows = scope.var(in_name).get_selected_rows()
in_selected_rows.set_height(in_height)
in_selected_rows.set_rows(in_rows)
in_array = np.random.random(
(len(in_rows), in_row_numel)).astype("float32")
in_tensor = in_selected_rows.get_tensor()
in_tensor.set(in_array, place)
# create and initialize Param Variable
out_selected_rows = scope.var(out_name).get_selected_rows()
out_tensor = out_selected_rows.get_tensor()
out_tensor._set_dims(in_tensor._get_dims())
# create and run sgd operator
scale_op = Operator("scale", X=in_name, Out=out_name, scale=scale)
scale_op.run(scope, place)
# get and compare result
out_height = out_selected_rows.height()
out_rows = out_selected_rows.rows()
result_array = np.array(out_tensor)
assert (in_array * scale == result_array).all()
assert in_height == out_height
assert in_rows == out_rows
def test_scale_selected_rows(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place, 'in', 'out')
def test_scale_selected_rows_inplace(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place, 'in', 'in')
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestSequencePadOp(OpTest):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = -1
self.dtype = 'float32'
def set_data(self):
x_data = np.random.uniform(0.1, 0.5, self.x_shape).astype(self.dtype)
pad_value_data = np.array(self.pad_value).astype(self.dtype)
self.inputs = {
'X': (x_data, self.x_len_lod),
'PadValue': pad_value_data
}
self.attrs = {'padded_length': self.padded_length}
def compute(self):
# get padded length
padded_length = self.padded_length
x_len_lod_0 = self.x_len_lod[0]
if padded_length == -1:
max_seq_len = 0
for l in x_len_lod_0:
max_seq_len = max(max_seq_len, l)
padded_length = max_seq_len
# do padding
x_data = self.inputs['X'][0]
pad_value_data = self.inputs['PadValue']
if pad_value_data.shape == (1, ):
pad_value_data = np.broadcast_to(
pad_value_data, shape=x_data.shape[1:])
padded_sequences = []
start_idx = 0
for l in x_len_lod_0:
end_idx = start_idx + l
seq = x_data[start_idx:end_idx]
to_pad_len = padded_length - l
for _ in range(to_pad_len):
seq = np.append(seq, pad_value_data[np.newaxis, :], axis=0)
padded_sequences.append(seq)
start_idx = end_idx
out_data = np.array(padded_sequences)
self.outputs = {'Out': out_data}
def setUp(self):
self.op_type = 'sequence_pad'
self.set_attr()
self.set_data()
self.compute()
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestSequencePadOp2(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0, 2.0, 3.0, 4.0]
self.padded_length = -1
self.dtype = 'float32'
class TestSequencePadOp3(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = 7
self.dtype = 'float32'
class TestSequencePadOp4(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0, 2.0, 3.0, 4.0]
self.padded_length = 7
self.dtype = 'float32'
class TestSequencePadOp5(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 2, 2]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = -1
self.dtype = 'float32'
class TestSequencePadOp6(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 2, 2]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [[1.0, 2.0], [3.0, 4.0]]
self.padded_length = -1
self.dtype = 'float32'
class TestSequencePadOp7(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 2, 2]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = 7
self.dtype = 'float32'
......@@ -59,6 +59,27 @@ class TestTensor(unittest.TestCase):
self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
def test_int8_tensor(self):
scope = core.Scope()
var = scope.var("int8_tensor")
cpu_tensor = var.get_tensor()
tensor_array = numpy.random.randint(
-127, high=128, size=[100, 200], dtype=numpy.int8)
place = core.CPUPlace()
cpu_tensor.set(tensor_array, place)
cpu_tensor_array_2 = numpy.array(cpu_tensor)
self.assertAlmostEqual(cpu_tensor_array_2.all(), tensor_array.all())
if core.is_compiled_with_cuda():
cuda_tensor = var.get_tensor()
tensor_array = numpy.random.randint(
-127, high=128, size=[100, 200], dtype=numpy.int8)
place = core.CUDAPlace(0)
cuda_tensor.set(tensor_array, place)
cuda_tensor_array_2 = numpy.array(cuda_tensor)
self.assertAlmostEqual(cuda_tensor_array_2.all(),
tensor_array.all())
def test_int_lod_tensor(self):
place = core.CPUPlace()
scope = core.Scope()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from op_test import OpTest
import numpy as np
import unittest
class TestUnStackOpBase(OpTest):
def initDefaultParameters(self):
self.input_dim = (5, 6, 7)
self.axis = 0
self.dtype = 'float32'
def initParameters(self):
pass
def get_y_names(self):
y_names = []
for i in range(self.input_dim[self.axis]):
y_names.append('y{}'.format(i))
return y_names
def setUp(self):
self.initDefaultParameters()
self.initParameters()
self.op_type = 'unstack'
self.x = np.random.random(size=self.input_dim).astype(self.dtype)
outs = np.split(self.x, self.input_dim[self.axis], self.axis)
new_shape = list(self.input_dim)
del new_shape[self.axis]
y_names = self.get_y_names()
tmp = []
for i in range(self.input_dim[self.axis]):
tmp.append((y_names[i], np.reshape(outs[i], new_shape)))
self.inputs = {'X': self.x}
self.outputs = {'Y': tmp}
self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad('X', self.get_y_names())
class TestStackOp3(TestUnStackOpBase):
def initParameters(self):
self.axis = -1
class TestStackOp4(TestUnStackOpBase):
def initParameters(self):
self.axis = -3
class TestStackOp5(TestUnStackOpBase):
def initParameters(self):
self.axis = 1
class TestStackOp6(TestUnStackOpBase):
def initParameters(self):
self.axis = 2
if __name__ == '__main__':
unittest.main()
......@@ -31,7 +31,8 @@ class TestVariable(unittest.TestCase):
self.assertEqual(DT.INT16, convert("int16"))
self.assertEqual(DT.INT64, convert("int64"))
self.assertEqual(DT.BOOL, convert("bool"))
self.assertRaises(ValueError, lambda: convert("int8"))
self.assertEqual(DT.INT8, convert("int8"))
self.assertEqual(DT.UINT8, convert("uint8"))
def test_var(self):
b = default_main_program().current_block()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册