Commit 13509da6 authored by Yibing Liu

Merge upstream to branch wrap_squeezes

@@ -113,6 +113,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
@@ -148,6 +149,7 @@ paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=No
paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None))
paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
@@ -166,6 +168,7 @@ paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], vara
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -380,7 +383,7 @@ paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, a
paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool
paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None
paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
...
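The new entries above add sequence_pad, pad_constant_like, and unstack to the public layer surface. A minimal usage sketch against the ArgSpecs shown, assuming a Fluid build that includes this merge (the variable names are illustrative only):

import paddle.fluid as fluid

# sequence_pad(x, pad_value, maxlen=None): pad every sequence in the LoD
# batch up to the longest one (maxlen=None per the default listed above).
x = fluid.layers.data(name='x', shape=[10], dtype='float32', lod_level=1)
pad_value = fluid.layers.fill_constant(shape=[1], dtype='float32', value=0.0)
padded = fluid.layers.sequence_pad(x=x, pad_value=pad_value)

# unstack(x, axis=0, num=None): split along `axis`; num=None lets the op
# infer the count from the static shape of that axis.
y = fluid.layers.data(name='y', shape=[5, 6], dtype='float32')
pieces = fluid.layers.unstack(y, axis=1, num=5)

# pad_constant_like(x, y, pad_value=0.0): pad `y` with pad_value until it
# has the same shape as `x`.
a = fluid.layers.data(name='a', shape=[2, 3], dtype='float32')
b = fluid.layers.data(name='b', shape=[1, 3], dtype='float32')
b_like_a = fluid.layers.pad_constant_like(x=a, y=b, pad_value=0.0)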
@@ -64,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() {
  RegType(size_t, proto::VarType::SIZE_T);
  RegType(int16_t, proto::VarType::INT16);
  RegType(uint8_t, proto::VarType::UINT8);
  RegType(int8_t, proto::VarType::INT8);
#undef RegType
  return retv;
...
@@ -54,6 +54,9 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
    case proto::VarType::INT16:
      visitor.template operator()<int16_t>();
      break;
    case proto::VarType::INT8:
      visitor.template operator()<int8_t>();
      break;
    default:
      PADDLE_THROW("Not supported %d", type);
  }
...
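Together with the new numpy.ndarray[int8] overloads of paddle.fluid.LoDTensor.set listed earlier, this registration makes int8 data usable from Python; a minimal sketch, assuming a build that includes this change:

import numpy as np
import paddle.fluid as fluid

t = fluid.LoDTensor()
# Before this change only uint8 and the wider integer types were accepted;
# the new pybind overload takes int8 arrays directly.
t.set(np.array([[1, -2], [3, -4]], dtype=np.int8), fluid.CPUPlace())
print(t.shape())  # [2, 2]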
@@ -754,17 +754,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                 node->Op()->Type());
  CreateComputationalOp(result, node, op_dev_id);
  if (node->Op()->Type() == "concat") {
    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
              "fetch_barrier");
  }
}

void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
  for (ir::Node *input : node->inputs) {
    VarHandle *var = nullptr;
    for (int place_offset = 0; place_offset < num_places; ++place_offset) {
      auto &var_holders = result->Get<GraphVars>(kGraphVars)[place_offset];
      auto &var_holder = var_holders[input->Name()];
      if (!var_holder.empty()) {
        var = var_holder.rbegin()->get();
        op_handle->AddInput(var);
      }
    }
  }
}

// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
                                          ir::Node *node) const {
  // FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode
  // put them into transpiler.
  int op_dev_id = -1;
  if (node->Op()->Type() == "send") {
    // TODO(paddle-dev): getting the first var is not safe.
@@ -799,8 +808,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
    }
    auto recv_param_grad = boost::get<std::vector<std::string>>(
        node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
    // FIXME(typhoonzero): assume each recv op output one param
    // Use the same place as send.
    if (recv_param_grad.size() == 2U) {
      op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
      VLOG(10) << "recv param " << recv_param_grad[0]
@@ -814,34 +821,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
          .emplace(varname, op_dev_id);
    }
  } else {
    // send_barrier, fetch_barrier will run on place 0;
    op_dev_id = 0;
  }

  PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
                 node->Op()->Type());

  result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
      result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
      node->Op()->Type(), places_[op_dev_id]));

  // TODO(panyx0718): This might not be needed anymore.
  if (node->Op()->Type() == "send_barrier") {
    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "send");
  } else if (node->Op()->Type() == "recv") {
    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
              "send_barrier");
  } else if (node->Op()->Type() == "fetch_barrier") {
    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "recv");
  } else if (node->Op()->Type() == "send") {
    // do nothing
  } else {
    PADDLE_THROW(
        "rpc op should be in ["
        "send, send_barrier. recv, fetch_barrier]");
  }
  CreateOpHandleIOs(result, node, op_dev_id);

  if (node->Op()->Type() == "send") {
    CreateOpHandleIOs(result, node, op_dev_id);
  } else {
    // send_barrier, recv, fetch_barrier's inputs are deps var, get them from
    // all places
    auto p = places_[op_dev_id];
    auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
    op_handle->SetDeviceContext(p,
                                platform::DeviceContextPool::Instance().Get(p));

    SetOpInputsAllPlaces(result, node, places_.size());
    for (ir::Node *output : node->outputs) {
      int outvar_dev_id = op_dev_id;
      if (node->Op()->Type() == "fetch_barrier") {
        outvar_dev_id = GetVarDeviceID(*result, output->Name());
        PADDLE_ENFORCE_NE(outvar_dev_id, -1);
      }
      p = places_[outvar_dev_id];
      ir::Node *new_node = nullptr;
      if (output->Var()) {
        new_node = result->CreateVarNode(output->Var());
      } else {
        new_node =
            result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
      }
      CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
    }
  }
}

bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
...
@@ -107,6 +107,7 @@ message VarType {
    // Tensor<size_t> is used in C++.
    SIZE_T = 19;
    UINT8 = 20;
    INT8 = 21;

    // Other types that may need additional descriptions
    LOD_TENSOR = 7;
...
@@ -132,63 +132,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
      }
    }
std::vector<ir::Node *> send_ops;
ir::Node *send_bar = nullptr;
std::vector<ir::Node *> recv_ops;
ir::Node *fetch_bar = nullptr;
for (ir::Node *node : Nodes()) {
if (node->Name() == "send") {
send_ops.push_back(node);
} else if (node->Name() == "send_barrier") {
PADDLE_ENFORCE(!send_bar, "only has one send barrier");
send_bar = node;
} else if (node->Name() == "recv") {
recv_ops.push_back(node);
} else if (node->Name() == "fetch_barrier") {
PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier");
fetch_bar = node;
}
}
if (send_bar) {
for (ir::Node *send : send_ops) {
ir::Node *dep_var = CreateControlDepVar();
send->outputs.push_back(dep_var);
dep_var->inputs.push_back(send);
send_bar->inputs.push_back(dep_var);
dep_var->outputs.push_back(send_bar);
}
for (ir::Node *recv : recv_ops) {
ir::Node *dep_var = CreateControlDepVar();
recv->inputs.push_back(dep_var);
dep_var->outputs.push_back(recv);
send_bar->outputs.push_back(dep_var);
dep_var->inputs.push_back(send_bar);
}
}
if (fetch_bar) {
for (ir::Node *recv : recv_ops) {
ir::Node *dep_var = CreateControlDepVar();
recv->outputs.push_back(dep_var);
dep_var->inputs.push_back(recv);
fetch_bar->inputs.push_back(dep_var);
dep_var->outputs.push_back(fetch_bar);
}
}
std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
for (ir::Node *node : Nodes()) {
if (IsDistTrainOp(node, send_vars, recv_vars)) {
if (fetch_bar && node->Name() == "concat") {
ir::Node *dep_var = CreateControlDepVar();
fetch_bar->outputs.push_back(dep_var);
dep_var->inputs.push_back(fetch_bar);
node->inputs.push_back(dep_var);
dep_var->outputs.push_back(node);
}
}
}
  /**
   * We should handle write after read(WAR) and write after write(WAW) here.
   * Because some of the operators of the program can be executed in parallel.
...
@@ -40,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
                 "When calling this method, the Tensor's numel must be "
                 "equal or larger than zero. "
                 "Please check Tensor::Resize has been called first.");
  size_t size = requested_size ? requested_size : numel() * SizeOfType(type);
  size_t size = numel() * SizeOfType(type);
  if (requested_size) {
    PADDLE_ENFORCE_GE(requested_size, size);
    size = requested_size;
  }
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
...
@@ -72,7 +72,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
    auto trt_teller = [&](const Node* node) {
      std::unordered_set<std::string> teller_set(
          {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
           "depthwise_conv2d", "batch_norm"});
           "depthwise_conv2d", "batch_norm", "concat"});
      if (!node->IsFunction()) return false;
      const auto* func = static_cast<const Function*>(node);
...
@@ -32,6 +32,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
      : NativePaddlePredictor(config), config_(config) {}

  bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
    FLAGS_IA_enable_tensorrt_subgraph_engine = true;
    VLOG(3) << "Predictor::init()";
    FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
    FLAGS_tensorrt_workspace_size = config_.workspace_size;
@@ -161,3 +162,4 @@ USE_TRT_CONVERTER(fc);
USE_TRT_CONVERTER(pool2d);
USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER(batch_norm);
USE_TRT_CONVERTER(concat);
@@ -37,6 +37,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
  config1.use_gpu = true;
  config1.fraction_of_gpu_memory = 0.3;
  config1.device = 0;
  config1.max_batch_size = 10;

  auto predictor0 =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
...
@@ -23,9 +23,11 @@ include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
if (NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
endif(NOT WIN32)

include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3")
...
# Add TRT tests
nv_library(tensorrt_converter
  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
  batch_norm_op.cc activation_op.cc softmax_op.cc
  batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc
  DEPS tensorrt_engine operator scope framework_proto op_registry)

nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -18,12 +18,12 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
  DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
  DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
  DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
  DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
  DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
 * ConcatOp, IConcatenationLayer in TRT. This layer doesn't have weights.
 */
class ConcatOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a fluid concat op to tensorrt concat layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
std::vector<nvinfer1::ITensor*> itensors;
for (auto& input_name : op_desc.Input("X")) {
itensors.push_back(engine_->GetITensor(input_name));
}
int axis = boost::get<int>(op_desc.GetAttr("axis"));
PADDLE_ENFORCE(axis > 0,
"The axis attr of Concat op should be larger than 0 for trt");
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
itensors.size());
axis = axis - 1; // Remove batch dim
layer->setAxis(axis);
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter);
@@ -79,6 +79,14 @@ class OpConverter {
        it =
            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
      }
      PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
                              op_desc.Type());
    }
    if (op_desc.Type() == "depthwise_conv2d") {
      it = Registry<OpConverter>::Lookup("conv2d");
      PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
                              op_desc.Type());
    }
    if (!it) {
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(concat_op, test) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1));
validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1));
validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1));
validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("concat");
desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
desc.SetOutput("Out", {"concat_out"});
int axis = 1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(5);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(concat);
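The shapes in this test line up because TensorRT tensors exclude the batch dimension: fluid's axis = 1 becomes TRT axis 0 after the converter's axis - 1 shift. A plain numpy check of the same concatenation (batch size 5, as in validator.Execute(5)):

import numpy as np

x1 = np.zeros((5, 10, 3, 1), dtype=np.float32)  # batch=5, CHW=(10, 3, 1)
x2 = np.zeros((5, 3, 3, 1), dtype=np.float32)
x3 = np.zeros((5, 7, 3, 1), dtype=np.float32)
out = np.concatenate([x1, x2, x3], axis=1)       # fluid axis 1 -> TRT axis 0
assert out.shape == (5, 20, 3, 1)                # matches DimsCHW(20, 3, 1)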
@@ -11,12 +11,18 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#define GLOG_NO_ABBREVIATED_SEVERITIES

#include "paddle/fluid/memory/detail/system_allocator.h"

#ifdef _WIN32
#include <malloc.h>
#include <windows.h>   // VirtualLock/VirtualUnlock
#else
#include <sys/mman.h>  // for mlock and munlock
#endif
#include <stdlib.h>    // for malloc and free
#include <algorithm>   // for std::max

#include "gflags/gflags.h"
#include "paddle/fluid/platform/assert.h"
@@ -35,31 +41,42 @@ namespace paddle {
namespace memory {
namespace detail {

void* AlignedMalloc(size_t size) {
  void* p = nullptr;
  size_t alignment = 32ul;
#ifdef PADDLE_WITH_MKLDNN
  // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
  // memory alignment
  alignment = 4096ul;
#endif
#ifdef _WIN32
  p = _aligned_malloc(size, alignment);
#else
  PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!",
                    size);
#endif
  PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
  return p;
}

void* CPUAllocator::Alloc(size_t* index, size_t size) {
  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
  // malloc might not return nullptr if size is zero, but the returned
  // pointer shall not be dereferenced -- so we make it nullptr.
  if (size <= 0) return nullptr;
  *index = 0;  // unlock memory

  void* p = AlignedMalloc(size);

  if (p != nullptr) {
    if (FLAGS_use_pinned_memory) {
      *index = 1;
#ifdef _WIN32
      VirtualLock(p, size);
#else
      mlock(p, size);  // lock memory
#endif
    }
  }
@@ -68,7 +85,11 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
void CPUAllocator::Free(void* p, size_t size, size_t index) {
  if (p != nullptr && index == 1) {
#ifdef _WIN32
    VirtualUnlock(p, size);
#else
    munlock(p, size);
#endif
  }
  free(p);
}
...
@@ -291,6 +291,8 @@ op_library(unsqueeze_op DEPS reshape_op)
op_library(squeeze_op DEPS reshape_op)
op_library(extract_rows_op DEPS memory)
op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
...
@@ -60,6 +60,20 @@ class AucKernel : public framework::OpKernel<T> {
    const T* inference_data = predict->data<T>();
    const auto* label_data = label->data<int64_t>();

    // check if states are inited.
    auto* tp_in = ctx.Input<Tensor>("TP");
    auto* fp_in = ctx.Input<Tensor>("FP");
    auto* tn_in = ctx.Input<Tensor>("TN");
    auto* fn_in = ctx.Input<Tensor>("FN");
    PADDLE_ENFORCE(tp_in->IsInitialized(), "true_positive is not inited!");
    PADDLE_ENFORCE(fp_in->IsInitialized(), "false_positive is not inited!");
    PADDLE_ENFORCE(tn_in->IsInitialized(), "true_negative is not inited!");
    PADDLE_ENFORCE(fn_in->IsInitialized(), "false_negative is not inited!");
    PADDLE_ENFORCE_EQ(tp_in->numel(), num_thresholds, "");
    PADDLE_ENFORCE_EQ(fp_in->numel(), num_thresholds, "");
    PADDLE_ENFORCE_EQ(tn_in->numel(), num_thresholds, "");
    PADDLE_ENFORCE_EQ(fn_in->numel(), num_thresholds, "");

    auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
    auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
    auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
...
@@ -18,15 +18,32 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct DequantizeFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
auto in_e = framework::EigenVector<T>::Flatten(*in);
const T* scale_factor = scale->data<T>();
auto out_e = framework::EigenVector<T>::Flatten(*out);
auto& dev = *dev_ctx.eigen_device();
out_e.device(dev) = (scale_factor[0] / max_range) * in_e;
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
 public:
  FakeDequantizeMaxAbsOp(const std::string& type,
                         const framework::VariableNameMap& inputs,
                         const framework::VariableNameMap& outputs,
                         const framework::AttributeMap& attrs)
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of FakeDequantizeMaxAbsOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -42,21 +59,17 @@ class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X",
             "(Tensor) The input with float-32/64 type is the "
             "low precision tensor.");
    AddInput("Scale", "(float) The scale in quantization stage.");
    AddOutput("Out",
              "(Tensor) The output is the dequantized high "
              "precision tensor.");
    AddAttr<int>("num_bits",
                 "(int) `num_bits` is the quantization level bits, "
                 "such as 2, 5, 8.");
    AddAttr<float>("scale",
                   "(float) The maximum absolute value of low precision tensor."
                   "It is usually calculated by the fake_quantize_max_abs_op.");
    AddAttr<float>("max_range", "(float) The max range in quantization stage.");
    AddComment(R"DOC(
FakeDequantizeMaxAbsOp operator.

This calculation is an opposite operation of FakeQuantizeMaxAbsOp:

$$Out = \frac{scale*X}{2^{num\_bits} - 1}$$
$$Out = \frac{scale*X}{max\_range}$$

)DOC");
  }
...
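A small numeric sketch of the reworked dequantization (plain numpy; the value of max_range is the caller's choice here, e.g. the old fixed denominator 2^num_bits - 1 for a single quantization level):

import numpy as np

def fake_dequantize_max_abs(x, scale, max_range):
    # Inverse of max-abs quantization: scale integer-valued x back to floats.
    return (scale / max_range) * x

num_bits = 8
max_range = (1 << num_bits) - 1          # 255, the old 2^num_bits - 1
scale = 6.4                              # max |value| seen at quantization time
q = np.array([255.0, -128.0, 0.0], dtype=np.float32)
print(fake_dequantize_max_abs(q, scale, max_range))  # ~[6.4, -3.2125, 0.0]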
@@ -14,6 +14,42 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num,
T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) {
out[idx] = in[idx] * scale[0] / max_range;
}
}
template <typename T>
struct DequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
const T* in_data = in->data<T>();
const T* scale_factor = scale->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = in->numel();
int block = 512;
int grid = (num + block - 1) / block;
KeDequantize<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, out_data);
}
};
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
...
@@ -19,22 +19,29 @@ limitations under the License. */
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
struct DequantizeFunctor {
  void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
                  const framework::Tensor* scale, T max_range,
                  framework::Tensor* out);
};

template <typename DeviceContext, typename T>
class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
 public:
  virtual void Compute(const framework::ExecutionContext& ctx) const {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* scale = ctx.Input<framework::Tensor>("Scale");
    auto* out = ctx.Output<framework::Tensor>("Out");

    out->mutable_data<T>(in->place());
    int num_bits = ctx.Attr<int>("num_bits");
    T scale = static_cast<T>(ctx.Attr<float>("scale"));
    int range = std::pow(2, num_bits) - 1;
    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
    eigen_out.device(dev) = (scale / range) * eigen_in;

    float max_range = ctx.Attr<float>("max_range");
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    out->mutable_data<T>(dev_ctx.GetPlace());
    DequantizeFunctor<DeviceContext, T>()(dev_ctx, in, scale,
                                          static_cast<T>(max_range), out);
  }
};
...
@@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase {
class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
        .AsDuplicable();
    AddComment(R"DOC(
FetchBarrier operator
...
@@ -15,10 +15,14 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"

DEFINE_bool(seq_mode, true, "Use sequence mode");

namespace paddle {
namespace operators {
@@ -98,7 +102,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
  ctx->ShareLoD("X", "Hidden");
  ctx->ShareLoD("X", "Cell");

  int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
  int xx_width;
  if (FLAGS_seq_mode) {
    xx_width = wx_dims[1];
  } else {
    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
  }
  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
  ctx->ShareLoD("X", "XX");
}
@@ -205,10 +214,138 @@ inline void ReorderInitState(const DeviceContext& ctx,
  row_shuffle(ctx, src, index_lod, dst, indexed_src);
}

template <typename DeviceContext, typename T>
template <typename T>
class FuisonLSTMKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
  void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* h0 = ctx.Input<Tensor>("H0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
auto* bias = ctx.Input<Tensor>("Bias");
auto* xx = ctx.Output<LoDTensor>("XX");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
auto* cell_out = ctx.Output<LoDTensor>("Cell");
bool is_reverse = ctx.Attr<bool>("is_reverse");
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
}
auto x_lod = x->lod();
auto x_dims = x->dims(); // T x M
auto wh_dims = wh->dims(); // D x 4D
const int total_T = x_dims[0];
const int N = x_lod[0].size() - 1; // batch size
const int M = x_dims[1]; // x frame size
const int D = wh_dims[0];
const int D2 = D * 2;
const int D3 = D * 3;
const int D4 = wh_dims[1];
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : NULL;
const T* c0_data = c0 ? c0->data<T>() : NULL;
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
xx_data, bias->data<T>());
int xx_offset = D4;
int gate_offset = D;
if (is_reverse) {
const int offset = (total_T - 1) * D;
xx_data = xx_data + offset * 4;
hidden_out_data = hidden_out_data + offset;
cell_out_data = cell_out_data + offset;
xx_offset = -D4;
gate_offset = -D;
}
auto move_step = [&]() {
xx_data = xx_data + xx_offset;
hidden_out_data = hidden_out_data + gate_offset;
cell_out_data = cell_out_data + gate_offset;
};
for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i;
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
const T* prev_cell_data = NULL;
const T* prev_hidden_data = NULL;
int tstart = 0;
if (h0_data) {
prev_hidden_data = h0_data + bid * D;
prev_cell_data = c0_data + bid * D;
} else {
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// cell out= input*tilde
blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
tstart = 1;
move_step();
}
for (int step = tstart; step < seq_len; ++step) {
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D4, static_cast<T>(1), xx_data,
D4);
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// a = forget * prev_cell
blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2);
// b = input * tilde
blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
// cell out= a+b
blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
move_step();
}
}
}
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X"); auto* x = ctx.Input<LoDTensor>("X");
auto* wx = ctx.Input<Tensor>("WeightX"); auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH"); auto* wh = ctx.Input<Tensor>("WeightH");
...@@ -339,6 +476,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -339,6 +476,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
// restore the output cell state in LoDTensor from the batch cell // restore the output cell state in LoDTensor from the batch cell
to_seq(dev_ctx, batch_cell, cell_out); to_seq(dev_ctx, batch_cell, cell_out);
} }
void Compute(const framework::ExecutionContext& ctx) const override {
if (FLAGS_seq_mode) {
SeqCompute(ctx);
} else {
BatchCompute(ctx);
}
}
};

}  // namespace operators
@@ -348,7 +492,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);

REGISTER_OP_CPU_KERNEL(
    fusion_lstm,
    ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, float>,
    ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel<float>,
                       ops::FuisonLSTMKernel<double>);
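For reference, SeqCompute above lays the gate pre-activations out in the XX buffer as $[\tilde{c}, i, f, o]$ blocks of width $D$ and walks each sequence step by step; the recurrence the inner loop implements is the standard LSTM cell (a sketch; at the first step without H0/C0 the forget term is dropped, exactly as in the code):

$$
\begin{aligned}
[\tilde{c}_t, i_t, f_t, o_t] &= x_t W_x + h_{t-1} W_h + b \\
\tilde{c}_t &\leftarrow \mathrm{act\_cand}(\tilde{c}_t), \qquad i_t, f_t, o_t \leftarrow \mathrm{act\_gate}(i_t, f_t, o_t) \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t \\
h_t &= o_t \odot \mathrm{act\_cell}(c_t)
\end{aligned}
$$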
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
void FusionSeqExpandConcatFCOp::InferShape(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_GT(
ctx->Inputs("X").size(), 1UL,
"Inputs(X) of FusionSeqExpandConcatFCOp should be larger than 1.");
PADDLE_ENFORCE(
ctx->HasInput("FCWeight"),
"Input(FCWeight) of FusionSeqExpandConcatFCOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FusionSeqExpandConcatFCOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("FCOut"),
"Output(FCOut) of FusionSeqExpandConcatFCOp should not be null.");
auto ins_dims = ctx->GetInputsDim("X");
auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2.");
const int D = w_dims[1];
int sum = ins_dims[0][1];
for (size_t i = 1; i < ins_dims.size(); ++i) {
sum += ins_dims[i][1];
}
PADDLE_ENFORCE_EQ(sum, w_dims[0],
"FC height should be sum of all inputs width.");
if (ctx->HasInput("FCBias")) {
auto b_dims = ctx->GetInputDim("FCBias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(FCBias)'s rank must be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1, "FCBias shapes must be 1 * %d.", D);
PADDLE_ENFORCE_EQ(b_dims[1], D, "FCBias shapes must be 1 * %d.", D);
}
ctx->SetOutputDim("Out", {ins_dims[0][0], D});
// fc_out should be reshaped at run time, since the lod is not available in
// InferShape; explicitly share the ref lod.
ctx->ShareLoD("X", "Out", 0);
}
framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::ToDataType(ctx.MultiInput<LoDTensor>("X")[0]->type()),
ctx.device_context());
}
void FusionSeqExpandConcatFCOpMaker::Make() {
AddInput("X",
"(LoDTensor) input LodDTensors, the first one must be have ref lod "
"for sequence expand, and the rest input should have same lod.")
.AsDuplicable();
AddInput("FCWeight", "(Tensor) the weights of fc.");
AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable();
AddOutput("Out", "(LoDTensor) Output LodTensor.");
AddOutput(
"FCOut",
"(Tensor) the intermediate tensor to keep the result of fc."
"Shape is (N x D), where N is the batch size, D is the output dim of fc")
.AsIntermediate();
AddAttr<std::string>("fc_activation",
"(string, default: identity)"
"The activation for the result of fc."
"`identity` by default.")
.SetDefault("identity")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddComment(R"DOC(
Fusion Sequence expand + concat + fc Operator.
All conditions below should be met:
The ref_level of seq_expand should be 0.
The ref lod of seq_expand level is the first input of concat.
The other inputs should have same lod and same batch size of ref lod.
The seq len of other inputs should be 1.
The concat axis should be 1.
)DOC");
}
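Stated as a formula (notation mine, inferred from the kernel that follows): with the FC weight split row-wise into blocks $W_k$ of heights $M_k$, the fused op computes, for timestep $t$ inside sequence $i$,

$$\mathrm{Out}_t = \mathrm{fc\_act}\Big(x^{(0)}_t W_0 + \sum_{k \ge 1} x^{(k)}_i W_k + b\Big)$$

i.e. the first (ref) input contributes one row per timestep, while every other input contributes one row per sequence that is implicitly sequence-expanded over that sequence's steps before the concat + fc.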
template <typename T>
class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto ins = ctx.MultiInput<LoDTensor>("X");
auto* w = ctx.Input<Tensor>("FCWeight");
auto* b = ctx.Input<Tensor>("FCBias");
auto* out = ctx.Output<LoDTensor>("Out");
auto* fc_out = ctx.Output<Tensor>("FCOut");
auto* ref_in = ins[0];
auto ref_lod = ref_in->lod();
auto in1_lod = ins[1]->lod();
auto ref_dims = ref_in->dims(); // T x M0
auto in1_dims = ins[1]->dims(); // N x M1
auto w_dims = w->dims();
const int N = ref_lod[0].size() - 1;
const int total_T = ref_dims[0];
const int M0 = ref_dims[1];
const int M1 = in1_dims[1];
const int D = w_dims[1];
// Some checks, and the resize of fc_out, have to happen here,
// since InferShape can not get the LoD info.
PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only an input LoD size of 1 is supported.");
PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only an input LoD size of 1 is supported.");
PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N,
"The batch size of all inputs should be equal.");
PADDLE_ENFORCE_EQ(in1_lod[0][N], N,
"The sequence length of the other inputs should be 1.");
PADDLE_ENFORCE_EQ(in1_dims[0], N, "The input height should equal the batch size.");
for (size_t i = 2; i < ins.size(); ++i) {
PADDLE_ENFORCE_EQ(ins[i]->dims()[0], N,
"The height of all other inputs should be equal.");
PADDLE_ENFORCE_EQ(ins[i]->lod(), in1_lod,
"All other inputs should have the same LoD.");
}
fc_out->Resize({N, D});
std::function<void(const int, const T*, T*)> fc_act;
auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
fc_act = act_functor(fc_act_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
fc_act = act_functor(fc_act_str);
}
const T* ref_in_data = ref_in->data<T>();
const T* in1_data = ins[1]->data<T>();
const T* w_data = w->data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D, M0, ref_in_data, w_data,
out_data, b ? b->data<T>() : nullptr);
w_data = w_data + M0 * D;
// first write: fc_out = in1 * W1
blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data);
w_data = w_data + M1 * D;
for (size_t i = 2; i < ins.size(); ++i) {
// accumulate: fc_out += ins[i] * Wi
const T* in_data = ins[i]->data<T>();
const int K = ins[i]->dims()[1];
blas.GEMM(CblasNoTrans, CblasNoTrans, N, D, K, static_cast<T>(1), in_data,
K, w_data, D, static_cast<T>(1), fc_out_data, D);
w_data = w_data + K * D;
}
T* cur_out_data = out_data;
for (int i = 0; i < N; ++i) {
int seq_len = ref_lod[0][i + 1] - ref_lod[0][i];
T* src = fc_out_data + i * D;
for (int step = 0; step < seq_len; ++step) {
blas.VADD(D, cur_out_data, src, cur_out_data);
cur_out_data = cur_out_data + D;
}
}
fc_act(total_T * D, out_data, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp,
ops::FusionSeqExpandConcatFCOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc,
ops::FusionSeqExpandConcatFCOpKernel<float>,
ops::FusionSeqExpandConcatFCOpKernel<double>);
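For orientation, the fused kernel above computes in one pass what the unfused seq_expand -> concat -> fc subgraph would compute. Below is a minimal single-threaded reference of the same math for two inputs: a sketch over raw buffers, not part of the operator, with all names hypothetical, and the activation omitted.

#include <cstddef>
#include <vector>

// out[t] = x0[t] * W0 + x1[seq(t)] * W1 + bias, where seq(t) is the sequence
// that time step t belongs to according to the ref lod offsets.
void FusionSeqExpandConcatFCRef(const float* x0,    // total_T x M0
                                const float* x1,    // N x M1 (one step per seq)
                                const float* w,     // (M0 + M1) x D, row-major
                                const float* bias,  // 1 x D, may be null
                                const std::vector<size_t>& lod,  // ref offsets
                                size_t M0, size_t M1, size_t D, float* out) {
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    for (size_t t = lod[i]; t < lod[i + 1]; ++t) {  // every step of sequence i
      for (size_t d = 0; d < D; ++d) {
        float v = bias ? bias[d] : 0.f;
        for (size_t k = 0; k < M0; ++k) v += x0[t * M0 + k] * w[k * D + d];
        for (size_t k = 0; k < M1; ++k)
          v += x1[i * M1 + k] * w[(M0 + k) * D + d];  // x1 expanded per step
        out[t * D + d] = v;
      }
    }
  }
}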
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusionSeqExpandConcatFCOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
...@@ -177,6 +177,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
          out_row, out_col, output->data<T>());
    }
    // Wait() must be called because `inputs_data` may be destructed before
    // the kernel ends
    context.Wait();
  }
};
...@@ -252,6 +255,9 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
          input.data<T>(), in_row, in_col, dev_outs_col_data,
          static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
    }
    // Wait() must be called because `outputs_data` may be destructed before
    // the kernel ends
    context.Wait();
  }
};
......
...@@ -15,6 +15,7 @@ limitations under the License. */
#include <sys/time.h>
#include <cmath>
#include <cstring>
#include <random>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
......
...@@ -41,7 +41,8 @@ template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;    \
  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
  template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;

DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
......
...@@ -33,10 +33,11 @@ template struct SetConstant<platform::CUDADeviceContext, int>;
template struct SetConstant<platform::CUDADeviceContext, int64_t>;
template struct SetConstant<platform::CUDADeviceContext, bool>;

#define DEFINE_GPU_TRANS(RANK)                                            \
  template struct Transpose<platform::CUDADeviceContext, float, RANK>;   \
  template struct Transpose<platform::CUDADeviceContext, double, RANK>;  \
  template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;

DEFINE_GPU_TRANS(1);
DEFINE_GPU_TRANS(2);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D>
void PadFunction(const framework::ExecutionContext& context,
const std::vector<int>& pads, const framework::Tensor& src,
T pad_value, framework::Tensor* out) {
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = pads[i * 2];
paddings[i].second = pads[i * 2 + 1];
}
auto src_tensor = EigenTensor<T, D>::From(src);
auto out_tensor = EigenTensor<T, D>::From(*out);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
out_tensor.device(place) = src_tensor.pad(paddings, pad_value);
}
template <typename DeviceContext, typename T, size_t D>
void PadGradFunction(const framework::ExecutionContext& context,
const std::vector<int>& pads, const framework::Tensor& src,
framework::Tensor* d_out) {
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = -pads[i * 2];
paddings[i].second = -pads[i * 2 + 1];
}
auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
auto src_tensor = EigenTensor<T, D>::From(src);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
d_out_tensor.device(place) = src_tensor.pad(paddings, 0);
}
template <typename DeviceContext, typename T>
void PaddingFunctor(int rank, const framework::ExecutionContext& context,
const std::vector<int>& pads, T pad_value,
const framework::Tensor& src, framework::Tensor* out) {
switch (rank) {
case 1:
PadFunction<DeviceContext, T, 1>(context, pads, src, pad_value, out);
break;
case 2:
PadFunction<DeviceContext, T, 2>(context, pads, src, pad_value, out);
break;
case 3:
PadFunction<DeviceContext, T, 3>(context, pads, src, pad_value, out);
break;
case 4:
PadFunction<DeviceContext, T, 4>(context, pads, src, pad_value, out);
break;
case 5:
PadFunction<DeviceContext, T, 5>(context, pads, src, pad_value, out);
break;
case 6:
PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
break;
default:
PADDLE_THROW(
"PadOp only supports tensors with no more than 6 dimensions.");
}
}
template <typename DeviceContext, typename T>
void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
const std::vector<int>& pads,
const framework::Tensor& src, framework::Tensor* out) {
switch (rank) {
case 1:
PadGradFunction<DeviceContext, T, 1>(context, pads, src, out);
break;
case 2:
PadGradFunction<DeviceContext, T, 2>(context, pads, src, out);
break;
case 3:
PadGradFunction<DeviceContext, T, 3>(context, pads, src, out);
break;
case 4:
PadGradFunction<DeviceContext, T, 4>(context, pads, src, out);
break;
case 5:
PadGradFunction<DeviceContext, T, 5>(context, pads, src, out);
break;
case 6:
PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
break;
default:
PADDLE_THROW(
"PadOp only supports tensors with no more than 6 dimensions.");
}
}
} // namespace math
} // namespace operators
} // namespace paddle
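The pads vector consumed by PaddingFunctor packs one (before, after) pair per dimension: pads[2*i] pads the low side and pads[2*i+1] the high side of dimension i. A hedged usage sketch follows; the execution context `ctx` and the tensors `src` and `out` are assumed to exist and are illustrative, not from the source.

// Pad a rank-2 tensor of shape (2, 3) up to (4, 6) with zeros:
// dim 0 gets 1 row before and 1 after; dim 1 gets 0 columns before, 3 after.
std::vector<int> pads = {1, 1, 0, 3};
// paddle::operators::math::PaddingFunctor<platform::CPUDeviceContext, float>(
//     /*rank=*/2, ctx, pads, /*pad_value=*/0.0f, src, &out);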
...@@ -18,65 +18,86 @@ namespace paddle {
namespace operators {
namespace math {

template <typename T>
void CopyValidData(framework::Tensor* dst_tensor,
                   const framework::Tensor* src_tensor,
                   const framework::Vector<size_t>& seq_offsets,
                   int pad_seq_len, int step_width, bool norm_by_len,
                   CopyType type, PadLayout layout) {
  int seq_num = seq_offsets.size() - 1;
  const T* src_data = src_tensor->data<T>();
  T* dst_data = dst_tensor->data<T>();

  int seq_cpy_gap = step_width;
  int pad_cpy_gap =
      layout == kBatchLengthWidth ? step_width : seq_num * step_width;
  for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) {
    int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
    PADDLE_ENFORCE_GE(
        pad_seq_len, valid_seq_len,
        "The padded sequence length can not be less than its original length.");
    int seq_data_offset = seq_offsets[seq_idx] * step_width;
    int pad_data_offset = layout == kBatchLengthWidth
                              ? seq_idx * pad_seq_len * step_width
                              : seq_idx * step_width;
    float scale = 1.0f / static_cast<float>(valid_seq_len);

    for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
      const T* src =
          src_data + (type == kSeqToPad ? seq_data_offset : pad_data_offset);
      T* dst =
          dst_data + (type == kSeqToPad ? pad_data_offset : seq_data_offset);
      memcpy(dst, src, step_width * sizeof(T));
      if (norm_by_len) {
        for (int i = 0; i < step_width; ++i) {
          *(dst + i) *= scale;
        }
      }
      seq_data_offset += seq_cpy_gap;
      pad_data_offset += pad_cpy_gap;
    }
  }
}

template <typename T>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::LoDTensor& seq_tensor,
                  framework::LoDTensor* pad_tensor,
                  const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                  int lod_level = 0, bool norm_by_times = false,
                  const PadLayout layout = kBatchLengthWidth) {
    auto seq_lod = seq_tensor.lod();
    const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
    const auto& seq_tensor_dims = seq_tensor.dims();
    const auto& pad_tensor_dims = pad_tensor->dims();
    if (pad_seq_len == -1) {
      pad_seq_len = MaximumSequenceLength(seq_offsets);
    }
    int step_width = seq_tensor.numel() / seq_tensor_dims[0];

    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
              step_width, layout);
    PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
                   "The numel of 'pad_value' can only be 1 or be equal to the "
                   "'step_width'.");

    // fill padding value
    T* pad_data = pad_tensor->data<T>();
    const T* pad_value_data = pad_value.data<T>();
    if (pad_value.numel() == 1) {
      for (int i = 0; i < pad_tensor->numel(); ++i) {
        pad_data[i] = *pad_value_data;
      }
    } else {
      for (int i = 0; i < pad_tensor->numel(); i += step_width) {
        memcpy(pad_data + i, pad_value_data, step_width * sizeof(T));
      }
    }

    CopyValidData<T>(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len,
                     step_width, norm_by_times, kSeqToPad, layout);
  }
};

...@@ -84,62 +105,35 @@ template <typename T>
class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::LoDTensor& pad_tensor,
                  framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                  int lod_level = 0, bool norm_by_times = false,
                  const PadLayout layout = kBatchLengthWidth) {
    auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
    const auto& seq_tensor_dims = seq_tensor->dims();
    const auto& pad_tensor_dims = pad_tensor.dims();
    if (pad_seq_len == -1) {
      pad_seq_len = MaximumSequenceLength(seq_offsets);
    }
    int step_width = seq_tensor->numel() / seq_tensor_dims[0];

    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
              step_width, layout);

    CopyValidData<T>(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len,
                     step_width, norm_by_times, kPadToSeq, layout);
  }
};

template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;

}  // namespace math
}  // namespace operators
......
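The two PadLayout values only change where consecutive steps of one sequence land in the padded buffer. A worked example of the offsets CopyValidData produces, with illustrative numbers not taken from the source:

// Two sequences with lengths 2 and 3, step_width = 4, pad_seq_len = 3.
// kBatchLengthWidth, pad tensor shape [seq_num, pad_seq_len, step_width]:
//   seq 0 starts at 0 * 3 * 4 = 0,  steps at offsets 0 and 4 (gap = 4);
//   seq 1 starts at 1 * 3 * 4 = 12, steps at offsets 12, 16, 20.
// kLengthBatchWidth, pad tensor shape [pad_seq_len, seq_num, step_width]:
//   seq 0 starts at 0 * 4 = 0, steps at offsets 0 and 8 (gap = 2 * 4 = 8);
//   seq 1 starts at 1 * 4 = 4, steps at offsets 4, 12, 20.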
...@@ -19,41 +19,32 @@ namespace paddle {
namespace operators {
namespace math {

template <typename T, CopyType Type>
__global__ void SequencePaddingKernel(
    T* dst, const T* src, const T* pad_value, bool is_constant_pad,
    const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len,
    const size_t step_width, bool norm_by_len, const PadLayout layout) {
  size_t seq_idx = blockIdx.y;
  size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];

  size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y;
  size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width;
  size_t pad_data_offset = layout == kBatchLengthWidth
                               ? (seq_idx * pad_seq_len + step_idx) * step_width
                               : (step_idx * seq_num + seq_idx) * step_width;

  T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset);
  const T* src_data =
      src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);

  if (step_idx < seq_len) {
    float scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
    for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
      dst_data[i] = scale * src_data[i];
    }
  } else if (step_idx < pad_seq_len && Type == kSeqToPad) {
    for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
      dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i];
    }
  }
}

...@@ -62,74 +53,59 @@ template <typename T>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::LoDTensor& seq_tensor,
                  framework::LoDTensor* pad_tensor,
                  const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                  int lod_level = 0, bool norm_by_times = false,
                  const PadLayout layout = kBatchLengthWidth) {
    auto seq_lod = seq_tensor.lod();
    const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
    const auto& seq_tensor_dims = seq_tensor.dims();
    const auto& pad_tensor_dims = pad_tensor->dims();
    int max_seq_len = MaximumSequenceLength(seq_offsets);
    if (pad_seq_len == -1) {
      pad_seq_len = max_seq_len;
    }
    PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len,
                      "The pad_seq_len must be equal to or greater than the "
                      "original max sequence length.");
    int step_width = seq_tensor.numel() / seq_tensor_dims[0];
    int seq_num = seq_offsets.size() - 1;

    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
              step_width, layout);
    PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
                   "The numel of 'pad_value' can only be 1 or be equal to the "
                   "'step_width'.");

    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
      TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor);
      pad_tensor->Resize(pad_tensor_dims);
      return;
    }

    const int kBlockSize = 512;

    /* At least use 32 threads to copy sequence_width elements,
     * and at least 8 elements for each thread.
     */
    size_t block_dim_x =
        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
    size_t block_dim_y = kBlockSize / block_dim_x;
    dim3 threads(block_dim_x, block_dim_y);

    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
    size_t grid_dim_y = seq_num;
    dim3 grid(grid_dim_x, grid_dim_y);

    const T* seq_data = seq_tensor.data<T>();
    T* pad_data = pad_tensor->data<T>();
    const T* pad_value_data = pad_value.data<T>();

    SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
        pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
        seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
        step_width, norm_by_times, layout);
  }
};

...@@ -137,79 +113,62 @@ template <typename T>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::LoDTensor& pad_tensor,
                  framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                  int lod_level = 0, bool norm_by_times = false,
                  const PadLayout layout = kBatchLengthWidth) {
    auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
    const auto& seq_tensor_dims = seq_tensor->dims();
    const auto& pad_tensor_dims = pad_tensor.dims();
    int max_seq_len = MaximumSequenceLength(seq_offsets);
    if (pad_seq_len == -1) {
      pad_seq_len = max_seq_len;
    }
    int step_width = seq_tensor->numel() / seq_tensor_dims[0];
    int seq_num = seq_offsets.size() - 1;

    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
              step_width, layout);

    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
      TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor);
      seq_tensor->Resize(seq_tensor_dims);
      return;
    }

    const int kBlockSize = 512;

    /* At least use 32 threads to copy sequence_width elements,
     * and at least 8 elements for each thread.
     */
    size_t block_dim_x =
        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
    size_t block_dim_y = kBlockSize / block_dim_x;
    dim3 threads(block_dim_x, block_dim_y);

    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
    size_t grid_dim_y = seq_num;
    dim3 grid(grid_dim_x, grid_dim_y);

    const T* pad_data = pad_tensor.data<T>();
    T* seq_data = seq_tensor->data<T>();

    SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
        seq_data, pad_data, nullptr, false,
        seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
        step_width, norm_by_times, layout);
  }
};

template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;

}  // namespace math
}  // namespace operators
......
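The launch-configuration expression used by both functors above is dense; an equivalent rewrite (a readability sketch, not from the source, assuming the same `step_width` variable) shows what it computes:

// block_dim_x = min(ceil(step_width / 8) rounded up to a multiple of 32, 512):
// each thread copies at least 8 elements, each row of threads is a whole
// number of 32-thread warps, and the block stays within 512 threads.
int per_thread = (step_width + 7) / 8;        // == (step_width + 7) >> 3
int warps = (per_thread + 31) / 32;           // == (... + 31) >> 5
int block_dim_x = std::min(warps * 32, 512);  // == (... << 5), capped at 512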
...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once

#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"
...@@ -22,17 +23,33 @@ namespace paddle {
namespace operators {
namespace math {

enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };

enum CopyType { kSeqToPad, kPadToSeq };

inline static size_t MaximumSequenceLength(
    const framework::Vector<size_t>& seq_offset) {
  size_t seq_num = seq_offset.size() - 1;
  size_t max_seq_len = 0;
  for (size_t i = 0; i < seq_num; ++i) {
    max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
  }
  return max_seq_len;
}

inline static void CheckDims(const framework::DDim& seq_tensor_dims,
                             const framework::DDim& pad_tensor_dims,
                             const framework::Vector<size_t>& seq_offset,
                             int64_t padded_seq_len, int64_t step_width,
                             const PadLayout& layout) {
  PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
                    "Value of 1st dimension of the sequence tensor should be "
                    "equal to sum of lengths of all sequences.");

  PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
                     seq_tensor_dims.size() == pad_tensor_dims.size(),
                 "pad_tensor's rank should be 1 greater than seq_tensor's "
                 "rank, or be equal with it.");
}

/*
...@@ -64,15 +81,22 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod,
template <typename DeviceContext, typename T>
class PaddingLoDTensorFunctor {
 public:
  void operator()(const DeviceContext& context,
                  const framework::LoDTensor& seq_tensor,
                  framework::LoDTensor* pad_tensor,
                  const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                  int lod_level = 0, bool norm_by_times = false,
                  const PadLayout layout = kBatchLengthWidth);
};

template <typename DeviceContext, typename T>
class UnpaddingLoDTensorFunctor {
 public:
  void operator()(const DeviceContext& context,
                  const framework::LoDTensor& pad_tensor,
                  framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                  int lod_level = 0, bool norm_by_times = false,
                  const PadLayout layout = kBatchLengthWidth);
};

}  // namespace math
......
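seq_offset above is the absolute-offset form of one LoD level, so sequence lengths are adjacent differences. A tiny illustrative example, not taken from the source:

framework::Vector<size_t> seq_offset = {0, 2, 5, 9};  // 3 seqs of lengths 2, 3, 4
size_t max_len = MaximumSequenceLength(seq_offset);   // == 4
// With layout kBatchLengthWidth and step width W, a matching pad tensor would
// have shape [3, max_len, W]; CheckDims also accepts a pad tensor whose rank
// equals the sequence tensor's rank.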
...@@ -23,7 +23,9 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
  paddle::framework::LoDTensor cpu_seq_back;
  paddle::framework::LoDTensor seq;
  paddle::framework::LoDTensor seq_back;
  paddle::framework::LoDTensor padding;
  paddle::framework::LoDTensor cpu_pad_value;
  paddle::framework::LoDTensor pad_value;

  const size_t level = lod.size() - 1;
  auto seq_dims =
...@@ -46,20 +48,33 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
  }

  const size_t max_sequence_length =
      paddle::operators::math::MaximumSequenceLength(lod[level]);
  const size_t num_sequences = lod[level].size() - 1;
  auto padding_dims =
      paddle::framework::make_ddim({static_cast<int64_t>(max_sequence_length),
                                    static_cast<int64_t>(num_sequences),
                                    static_cast<int64_t>(sequence_width)});

  padding.mutable_data<T>(padding_dims, *place);

  T* pad_value_data =
      cpu_pad_value.mutable_data<T>({1}, paddle::platform::CPUPlace());
  *pad_value_data = static_cast<T>(0);
  if (paddle::platform::is_cpu_place(*place)) {
    pad_value = cpu_pad_value;
  } else {
    TensorCopySync(cpu_pad_value, *place, &pad_value);
  }

  paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
      *context, seq, &padding, pad_value, -1, 0, false,
      paddle::operators::math::kLengthBatchWidth);

  seq_back.set_lod(lod);
  seq_back.mutable_data<T>(seq_dims, *place);
  paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
      *context, padding, &seq_back, -1, 0, false,
      paddle::operators::math::kLengthBatchWidth);

  if (paddle::platform::is_cpu_place(*place)) {
    cpu_seq_back = seq_back;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pad_constant_like_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class PadConstantLikeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of PadConstantLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"),
"Input(Y) of PadConstantLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of PadConstantLikeOp should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto y_dim = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(),
"The dimension of X and Y should be the same.");
for (int i = 0; i < x_dim.size(); ++i) {
PADDLE_ENFORCE_GE(x_dim[i], y_dim[i]);
}
ctx->SetOutputDim("Out", x_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
ctx.device_context());
}
};
class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input of pad_constant_like op. "
"The input should be a k-D tensor(k > 0 and k < 7)");
AddInput("Y",
"The input of pad_constant_like op. "
"The input should be a k-D tensor(k > 0 and k < 7)");
AddOutput("Out",
"The output of pad_constant_like op. "
"A tensor with the same shape as X.");
AddAttr<float>("pad_value",
"(float, default 0.0) "
"The value to fill the padded areas.")
.SetDefault(0.0f);
AddComment(R"DOC(
PadConstantLikeOp Operator.
Pad input(Y) with a pad_value, the number of values padded to the edges of each
axis is specified by the difference of the shape of X and Y.
((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for
each axis.
The input should be a k-D tensor(k > 0 and k < 7). As an example:
case1:
Given:
X = [[1, 2],
[3, 4],
[1, 2],
[3, 4]],
X.shape = (4, 2)
Y = [[5, 6],
[7, 8]],
Y.shape = (2, 2)
And
pad_value = 0,
Return:
Out = [[5, 6],
[7, 8],
[0, 0],
[0, 0]]
Out.shape = (4, 2)
case2:
Given:
X = [[[[ 0, 1, 2],
[ 3, 4, 5]],
[[ 6, 7, 8],
[ 9, 10, 11]],
[[12, 13, 14],
[15, 16, 17]]],
[[[18, 19, 20],
[21, 22, 23]],
[[24, 25, 26],
[27, 28, 29]],
[[30, 31, 32],
[33, 34, 35]]]]
X.shape = (2, 3, 2, 3)
Y = [[[[35, 36, 37]],
[[38, 39, 40]],
[[41, 42, 43]]]]
Y.shape = (1, 3, 1, 3)
And
pad_value = -1,
Return:
Out = [[[[35, 36, 37],
[-1, -1, -1]],
[[38, 39, 40],
[-1, -1, -1]],
[[41, 42, 43],
[-1, -1, -1]]],
[[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]]]]
Out.shape = (2, 3, 2, 3)
)DOC");
}
};
class PadConstantLikeOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto y_dim = ctx->GetInputDim("Y");
auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(),
"The dimension of Out@GRAD and Y should be the same.");
auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dim);
ctx->ShareLoD("Y", /*->*/ y_grad_name);
for (int i = 0; i < y_dim.size(); ++i) {
PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i]);
}
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
ctx.device_context());
}
};
class PadConstantLikeOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *bind = new framework::OpDesc();
bind->SetType("pad_constant_like_grad");
bind->SetInput("Y", Input("Y"));
bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
bind->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
bind->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(bind);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOp,
ops::PadConstantLikeOpMaker, ops::PadConstantLikeOpGradMaker);
REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad);
REGISTER_OP_CPU_KERNEL(
pad_constant_like,
ops::PadConstantLikeKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadConstantLikeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
pad_constant_like_grad,
ops::PadConstantLikeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadConstantLikeGradKernel<paddle::platform::CPUDeviceContext, double>);
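The PadConstantLikeKernel (defined in the header that follows) derives the pad widths directly from the two shapes, padding only on the high side of each axis. A worked restatement of DOC case1 above, with no new semantics:

// X.shape = (4, 2), Y.shape = (2, 2)
// pads = {0, 4 - 2, 0, 2 - 2} = {0, 2, 0, 0}
// i.e. append 2 rows after Y along axis 0, nothing along axis 1, and fill the
// appended rows with pad_value.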
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/pad_constant_like_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
pad_constant_like,
ops::PadConstantLikeKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadConstantLikeKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
pad_constant_like_grad,
ops::PadConstantLikeGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadConstantLikeGradKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/padding.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class PadConstantLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto in_x = context.Input<framework::Tensor>("X");
auto in_y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
if (in_x->dims() == in_y->dims()) {
// TensorCopy(in_y, context.GetPlace(), context, out);
out->ShareDataWith(*in_y);
return;
}
T pad_value = context.Attr<T>("pad_value");
out->mutable_data<T>(context.GetPlace());
int rank = context.Input<framework::Tensor>("X")->dims().size();
std::vector<int> pads(rank * 2, 0);
for (int j = 0; j < rank; ++j) {
pads[j * 2] = 0;
pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
}
math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value,
*in_y, out);
}
};
template <typename DeviceContext, typename T>
class PadConstantLikeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto in_y = context.Input<framework::Tensor>("Y");
auto in_dout =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_y = context.Output<framework::Tensor>(framework::GradVarName("Y"));
if (d_y == nullptr) {
return;
}
if (in_dout->dims() == in_y->dims()) {
// TensorCopy(in_dout, context.GetPlace(), context, d_y);
d_y->ShareDataWith(*in_dout);
return;
}
d_y->mutable_data<T>(context.GetPlace());
int rank = in_dout->dims().size();
std::vector<int> pads(static_cast<size_t>(rank) * 2, 0);
for (int j = 0; j < rank; ++j) {
pads[j * 2] = 0;
pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]);
}
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *in_dout,
d_y);
}
};
} // namespace operators
} // namespace paddle
...@@ -18,117 +18,44 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/padding.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename DeviceContext, typename T>
class PadKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto pads = context.Attr<std::vector<int>>("paddings");
    T pad_value = context.Attr<T>("pad_value");
    auto* x = context.Input<Tensor>("X");
    auto* out = context.Output<Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());

    int rank = x->dims().size();
    math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value, *x,
                                           out);
  }
};

template <typename DeviceContext, typename T>
class PadGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto pads = context.Attr<std::vector<int>>("paddings");
    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
    if (d_x == nullptr) {
      return;
    }
    d_x->mutable_data<T>(context.GetPlace());
    int rank = d_out->dims().size();
    math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *d_out,
                                               d_x);
  }
};
......
...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/scale_op.h"
#include <string>
#include "paddle/fluid/operators/detail/safe_ref.h"

namespace paddle {
namespace operators {
...@@ -52,6 +55,21 @@ $$Out = scale*X$$
  }
};

class ScaleOpVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc &op_desc,
                  framework::BlockDesc *block) const override {
    auto &in_var_name = op_desc.Input("X").front();
    auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name));

    auto out_var_name = op_desc.Output("Out").front();
    auto *out_var = block->FindVarRecursive(out_var_name);

    out_var->SetType(in_var.GetType());
    out_var->SetDataType(in_var.GetDataType());
  }
};

class ScaleGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
...@@ -71,7 +89,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;

REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker,
                  ops::ScaleOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(
    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
......
@@ -22,17 +22,29 @@ namespace operators {
 template <typename DeviceContext, typename T>
 class ScaleKernel : public framework::OpKernel<T> {
  public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    tensor->mutable_data<T>(in->place());
-    auto scale = static_cast<T>(context.Attr<float>("scale"));
-    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto* in_var = ctx.InputVar("X");
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* out_var = ctx.OutputVar("Out");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(in->place());
+    PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
+                      "in and out should have the same dim");
+    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
+      auto& in_slr = in_var->Get<framework::SelectedRows>();
+      auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
+      out_slr->set_rows(in_slr.rows());
+      out_slr->set_height(in_slr.height());
+    }
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
     auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& dev =
-        *context.template device_context<DeviceContext>().eigen_device();
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
     eigen_out.device(dev) = scale * eigen_in;
   }
 };
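A side note on the SelectedRows branch above: only the sparse metadata (rows, height) is forwarded, while the scaling itself runs on the flattened values. A minimal NumPy sketch of that invariant (illustrative only, not Paddle's API; all values hypothetical):

```python
import numpy as np

# A SelectedRows value is roughly (rows, height, values); scaling must
# touch only `values`, while `rows` and `height` pass through unchanged.
rows, height = [0, 4, 7], 10                # hypothetical sparse metadata
values = np.ones((3, 8), dtype=np.float32)  # one dense row per index
scale = 2.0
out_rows, out_height = list(rows), height   # metadata preserved
out_values = scale * values                 # only the payload is scaled
assert out_values.shape == values.shape
```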
...
@@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase {
 class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
  void Make() {
+    AddInput("X", "(Any) Dummy inputs, used for control dependency")
+        .AsDuplicable();
+    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
+        .AsDuplicable();
    AddComment(R"DOC(
SendBarrier operator
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h"
namespace paddle {
namespace operators {
class SequencePadOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequencePadOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PadValue"),
"Input(PadValue) of SequencePadOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequencePadOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(x_dims.size(), 2,
"The rank of Input(x) can't be less than 2.");
auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size());
auto pad_value_dims = ctx->GetInputDim("PadValue");
PADDLE_ENFORCE(pad_value_dims == framework::make_ddim({1}) ||
pad_value_dims == time_step_dims,
"The Input(PadValue) must be a scalar or a tensor whose "
"shape equals to time steps in sequences");
int out_dim_0 = -1;
int out_dim_1 = -1;
if (ctx->IsRuntime()) {
// run time
framework::Variable* x_var =
boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
const auto& x_lod = x_var->Get<LoDTensor>().lod();
PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info.");
const auto& x_lod_0 = x_lod[0];
PADDLE_ENFORCE_GE(x_lod_0.size(), 2,
"The Input(X)'s lod info is corrupted.");
PADDLE_ENFORCE_EQ(
x_dims[0], static_cast<int64_t>(x_lod_0.back()),
"The Input(X)'s lod info mismatches the actual tensor shape.");
int seq_num = x_lod_0.size() - 1;
int max_seq_len = math::MaximumSequenceLength(x_lod_0);
int padded_length = ctx->Attrs().Get<int>("padded_length");
if (padded_length == -1) {
padded_length = max_seq_len;
}
PADDLE_ENFORCE_GE(padded_length, max_seq_len,
"The Attr(padded_length) must be -1 or an int greater "
"than the length of the longest original sequence.");
out_dim_0 = seq_num;
out_dim_1 = padded_length;
} else {
// compile time
framework::VarDesc* x_desc =
boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]);
PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1);
}
std::vector<int> out_dims_vec{out_dim_0, out_dim_1};
auto time_step_dims_vec = framework::vectorize2int(time_step_dims);
out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(),
time_step_dims_vec.end());
ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
}
};
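At run time, the InferShape above reduces to a simple rule: Out.shape = [seq_num, padded_length] + x.shape[1:]. A small Python sketch with hypothetical values:

```python
# Hypothetical LoD and shapes, mirroring the runtime branch of InferShape.
x_shape = [5, 2]                 # 5 time steps in the batch, each of width 2
lod0 = [0, 2, 5]                 # two sequences, of lengths 2 and 3
seq_num = len(lod0) - 1
max_seq_len = max(b - a for a, b in zip(lod0[:-1], lod0[1:]))
padded_length = max_seq_len      # attr padded_length == -1 falls back to this
out_shape = [seq_num, padded_length] + x_shape[1:]
print(out_shape)                 # [2, 3, 2]
```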
class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(LoDTensor, default LoDTensor<float>) Input variable which "
"should contain lod information.");
AddInput("PadValue",
"(LoDTensor), this Tensor holds values that will be fill into "
"padded steps. It can be a scalar or a tensor whose shape equals "
"to time steps in sequences. If it's a scalar, it will be "
"automatically broadcasted to the shape of time step.");
AddOutput(
"Out",
"(LoDTensor) The output vairable, which contains padded sequences.");
AddAttr<int>(
"padded_length",
"The length of padded sequences. It can be setted to -1 or "
"any positive int. When it is -1, all sequences will be padded up to "
"the length of the longest one among them; when it a certain positive "
"value, it must be greater than the length of the longest original "
"sequence.")
.SetDefault(-1);
AddComment(R"DOC(
Sequence Pad Operator
This operator pads sequences in the same batch to a consistent length.
The length is specified by attribute 'padded_length'. New elements,
whose values are specified by input 'PadValue', will be appended to
the end of each sequence, to make their final lengths consistent.
Following are cases to better explain how this works:
Case 1:
Given a 1-level LoDTensor input(X):
X.lod = [[0, 2, 5]]
X.data = [a, b, c, d, e]
and Input(PadValue):
PadValue.data = [0]
and attribute 'padded_length' = 4,
then we get LoDTensor:
Out.data = [[a, b, 0, 0],
[c, d, e, 0]]
Case 2:
Given a 1-level LoDTensor input(X):
X.lod = [[0, 2, 5]]
X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
and Input(PadValue):
PadValue.data = [0]
and attribute 'padded_length' = -1, which means using the length
of the longest input sequence (3 in this case),
then we get LoDTensor:
Out.data = [[[a1, a2], [b1, b2], [0, 0]],
[[c1, c2], [d1, d2], [e1, e2]]]
Case 3:
Given a 1-level LoDTensor input(X):
X.lod = [[0, 2, 5]]
X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
and Input(PadValue):
PadValue.data = [p1, p2]
and attribute 'padded_length' = -1, which means using the length
of the longest input sequence (3 in this case),
then we get LoDTensor:
Out.data = [[[a1, a2], [b1, b2], [p1, p2]],
[[c1, c2], [d1, d2], [e1, e2]]]
)DOC");
}
};
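Case 1 above can be reproduced in a few lines of plain Python; this is an illustrative simulation of the padding rule, not the operator itself:

```python
# Simulate Case 1: lod = [[0, 2, 5]], pad value 0, padded_length = 4.
x = ['a', 'b', 'c', 'd', 'e']
lod0 = [0, 2, 5]
padded_length = 4
out = []
for start, end in zip(lod0[:-1], lod0[1:]):
    seq = x[start:end]
    out.append(seq + [0] * (padded_length - len(seq)))
print(out)  # [['a', 'b', 0, 0], ['c', 'd', 'e', 0]]
```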
class SequencePadGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequencePadGradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) of SequencePadGradOp should not be null.");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp);
REGISTER_OP_CPU_KERNEL(
sequence_pad,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
sequence_pad_grad,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
sequence_pad,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
sequence_pad_grad,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using LoD = framework::LoD;
template <typename DeviceContext, typename T>
class SequencePadOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* x = ctx.Input<LoDTensor>("X");
auto* out = ctx.Output<LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto* pad_value = ctx.Input<LoDTensor>("PadValue");
int padded_length = ctx.Attr<int>("padded_length");
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *x, out, *pad_value,
padded_length, 0, false, math::kBatchLengthWidth);
}
};
template <typename DeviceContext, typename T>
class SequencePadGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
if (d_x) {
const auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
d_x->mutable_data<T>(ctx.GetPlace());
int padded_length = ctx.Attr<int>("padded_length");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *d_out, d_x,
padded_length, 0, false, math::kBatchLengthWidth);
}
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/unstack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
USE_OP(stack);
REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker,
ops::UnStackOpInferShape, ops::UnStackGradOpDescMaker);
REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp,
ops::UnStackOpGradInferShape);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class UnStackOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist.");
int axis = ctx->Attrs().Get<int>("axis");
int num = ctx->Attrs().Get<int>("num");
auto x_dim = ctx->GetInputDim("X");
int rank = x_dim.size();
PADDLE_ENFORCE(axis >= -rank && axis < rank,
"Attr(axis) must be inside [-rank, rank), where rank = %d",
rank);
if (axis < 0) axis += rank;
PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast<size_t>(num),
"Number of Outputs(Y) is wrong");
if (x_dim[axis] > 0) {
PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong");
}
auto vec = framework::vectorize2int(x_dim);
vec.erase(vec.begin() + axis);
ctx->SetOutputsDim("Y", std::vector<framework::DDim>( // NOLINT
x_dim[axis], framework::make_ddim(vec)));
}
};
class UnStackOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input of unstack op.");
AddOutput("Y", "The output of unstack op.").AsDuplicable();
AddAttr<int>("axis", "The axis along which Input(X) should be unstacked.")
.SetDefault(0);
AddAttr<int>("num", "The number of outputs(Y).").GreaterThan(0);
AddComment(R"DOC(
UnStack Operator.
UnStack Input(X) into several tensors along Attr(axis).
)DOC");
}
};
class UnStackOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto stack_grad_op = framework::OpRegistry::CreateOp(
"stack_grad", {{framework::GradVarName("Y"), {Input("X")}}},
{{framework::GradVarName("X"), Outputs("Y")}}, Attrs());
stack_grad_op->Run(scope, place);
}
};
class UnStackOpGradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0,
"Number of Inputs(Y@Grad) must be larger than 0");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad) must exist.");
auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y"));
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0],
"Dims of all Inputs(Y@Grad) must be the same");
}
int axis = ctx->Attrs().Get<int>("axis");
int rank = input_dims[0].size();
PADDLE_ENFORCE(
axis >= -(rank + 1) && axis < rank + 1,
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank);
if (axis < 0) axis += (rank + 1);
auto vec = framework::vectorize2int(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), framework::make_ddim(vec));
}
};
class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("unstack_grad");
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
class UnStackGradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto stack_op = framework::OpRegistry::CreateOp(
"stack", {{"X", Inputs(framework::GradVarName("Y"))}},
{{"Y", {Output(framework::GradVarName("X"))}}}, Attrs());
stack_op->Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
@@ -153,17 +153,29 @@ class WarpCTCKernel : public framework::OpKernel<T> {
         framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
     // warpctc needs sequences data stored in transposed padding format
-    Tensor warpctc_logits;
+    LoDTensor warpctc_logits;
     const size_t max_sequence_length =
-        math::MaximumSequenceLength(logits_lod, level);
+        math::MaximumSequenceLength(logits_lod[level]);
     auto warpctc_logits_dims =
         framework::make_ddim({static_cast<int64_t>(max_sequence_length),
                               static_cast<int64_t>(num_sequences),
                               static_cast<int64_t>(sequence_width)});
     warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
+    LoDTensor cpu_pad_value;
+    T* pad_value_data =
+        cpu_pad_value.mutable_data<T>({1}, platform::CPUPlace());
+    *pad_value_data = static_cast<T>(0);
+    LoDTensor pad_value;
+    if (platform::is_cpu_place(ctx.GetPlace())) {
+      pad_value = cpu_pad_value;
+    } else {
+      TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value);
+    }
     math::PaddingLoDTensorFunctor<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), *logits, &warpctc_logits,
-        false);
+        pad_value, -1, 0, false /* norm_by_times */, math::kLengthBatchWidth);
     const T* warpctc_logits_data = warpctc_logits.data<T>();
     std::vector<int> warpctc_label_lengths(num_sequences);
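The "transposed padding format" mentioned in the comment is math::kLengthBatchWidth: time-major [max_sequence_length, num_sequences, width] rather than the batch-major layout sequence_pad uses. A NumPy sketch of the relationship (shapes hypothetical):

```python
import numpy as np

# Batch-major (kBatchLengthWidth) -> time-major (kLengthBatchWidth).
num_sequences, max_sequence_length, sequence_width = 4, 7, 10
batch_major = np.zeros((num_sequences, max_sequence_length, sequence_width),
                       dtype=np.float32)
time_major = np.transpose(batch_major, (1, 0, 2))
print(time_major.shape)  # (7, 4, 10)
```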
@@ -209,15 +221,15 @@ template <typename DeviceContext, typename T>
 class WarpCTCGradKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* warpctc_grad = ctx.Input<Tensor>("WarpCTCGrad");
+    auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
     auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
     const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
     logits_grad->mutable_data<T>(ctx.GetPlace());
     bool norm_by_times = ctx.Attr<bool>("norm_by_times");
     math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), logits_grad,
-        *warpctc_grad, norm_by_times);
+        ctx.template device_context<DeviceContext>(), *warpctc_grad,
+        logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
     const T* loss_grad_data = loss_grad->data<T>();
     math::ScaleLoDTensorFunctor<DeviceContext, T>()(
...
+if (NOT WIN32)
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)
@@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD
     COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
     COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif(NOT WIN32)
 if(WITH_GPU)
     nv_library(enforce SRCS enforce.cc)
@@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+if (NOT WIN32)
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
+endif(NOT WIN32)
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
...
@@ -22,9 +22,13 @@ limitations under the License. */
 #ifdef __APPLE__
 #include <sys/sysctl.h>
 #include <sys/types.h>
+#elif defined(_WIN32)
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#include <windows.h>
 #else
 #include <unistd.h>
-#endif
+#endif  // _WIN32
 #include <algorithm>
 #include "gflags/gflags.h"
@@ -32,16 +36,20 @@ limitations under the License. */
 DEFINE_double(fraction_of_cpu_memory_to_use, 1,
               "Default use 100% of CPU memory for PaddlePaddle,"
               "reserve the rest for page tables, etc");
+#if !defined(_WIN32)
 DEFINE_uint64(initial_cpu_memory_in_mb,
 #ifdef PADDLE_WITH_MKLDNN
               /* Aligned with mozga-intel, MKLDNN need at least 5000 MB
                * to obtain the best performance*/
-              5000,
+              5000ul,
 #else
-              500,
+              500ul,
 #endif
               "Initial CPU memory for PaddlePaddle, in MD unit.");
+#else
+DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
+              "Initial CPU memory for PaddlePaddle, in MD unit.");
+#endif  // !defined(_WIN32)
 DEFINE_double(
     fraction_of_cuda_pinned_memory_to_use, 0.5,
@@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() {
   size_t len = sizeof(size);
   if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
   return 0L;
+#elif defined(_WIN32)
+  MEMORYSTATUSEX sMeminfo;
+  sMeminfo.dwLength = sizeof(sMeminfo);
+  GlobalMemoryStatusEx(&sMeminfo);
+  return sMeminfo.ullTotalPhys;
 #else
   int64_t pages = sysconf(_SC_PHYS_PAGES);
   int64_t page_size = sysconf(_SC_PAGE_SIZE);
...
@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#if !defined(_WIN32)
 #include <sys/time.h>
+#else
+#include <windows.h>
+#endif  // !_WIN32
 #include <time.h>
 #include <chrono>  // NOLINT
 #include <string>
@@ -27,12 +32,15 @@ namespace platform {
 ///////////////////////
 // WARN: Under Development. Don't depend on it yet.
 //////////////////////
+#if !defined(_WIN32)
 inline uint64_t PosixInNsec() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }
+#else
+inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
+#endif  // !_WIN32
 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
...
@@ -16,7 +16,9 @@ if (CUPTI_FOUND)
   list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
+if (NOT WIN32)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
+endif(NOT WIN32)
 if (WITH_MKLML)
   cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()
...
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include <dlfcn.h>
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
@@ -23,6 +21,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/port.h"
 DEFINE_string(cudnn_dir, "",
               "Specify path for loading libcudnn.so. For instance, "
...
@@ -18,6 +18,11 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif  // __GNUC__
+#if defined(_WIN32)
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#endif
 #ifdef PADDLE_WITH_CUDA
 #include <cublas_v2.h>
 #include <cudnn.h>
@@ -117,7 +122,12 @@ struct EOFException : public std::exception {
 // always forces branch prediction of true.
 // This generates faster binary code. __builtin_expect is since C++11.
 // For more details, please check https://stackoverflow.com/a/43870188/724872.
+#if !defined(_WIN32)
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+#else
+// There is no equivalent intrinsic in msvc; fall back to the plain condition.
+#define UNLIKELY(condition) (condition)
+#endif
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
@@ -230,6 +240,7 @@ inline void throw_on_error(T e) {
   throw_on_error(e, "");
 }
+#if !defined(_WIN32)
 #define PADDLE_THROW(...)                    \
   do {                                       \
     throw ::paddle::platform::EnforceNotMet( \
@@ -248,15 +259,28 @@ inline void throw_on_error(T e) {
         __FILE__, __LINE__);                 \
     }                                        \
   } while (false)
-#else
-#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
-#endif
 #define PADDLE_THROW_EOF()                                                     \
   do {                                                                         \
     throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
                                            __LINE__);                          \
   } while (false)
+#else
+#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__)
+#endif  // REPLACE_ENFORCE_GLOG
+#else   // !_WIN32
+// disable enforce, caused by the variadic macro exception error
+#define PADDLE_THROW(x)                                      \
+  do {                                                       \
+    throw std::make_exception_ptr(                           \
+        std::runtime_error("Windows disable the enforce.")); \
+  } while (false)
+#define PADDLE_ENFORCE(x, ...) x
+#endif  // !_WIN32
 /*
  * Some enforce helpers here, usage:
  *    int a = 1;
...
@@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
 void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
+#if !defined(_WIN32)
 struct RecordEvent {
   RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
@@ -94,6 +95,15 @@ struct RecordBlock {
   std::string name_;
   uint64_t start_ns_;
 };
+#else
+// Windows does not support the profiler yet; provide no-op stand-ins.
+struct RecordEvent {
+  RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
+};
+struct RecordBlock {
+  explicit RecordBlock(int block_id) {}
+};
+#endif
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
...
@@ -234,6 +234,7 @@ void BindVarDsec(pybind11::module *m) {
   pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
       .value("BOOL", pd::proto::VarType::BOOL)
       .value("UINT8", pd::proto::VarType::UINT8)
+      .value("INT8", pd::proto::VarType::INT8)
       .value("INT16", pd::proto::VarType::INT16)
       .value("INT32", pd::proto::VarType::INT32)
      .value("INT64", pd::proto::VarType::INT64)
...
@@ -130,6 +130,7 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<bool>)
       .def("set", PyCPUTensorSetFromArray<uint16_t>)
       .def("set", PyCPUTensorSetFromArray<uint8_t>)
+      .def("set", PyCPUTensorSetFromArray<int8_t>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
@@ -138,6 +139,7 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCUDATensorSetFromArray<bool>)
       .def("set", PyCUDATensorSetFromArray<uint16_t>)
       .def("set", PyCUDATensorSetFromArray<uint8_t>)
+      .def("set", PyCUDATensorSetFromArray<int8_t>)
       .def("set", PyCUDAPinnedTensorSetFromArray<float>)
       .def("set", PyCUDAPinnedTensorSetFromArray<int>)
       .def("set", PyCUDAPinnedTensorSetFromArray<double>)
@@ -145,6 +147,7 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCUDAPinnedTensorSetFromArray<bool>)
       .def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
       .def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<int8_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("_set_float_element", TensorSetElement<float>)
...
@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
   auto buffer_info =
       details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
-                                  uint8_t, platform::float16>()(tensor);
+                                  uint8_t, int8_t, platform::float16>()(tensor);
   return buffer_info;
 }
...
@@ -335,12 +335,18 @@ function assert_api_not_changed() {
     fi
     python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
     deactivate
+}
+function assert_api_spec_approvals() {
+    if [ -z ${BRANCH} ]; then
+        BRANCH="develop"
+    fi
-    API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true`
+    API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/API.spec" || true`
     echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
     if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
-        # TODO: curl -H 'Authorization: token ${TOKEN}'
+        # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
-        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews | \
+        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
        python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
        if [ "${APPROVALS}" == "FALSE" ]; then
@@ -622,11 +628,12 @@ function main() {
       cicheck)
         cmake_gen ${PYTHON_ABI:-""}
         build
+        assert_api_not_changed ${PYTHON_ABI:-""}
         run_test
         gen_capi_package
         gen_fluid_inference_lib
         test_fluid_inference_lib
-        assert_api_not_changed ${PYTHON_ABI:-""}
+        assert_api_spec_approvals
         ;;
       *)
         print_usage
...
@@ -24,6 +24,7 @@ set and test set into paddle reader creators.
 from __future__ import print_function
+import numpy as np
 import zipfile
 import paddle.dataset.common
 import re
@@ -150,12 +151,12 @@ def __initialize_meta_info__():
 def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
     fn = __initialize_meta_info__()
-    rand = random.Random(x=rand_seed)
+    np.random.seed(rand_seed)
     with zipfile.ZipFile(file=fn) as package:
         with package.open('ml-1m/ratings.dat') as rating:
             for line in rating:
                 line = cpt.to_text(line, encoding='latin')
-                if (rand.random() < test_ratio) == is_test:
+                if (np.random.random() < test_ratio) == is_test:
                     uid, mov_id, rating, _ = line.strip().split("::")
                     uid = int(uid)
                     mov_id = int(mov_id)
...
@@ -95,6 +95,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
         return core.VarDesc.VarType.INT16
     elif dtype == np.uint8:
         return core.VarDesc.VarType.UINT8
+    elif dtype == np.int8:
+        return core.VarDesc.VarType.INT8
     else:
         raise ValueError("Not supported numpy dtype %s" % dtype)
...
@@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True):
             rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
         })
     if sync:
-        helper.append_op(type="send_barrier", attrs={"endpoints": endpoints})
+        helper.append_op(
+            type="send_barrier",
+            inputs={"X": dummy_output},
+            outputs={"Out": []},
+            attrs={"endpoints": endpoints})
 def Recv(endpoints, get_vars, dummy_input=None, sync=True):
@@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True):
         attrs={"endpoints": endpoints,
                "epmap": epmap})
     if sync:
-        helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints})
+        helper.append_op(
+            type="fetch_barrier",
+            outputs={"Out": get_vars},
+            attrs={"endpoints": endpoints})
     return get_vars
...
@@ -119,10 +119,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
     helper = LayerHelper("auc", **locals())
     auc_out = helper.create_tmp_variable(dtype="float64")
     # make tp, tn, fp, fn persistable, so that can accumulate all batches.
-    tp = helper.create_global_variable(persistable=True, dtype='int64')
-    tn = helper.create_global_variable(persistable=True, dtype='int64')
-    fp = helper.create_global_variable(persistable=True, dtype='int64')
-    fn = helper.create_global_variable(persistable=True, dtype='int64')
+    tp = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
+    tn = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
+    fp = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
+    fn = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
     for var in [tp, tn, fp, fn]:
         helper.set_variable_initializer(
             var, Constant(
...
@@ -17,6 +17,7 @@ All layers just related to the neural network.
 from __future__ import print_function
+import numpy as np
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
@@ -24,7 +25,6 @@ from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc
 from .tensor import concat
 from . import utils
-import random
 from .. import unique_name
 from functools import reduce
@@ -54,6 +54,7 @@ __all__ = [
     'conv2d_transpose',
     'conv3d_transpose',
     'sequence_expand',
+    'sequence_pad',
     'lstm_unit',
     'reduce_sum',
     'reduce_mean',
@@ -89,6 +90,7 @@ __all__ = [
     'lod_reset',
     'lrn',
     'pad',
+    'pad_constant_like',
     'label_smooth',
     'roi_pool',
     'dice_loss',
@@ -107,6 +109,7 @@ __all__ = [
     'flatten',
     'sequence_mask',
     'stack',
+    'unstack',
 ]
@@ -2657,6 +2660,51 @@ def sequence_expand(x, y, ref_level=-1, name=None):
     return tmp
@templatedoc()
def sequence_pad(x, pad_value, maxlen=None):
"""
${comment}
Args:
x(Variable): Input variable which should contain lod information.
pad_value(Variable): The Variable that holds values that will be filled
into padded steps. It can be a scalar or a tensor whose shape
equals that of a time step in the sequences. If it's a scalar, it
will be automatically broadcast to the shape of a time step.
maxlen(int, default None): The length of padded sequences. It can be
None or any positive int. When it is None, all sequences will be
padded up to the length of the longest one among them; when it is a
positive value, it must be no less than the length of the
longest original sequence.
Returns:
Variable: The padded sequence batch. All sequences have the same length.
Examples:
.. code-block:: python
import numpy
x = fluid.layers.data(name='x', shape=[10, 5],
dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=numpy.array([0]))
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
"""
helper = LayerHelper('sequence_pad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
if maxlen is None:
maxlen = -1
helper.append_op(
type='sequence_pad',
inputs={'X': x,
'PadValue': pad_value},
outputs={'Out': out},
attrs={'padded_length': maxlen})
return out
def beam_search(pre_ids,
                pre_scores,
                ids,
@@ -4793,6 +4841,86 @@ def pad(x, paddings, pad_value=0., name=None):
    return out
def pad_constant_like(x, y, pad_value=0., name=None):
"""
Pad input(Y) with :attr:`pad_value`. The number of values padded to
the end of each axis is given by the difference between the shapes
of X and Y: ((0, shape_x_0 - shape_y_0), ..., (0, shape_x_n - shape_y_n)),
one pad width per axis. The input should be a k-D
tensor (0 < k < 7).
See below for an example.
.. code-block:: text
Given:
X = [[[[ 0, 1, 2],
[ 3, 4, 5]],
[[ 6, 7, 8],
[ 9, 10, 11]],
[[12, 13, 14],
[15, 16, 17]]],
[[[18, 19, 20],
[21, 22, 23]],
[[24, 25, 26],
[27, 28, 29]],
[[30, 31, 32],
[33, 34, 35]]]]
X.shape = (2, 3, 2, 3)
Y = [[[[35, 36, 37]],
[[38, 39, 40]],
[[41, 42, 43]]]]
Y.shape = (1, 3, 1, 3)
And
pad_value = -1,
Return:
Out = [[[[35, 36, 37],
[-1, -1, -1]],
[[38, 39, 40],
[-1, -1, -1]],
[[41, 42, 43],
[-1, -1, -1]]],
[[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]]]]
Out.shape = (2, 3, 2, 3)
Args:
x (Variable): The input tensor variable.
y (Variable): The input tensor variable.
pad_value (float): The constant value used to pad.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The padded tensor variable.
Examples:
.. code-block:: python
# x is a rank 4 tensor variable, x.shape = (2, 3, 2, 3)
# y is a rank 4 tensor variable, y.shape = (1, 3, 1, 3)
out = fluid.layers.pad_constant_like(x=x, y=y, pad_value=0.)
# out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3]
"""
helper = LayerHelper('pad_constant_like', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='pad_constant_like',
inputs={'X': x,
'Y': y},
outputs={'Out': out},
attrs={'pad_value': float(pad_value)})
return out
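The behaviour above matches numpy.pad with per-axis widths (0, shape_x_i - shape_y_i); a short NumPy sketch (illustrative, not the operator itself):

```python
import numpy as np

x = np.zeros((2, 3, 2, 3))                 # only x's shape matters here
y = np.arange(9).reshape(1, 3, 1, 3)
pads = [(0, xd - yd) for xd, yd in zip(x.shape, y.shape)]
out = np.pad(y, pads, mode='constant', constant_values=-1)
print(out.shape)                           # (2, 3, 2, 3)
```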
def label_smooth(label,
                 prior_dist=None,
                 epsilon=0.1,
@@ -5187,7 +5315,7 @@ def random_crop(x, shape, seed=None):
     dtype = x.dtype
     out = helper.create_tmp_variable(dtype)
     if seed is None:
-        seed = random.randint(-65536, 65535)
+        seed = np.random.randint(-65536, 65536)
     op_attrs = {"shape": shape}
     if isinstance(seed, int):
         op_attrs["startup_seed"] = seed
@@ -5389,7 +5517,7 @@ def crop(x, shape=None, offsets=None, name=None):
     helper = LayerHelper('crop', **locals())
     if not (isinstance(shape, list) or isinstance(shape, tuple) or \
             isinstance(shape, Variable)):
         raise ValueError("The shape should be a list, tuple or Variable.")
     if offsets is None:
@@ -5501,7 +5629,7 @@ def prelu(x, mode, param_attr=None, name=None):
         channel: elements in a channel share the same weight
         element: each element has its own weight
     name(str|None): A name for this layer(optional). If set None, the layer
         will be named automatically.
     Returns:
         Variable: The output tensor with the same shape as input.
@@ -5615,23 +5743,23 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
     Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the
     :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
     .. math::
         y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n))
     Args:
         x (Variable): Input tensor of sequence_mask layer,
                       whose elements are integers less than :code:`maxlen`.
         maxlen (int|None): Maximum length of the sequence. If :code:`maxlen`
                            is None, it would be replaced with :math:`max(x)`.
         dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output.
         name (str|None): A name for this layer(optional). If set None, the
                          layer will be named automatically.
     Returns:
         Variable: The output sequence mask.
     """
     helper = LayerHelper('sequence_mask', **locals())
@@ -5656,23 +5784,23 @@ def stack(x, axis=0):
     **Stack Layer**
     This layer stacks all of the input :code:`x` along the given axis.
     Input :code:`x` can be a single variable, a :code:`list` of variables,
     or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or
     :code:`tuple`, the shapes of all these variables must be the same.
     Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`,
     the shape of the output variable would be
     :math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`.
     If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
     If :code:`axis` is None, it would be replaced with 0.
     Args:
         x (Variable|list(Variable)|tuple(Variable)): Input variables.
         axis (int|None): The axis along which all inputs are stacked.
     Returns:
         Variable: The stacked variable.
     """
     helper = LayerHelper('stack', **locals())
@@ -5686,3 +5814,44 @@ def stack(x, axis=0):
         type='stack', inputs={'X': x}, outputs={'Y': out},
         attrs={'axis': axis})
     return out
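The shape rule in the stack docstring above corresponds directly to numpy.stack; a quick illustrative check:

```python
import numpy as np

xs = [np.zeros((4, 5)) for _ in range(3)]
print(np.stack(xs, axis=1).shape)  # (4, 3, 5): len(xs) inserted at `axis`
```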
def unstack(x, axis=0, num=None):
"""
**UnStack Layer**
This layer unstacks input :code:`x` into several tensors along axis.
If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`.
If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`,
and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is
raised.
Args:
x (Variable): Input variable.
axis (int): The axis along which the input is unstacked.
num (int|None): The number of output variables.
Returns:
list(Variable): The unstacked variables.
"""
helper = LayerHelper('unstack', **locals())
if num is None:
if axis is None or x.shape[axis] <= 0:
raise ValueError('unknown unstack number')
else:
num = x.shape[axis]
outs = []
for _ in range(num):
outs.append(helper.create_tmp_variable(x.dtype))
helper.append_op(
type='unstack',
inputs={'X': [x]},
outputs={'Y': outs},
attrs={'axis': axis,
'num': num})
return outs
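unstack is the inverse of stack along the same axis; a NumPy sketch of the semantics (illustrative, not the fluid op):

```python
import numpy as np

x = np.arange(24).reshape(2, 3, 4)
ys = [np.squeeze(s, axis=1) for s in np.split(x, x.shape[1], axis=1)]
print(len(ys), ys[0].shape)                     # 3 (2, 4)
assert np.array_equal(np.stack(ys, axis=1), x)  # stack undoes unstack
```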
@@ -46,10 +46,12 @@ class Optimizer(object):
     def __init__(self,
                  learning_rate,
                  regularization=None,
-                 LARS_weight_decay=0.0):
+                 LARS_weight_decay=0.0,
+                 name=None):
         if not isinstance(learning_rate, float) and \
                 not isinstance(learning_rate, framework.Variable):
             raise TypeError("learning rate should be float or Variable")
+        self._name = name
         self.regularization = regularization
         self._learning_rate = learning_rate
         # the learning rate type should be inferenced from loss
@@ -153,6 +155,8 @@ class Optimizer(object):
             dtype: data type of the accumulator variable
             fill_value: value to initialize the accumulator variable
         """
+        if self._name is not None:
+            name = self._name + "_" + name
         if (name in self._accumulators and
                 param.name in self._accumulators[name]):
             raise Exception("Accumulator {} already exists for parameter {}".
@@ -181,6 +185,8 @@ class Optimizer(object):
         Returns:
             accumulator variable for the parameter
         """
+        if self._name is not None:
+            name = self._name + "_" + name
         if (name not in self._accumulators or
                 param.name not in self._accumulators[name]):
             raise Exception("Accumulator {} does not exist for parameter {}".
...
@@ -134,7 +134,7 @@ class SE_ResNeXt():
                       size=class_dim,
                       act='softmax',
                       param_attr=fluid.ParamAttr(
-                          initializer=fluid.initializer.Constant(value=0.2)))
+                          initializer=fluid.initializer.Constant(value=0.05)))
         return out

     def shortcut(self, input, ch_out, stride):
@@ -184,7 +184,7 @@ class SE_ResNeXt():
             act=None,
             # avoid pserver CPU init differs from GPU
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.2)),
+                initializer=fluid.initializer.Constant(value=0.05)),
             bias_attr=False)
         return fluid.layers.batch_norm(input=conv, act=act)
@@ -192,13 +192,19 @@ class SE_ResNeXt():
         pool = fluid.layers.pool2d(
             input=input, pool_size=0, pool_type='avg', global_pooling=True)
         stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
-        squeeze = fluid.layers.fc(input=pool,
-                                  size=num_channels // reduction_ratio,
-                                  act='relu')
+        squeeze = fluid.layers.fc(
+            input=pool,
+            size=num_channels // reduction_ratio,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.05)),
+            act='relu')
         stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
-        excitation = fluid.layers.fc(input=squeeze,
-                                     size=num_channels,
-                                     act='sigmoid')
+        excitation = fluid.layers.fc(
+            input=squeeze,
+            size=num_channels,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.05)),
+            act='sigmoid')
         scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
         return scale
...
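For reference, the squeeze-and-excitation branch reworked above reduces each channel to one number by global average pooling, pushes it through a bottleneck fc (relu) and a restoring fc (sigmoid), and rescales the input channels by the result. A toy numpy recomputation, with constant 0.05 weights standing in for the new initializer (all shapes here are illustrative):

    import numpy as np

    n, c, h, w = 2, 8, 4, 4
    reduction_ratio = 4
    x = np.random.rand(n, c, h, w).astype('float32')
    w1 = np.full((c, c // reduction_ratio), 0.05, dtype='float32')  # squeeze fc weight
    w2 = np.full((c // reduction_ratio, c), 0.05, dtype='float32')  # excitation fc weight

    pool = x.mean(axis=(2, 3))                           # global avg pool -> (n, c)
    squeeze = np.maximum(pool.dot(w1), 0.0)              # fc + relu
    excitation = 1.0 / (1.0 + np.exp(-squeeze.dot(w2)))  # fc + sigmoid -> (n, c)
    scale = x * excitation[:, :, None, None]             # per-channel rescale
    print(scale.shape)  # (2, 8, 4, 4)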
@@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
             dtype='float32',
             is_sparse=IS_SPARSE,
             param_attr=fluid.ParamAttr(
-                name='shared_w', initializer=fluid.initializer.Constant()))
+                name='shared_w',
+                initializer=fluid.initializer.Constant(value=0.1)))
         embed_second = fluid.layers.embedding(
             input=words[1],
             size=[dict_size, EMBED_SIZE],
             dtype='float32',
             is_sparse=IS_SPARSE,
             param_attr=fluid.ParamAttr(
-                name='shared_w', initializer=fluid.initializer.Constant()))
+                name='shared_w',
+                initializer=fluid.initializer.Constant(value=0.1)))
         embed_third = fluid.layers.embedding(
             input=words[2],
             size=[dict_size, EMBED_SIZE],
             dtype='float32',
             is_sparse=IS_SPARSE,
             param_attr=fluid.ParamAttr(
-                name='shared_w', initializer=fluid.initializer.Constant()))
+                name='shared_w',
+                initializer=fluid.initializer.Constant(value=0.1)))
         embed_forth = fluid.layers.embedding(
             input=words[3],
             size=[dict_size, EMBED_SIZE],
             dtype='float32',
             is_sparse=IS_SPARSE,
             param_attr=fluid.ParamAttr(
-                name='shared_w', initializer=fluid.initializer.Constant()))
+                name='shared_w',
+                initializer=fluid.initializer.Constant(value=0.1)))
         concat_embed = fluid.layers.concat(
             input=[embed_first, embed_second, embed_third, embed_forth],
@@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
             size=HIDDEN_SIZE,
             act='sigmoid',
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant()))
+                initializer=fluid.initializer.Constant(value=0.1)))
         predict_word = fluid.layers.fc(
             input=hidden1,
             size=dict_size,
             act='softmax',
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant()))
+                initializer=fluid.initializer.Constant(value=0.1)))
         cost = fluid.layers.cross_entropy(
             input=predict_word, label=words[4])
         avg_cost = fluid.layers.mean(cost)
...
@@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase):
         main.global_block().append_op(
             type="fetch_barrier",
             inputs={},
-            outputs={},
+            outputs={"Out": []},
             attrs={
                 "endpoints": ["127.0.0.1:{0}".format(port)],
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
...
@@ -22,7 +22,7 @@ class TestDistSeResneXt2x2(TestDistBase):
         self._sync_mode = True

     def test_se_resnext(self):
-        self.check_with_place("dist_word2vec.py", delta=1e-7)
+        self.check_with_place("dist_word2vec.py", delta=1e-4)

 class TestDistSeResneXt2x2Async(TestDistBase):
...
@@ -20,41 +20,50 @@ import math
 from op_test import OpTest

-def quantize_max_abs(x, num_bits):
-    range = math.pow(2, num_bits) - 1
+def quantize_max_abs(x, max_range):
     scale = np.max(np.abs(x).flatten())
-    y = np.round(x / scale * range)
+    y = np.round(x / scale * max_range)
     return y, scale

-def dequantize_max_abs(x, num_bits, scale):
-    range = math.pow(2, num_bits) - 1
-    y = (scale / range) * x
+def dequantize_max_abs(x, scale, max_range):
+    y = (scale / max_range) * x
     return y

 class TestFakeDequantizeMaxAbsOp(OpTest):
     def set_args(self):
         self.num_bits = 8
+        self.max_range = math.pow(2, self.num_bits - 1) - 1
+        self.data_type = "float32"

     def setUp(self):
         self.set_args()
         self.op_type = "fake_dequantize_max_abs"
-        x = np.random.randn(31, 65).astype("float32")
-        yq, scale = quantize_max_abs(x, self.num_bits)
-        ydq = dequantize_max_abs(yq, self.num_bits, scale)
-        self.inputs = {'X': yq}
-        self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)}
+        x = np.random.randn(31, 65).astype(self.data_type)
+        yq, scale = quantize_max_abs(x, self.max_range)
+        ydq = dequantize_max_abs(yq, scale, self.max_range)
+        self.inputs = {'X': yq, 'Scale': np.array(scale).astype(self.data_type)}
+        self.attrs = {'max_range': self.max_range}
         self.outputs = {'Out': ydq}

     def test_check_output(self):
         self.check_output()

-class TestFakeDequantizeMaxAbsOp5Bits(OpTest):
+class TestFakeDequantizeMaxAbsOpDouble(TestFakeDequantizeMaxAbsOp):
+    def set_args(self):
+        self.num_bits = 8
+        self.max_range = math.pow(2, self.num_bits - 1) - 1
+        self.data_type = "float64"

+class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp):
     def set_args(self):
         self.num_bits = 5
+        self.max_range = math.pow(2, self.num_bits - 1) - 1
+        self.data_type = "float32"

 if __name__ == "__main__":
...
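The refactor above moves the `2^(num_bits - 1) - 1` computation out of the helpers: the op now consumes a precomputed `max_range` plus a `Scale` input instead of `num_bits`/`scale` attributes. A quick standalone round-trip check of the same math:

    import math
    import numpy as np

    num_bits = 8
    max_range = math.pow(2, num_bits - 1) - 1   # 127.0
    x = np.random.randn(4, 5).astype('float32')

    scale = np.max(np.abs(x))
    yq = np.round(x / scale * max_range)        # quantize_max_abs
    ydq = (scale / max_range) * yq              # dequantize_max_abs
    # rounding error is at most half a quantization step
    print(np.max(np.abs(x - ydq)) <= 0.5 * scale / max_range + 1e-6)  # True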
@@ -43,13 +43,13 @@ def fusion_lstm(
                        act_cell, act_cand)

-class TestLstmOp(OpTest):
-    def set_argument(self):
-        self.lod = [[2, 3, 2]]
+class TestFusionLSTMOp(OpTest):
+    def set_conf(self):
+        pass

     def setUp(self):
         self.op_type = 'fusion_lstm'
-        self.lod = [[2, 3, 2]]
+        self.lod = [[2, 3, 5, 4]]
         self.M = 8
         self.D = 16
         self.has_initial_state = False
@@ -58,33 +58,33 @@ class TestLstmOp(OpTest):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
         self.use_peepholes = False
-        self.set_argument()
+        self.set_conf()

         T = sum(self.lod[0])
         bs = len(self.lod[0])

-        x = np.random.normal(size=(T, self.M)).astype('float64')
+        x = np.random.normal(size=(T, self.M)).astype('float32')
         if self.has_initial_state:
-            h0 = np.random.normal(size=(bs, self.D)).astype('float64')
-            c0 = np.random.normal(size=(bs, self.D)).astype('float64')
+            h0 = np.random.normal(size=(bs, self.D)).astype('float32')
+            c0 = np.random.normal(size=(bs, self.D)).astype('float32')
         else:
-            h0 = np.zeros((bs, self.D)).astype('float64')
-            c0 = np.zeros((bs, self.D)).astype('float64')
+            h0 = np.zeros((bs, self.D)).astype('float32')
+            c0 = np.zeros((bs, self.D)).astype('float32')

-        wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
+        wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32')
         if self.use_peepholes:
-            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float32')
         else:
-            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float32')

         w_b = np.copy(b[:, 0:4 * self.D])
         w_c = b[:, 4 * self.D:] if self.use_peepholes else None

         # this is the weight of fc
-        wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64')
+        wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32')
         # this is the bias of fc
         # and it should be manually added into the bias of this fusion LSTM
-        bx = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+        bx = np.random.normal(size=(1, 4 * self.D)).astype('float32')
         b[0, 0:4 * self.D] += bx[0, :]
         h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
                            self.is_reverse, ACTIVATION[self.act_gate],
@@ -114,35 +114,45 @@ class TestLstmOp(OpTest):
         }

     def test_check_output(self):
-        self.check_output(atol=1e-8)
+        self.check_output()

-class TestLstmOpInitReverse(TestLstmOp):
-    def set_argument(self):
+class TestFusionLSTMOpInit(TestFusionLSTMOp):
+    def set_conf(self):
+        self.has_initial_state = True

+class TestFusionLSTMOpReverse(TestFusionLSTMOp):
+    def set_conf(self):
+        self.is_reverse = True

+class TestFusionLSTMOpInitReverse(TestFusionLSTMOp):
+    def set_conf(self):
         self.has_initial_state = True
         self.is_reverse = True

-class TestLstmOpMD1(TestLstmOp):
-    def set_argument(self):
+class TestFusionLSTMOpMD1(TestFusionLSTMOp):
+    def set_conf(self):
         self.M = 36
         self.D = 8

-class TestLstmOpMD2(TestLstmOp):
-    def set_argument(self):
+class TestFusionLSTMOpMD2(TestFusionLSTMOp):
+    def set_conf(self):
         self.M = 8
         self.D = 8

-class TestLstmOpMD3(TestLstmOp):
-    def set_argument(self):
+class TestFusionLSTMOpMD3(TestFusionLSTMOp):
+    def set_conf(self):
         self.M = 15
         self.D = 3

-class TestLstmOpBS1(TestLstmOp):
-    def set_argument(self):
+class TestFusionLSTMOpBS1(TestFusionLSTMOp):
+    def set_conf(self):
         self.lod = [[3]]
         self.D = 16
...
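One nonobvious detail the rewrite above preserves: with peepholes enabled the bias blob `b` packs the `4 * D` gate biases followed by `3 * D` peephole weights, and the separate fc bias `bx` is folded only into the gate slice. A shape-only sketch of that packing (D is arbitrary):

    import numpy as np

    D = 3
    b = np.random.normal(size=(1, 7 * D)).astype('float32')   # gate biases ++ peepholes
    bx = np.random.normal(size=(1, 4 * D)).astype('float32')  # fc bias

    w_b = np.copy(b[:, 0:4 * D])   # gate biases (copied before the fold)
    w_c = b[:, 4 * D:]             # peephole weights, shape (1, 3 * D)
    b[0, 0:4 * D] += bx[0, :]      # fold the fc bias into the gate-bias slice
    print(w_b.shape, w_c.shape, b.shape)  # (1, 12) (1, 9) (1, 21)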
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from test_fusion_lstm_op import fc, ACTIVATION
def fusion_seqexpand_concat_fc(xs, lod, w, b, fc_act):
    # reference implementation: expand each batch-level input (one row per
    # sequence) to full sequence length, concat all inputs feature-wise,
    # then apply a single fc plus activation
T = sum(lod[0])
N = len(lod[0])
num_inputs = len(xs)
D = w.shape[1]
expanded_inputs = [xs[0]]
for i in range(num_inputs - 1):
x = xs[i + 1]
assert x.shape[0] == N
expanded = np.repeat(x, lod[0], axis=0)
assert expanded.shape[0] == T
assert expanded.shape[1] == x.shape[1]
expanded_inputs.append(expanded)
fc_input = np.concatenate(expanded_inputs, axis=1)
assert fc_input.shape[0] == T
assert fc_input.shape[1] == w.shape[0]
fc_out = fc(fc_input, w, b)
fc_out = fc_act(fc_out)
assert fc_out.shape[0] == T
assert fc_out.shape[1] == D
return fc_out
class TestFusionSeqExpandConcatFCOp(OpTest):
def set_conf(self):
pass
def setUp(self):
self.op_type = 'fusion_seqexpand_concat_fc'
self.lod = [[3, 5, 8, 2]]
self.inputs_M = [15, 10, 10]
self.D = 20
self.with_bias = True
self.fc_act = 'relu'
self.set_conf()
T = sum(self.lod[0])
bs = len(self.lod[0])
num_inputs = len(self.inputs_M)
x0 = np.random.normal(size=(T, self.inputs_M[0])).astype('float32')
xs = [x0]
for i in range(num_inputs - 1):
xi = np.random.normal(size=(bs,
self.inputs_M[i + 1])).astype('float32')
xs.append(xi)
# fc weight and bias
w = np.random.normal(size=(sum(self.inputs_M),
self.D)).astype('float32')
b = np.random.normal(size=(
1, self.D)).astype('float32') if self.with_bias else np.zeros(
(1, self.D)).astype('float32')
out = fusion_seqexpand_concat_fc(xs, self.lod, w, b,
ACTIVATION[self.fc_act])
self.inputs = {'X': [('x0', (x0, self.lod))], 'FCWeight': w}
normal_lod = [[1] * bs]
for i in range(num_inputs - 1):
self.inputs['X'].append(('x%d' % (i + 1), (xs[i + 1], normal_lod)))
if self.with_bias:
self.inputs['FCBias'] = b
self.outputs = {'Out': (out, self.lod)}
self.attrs = {'fc_activation': self.fc_act}
def test_check_output(self):
self.check_output()
class TestFusionSECFCOpNonBias(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.with_bias = False
class TestFusionSECFCOpNonAct(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.fc_act = 'identity'
class TestFusionSECFCOpMD1(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.inputs_M = [3, 4, 2, 1, 5]
self.D = 8
class TestFusionSECFCOpMD2(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[5, 6]]
self.inputs_M = [1, 1]
class TestFusionSECFCOpBS1_1(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[1]]
self.inputs_M = [3, 4, 2]
class TestFusionSECFCOpBS1_2(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[1]]
self.inputs_M = [3, 4]
class TestFusionSECFCOpBS1_3(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[5]]
self.inputs_M = [6, 3]
if __name__ == '__main__':
unittest.main()
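The key step in this reference implementation is the `np.repeat` expansion: every input after the first holds one row per sequence, and each row is tiled to that sequence's length so it can be concatenated with the sequence-level input before the fc. A small standalone illustration with made-up shapes:

    import numpy as np

    lod = [[3, 5]]                                       # two sequences, lengths 3 and 5
    x_seq = np.random.rand(8, 2).astype('float32')       # sequence-level input, T = 8 rows
    x_bat = np.array([[10.0], [20.0]], dtype='float32')  # batch-level input, one row per sequence

    expanded = np.repeat(x_bat, lod[0], axis=0)          # (8, 1): 10.0 x3, then 20.0 x5
    fc_input = np.concatenate([x_seq, expanded], axis=1)
    print(fc_input.shape)  # (8, 3) -- ready for an fc weight of shape (3, D)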
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestPadOp(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = "pad_constant_like"
self.inputs = {
'X': np.random.random(self.x_shape).astype("float32"),
'Y': np.random.random(self.y_shape).astype("float32")
}
self.attrs = {}
self.attrs['pad_value'] = self.pad_value
self.outputs = {
'Out': np.pad(self.inputs['Y'],
self.paddings,
mode='constant',
constant_values=self.pad_value)
}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['Y'], 'Out', max_relative_error=0.006)
def initTestCase(self):
self.x_shape = (16, 16)
self.y_shape = (3, 16)
self.pad_value = 0.1
self.paddings = [(0, 13), (0, 0)]
class TestCase1(TestPadOp):
def initTestCase(self):
self.x_shape = (4, 3, 4, 4)
self.y_shape = (2, 3, 4, 4)
self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)]
self.pad_value = 0.5
class TestCase2(TestPadOp):
def initTestCase(self):
self.x_shape = (4, 3, 4, 4)
self.y_shape = (2, 3, 2, 4)
self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)]
self.pad_value = 0.5
if __name__ == '__main__':
unittest.main()
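These cases all follow the same pattern: `pad_constant_like` pads `Y` at the high end of each dimension until it matches `X`'s shape, which is why the paddings above are exactly the per-dimension shape differences. The numpy equivalent for the first case:

    import numpy as np

    x_shape, y_shape = (16, 16), (3, 16)
    y = np.random.random(y_shape).astype('float32')
    paddings = [(0, xd - yd) for xd, yd in zip(x_shape, y_shape)]  # [(0, 13), (0, 0)]
    out = np.pad(y, paddings, mode='constant', constant_values=0.1)
    print(out.shape)  # (16, 16) -- same shape as X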
@@ -17,6 +17,8 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator

 class TestScaleOp(OpTest):
@@ -33,5 +35,57 @@ class TestScaleOp(OpTest):
         self.check_grad(['X'], 'Out')
class TestScaleOpSelectedRows(unittest.TestCase):
def check_with_place(self, place, in_name, out_name):
scope = core.Scope()
# create and initialize the input SelectedRows variable
in_height = 10
in_rows = [0, 4, 7]
in_row_numel = 12
scale = 2.0
in_selected_rows = scope.var(in_name).get_selected_rows()
in_selected_rows.set_height(in_height)
in_selected_rows.set_rows(in_rows)
in_array = np.random.random(
(len(in_rows), in_row_numel)).astype("float32")
in_tensor = in_selected_rows.get_tensor()
in_tensor.set(in_array, place)
# create and initialize the output SelectedRows variable
out_selected_rows = scope.var(out_name).get_selected_rows()
out_tensor = out_selected_rows.get_tensor()
out_tensor._set_dims(in_tensor._get_dims())
# create and run the scale operator
scale_op = Operator("scale", X=in_name, Out=out_name, scale=scale)
scale_op.run(scope, place)
# get and compare result
out_height = out_selected_rows.height()
out_rows = out_selected_rows.rows()
result_array = np.array(out_tensor)
assert (in_array * scale == result_array).all()
assert in_height == out_height
assert in_rows == out_rows
def test_scale_selected_rows(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place, 'in', 'out')
def test_scale_selected_rows_inplace(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place, 'in', 'in')
if __name__ == "__main__":
    unittest.main()
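For readers unfamiliar with SelectedRows: it stores a dense value tensor of shape `(len(rows), row_numel)` plus the row indices into a logical `(height, row_numel)` tensor, so scaling it only touches the values while rows and height pass through. A plain-numpy model of what the test asserts:

    import numpy as np

    height, rows, row_numel, scale = 10, [0, 4, 7], 12, 2.0
    values = np.random.random((len(rows), row_numel)).astype('float32')
    scaled = values * scale            # what the scale op does to the value tensor

    # densify both and check they agree up to the factor `scale`
    dense = np.zeros((height, row_numel), dtype='float32')
    dense[rows] = values
    dense_scaled = np.zeros((height, row_numel), dtype='float32')
    dense_scaled[rows] = scaled
    assert np.allclose(dense * scale, dense_scaled)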
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestSequencePadOp(OpTest):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = -1
self.dtype = 'float32'
def set_data(self):
x_data = np.random.uniform(0.1, 0.5, self.x_shape).astype(self.dtype)
pad_value_data = np.array(self.pad_value).astype(self.dtype)
self.inputs = {
'X': (x_data, self.x_len_lod),
'PadValue': pad_value_data
}
self.attrs = {'padded_length': self.padded_length}
def compute(self):
# get padded length
padded_length = self.padded_length
x_len_lod_0 = self.x_len_lod[0]
if padded_length == -1:
max_seq_len = 0
for l in x_len_lod_0:
max_seq_len = max(max_seq_len, l)
padded_length = max_seq_len
# do padding
x_data = self.inputs['X'][0]
pad_value_data = self.inputs['PadValue']
if pad_value_data.shape == (1, ):
pad_value_data = np.broadcast_to(
pad_value_data, shape=x_data.shape[1:])
padded_sequences = []
start_idx = 0
for l in x_len_lod_0:
end_idx = start_idx + l
seq = x_data[start_idx:end_idx]
to_pad_len = padded_length - l
for _ in range(to_pad_len):
seq = np.append(seq, pad_value_data[np.newaxis, :], axis=0)
padded_sequences.append(seq)
start_idx = end_idx
out_data = np.array(padded_sequences)
self.outputs = {'Out': out_data}
def setUp(self):
self.op_type = 'sequence_pad'
self.set_attr()
self.set_data()
self.compute()
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestSequencePadOp2(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0, 2.0, 3.0, 4.0]
self.padded_length = -1
self.dtype = 'float32'
class TestSequencePadOp3(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = 7
self.dtype = 'float32'
class TestSequencePadOp4(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0, 2.0, 3.0, 4.0]
self.padded_length = 7
self.dtype = 'float32'
class TestSequencePadOp5(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 2, 2]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = -1
self.dtype = 'float32'
class TestSequencePadOp6(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 2, 2]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [[1.0, 2.0], [3.0, 4.0]]
self.padded_length = -1
self.dtype = 'float32'
class TestSequencePadOp7(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 2, 2]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = 7
self.dtype = 'float32'
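A standalone numpy walk-through of the first case above (lod `[[2, 3, 4, 3]]`, scalar pad value, `padded_length = -1`, i.e. pad every sequence to the longest length):

    import numpy as np

    lengths = [2, 3, 4, 3]
    x = np.random.uniform(0.1, 0.5, (sum(lengths), 4)).astype('float32')
    pad_row = np.full((4,), 1.0, dtype='float32')  # scalar pad value broadcast to a row

    padded_length = max(lengths)                   # 4, since padded_length is -1
    out, start = [], 0
    for l in lengths:
        seq = x[start:start + l]
        pad = np.tile(pad_row, (padded_length - l, 1))
        out.append(np.vstack([seq, pad]))
        start += l
    out = np.stack(out)
    print(out.shape)  # (4, 4, 4): (num_sequences, padded_length, feature_dim)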
@@ -59,6 +59,27 @@ class TestTensor(unittest.TestCase):
         self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
         self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
def test_int8_tensor(self):
scope = core.Scope()
var = scope.var("int8_tensor")
cpu_tensor = var.get_tensor()
tensor_array = numpy.random.randint(
-127, high=128, size=[100, 200], dtype=numpy.int8)
place = core.CPUPlace()
cpu_tensor.set(tensor_array, place)
cpu_tensor_array_2 = numpy.array(cpu_tensor)
self.assertAlmostEqual(cpu_tensor_array_2.all(), tensor_array.all())
if core.is_compiled_with_cuda():
cuda_tensor = var.get_tensor()
tensor_array = numpy.random.randint(
-127, high=128, size=[100, 200], dtype=numpy.int8)
place = core.CUDAPlace(0)
cuda_tensor.set(tensor_array, place)
cuda_tensor_array_2 = numpy.array(cuda_tensor)
self.assertAlmostEqual(cuda_tensor_array_2.all(),
tensor_array.all())
    def test_int_lod_tensor(self):
        place = core.CPUPlace()
        scope = core.Scope()
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from op_test import OpTest
import numpy as np
import unittest
class TestUnStackOpBase(OpTest):
def initDefaultParameters(self):
self.input_dim = (5, 6, 7)
self.axis = 0
self.dtype = 'float32'
def initParameters(self):
pass
def get_y_names(self):
y_names = []
for i in range(self.input_dim[self.axis]):
y_names.append('y{}'.format(i))
return y_names
def setUp(self):
self.initDefaultParameters()
self.initParameters()
self.op_type = 'unstack'
self.x = np.random.random(size=self.input_dim).astype(self.dtype)
outs = np.split(self.x, self.input_dim[self.axis], self.axis)
new_shape = list(self.input_dim)
del new_shape[self.axis]
y_names = self.get_y_names()
tmp = []
for i in range(self.input_dim[self.axis]):
tmp.append((y_names[i], np.reshape(outs[i], new_shape)))
self.inputs = {'X': self.x}
self.outputs = {'Y': tmp}
self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad('X', self.get_y_names())
class TestStackOp3(TestUnStackOpBase):
def initParameters(self):
self.axis = -1
class TestStackOp4(TestUnStackOpBase):
def initParameters(self):
self.axis = -3
class TestStackOp5(TestUnStackOpBase):
def initParameters(self):
self.axis = 1
class TestStackOp6(TestUnStackOpBase):
def initParameters(self):
self.axis = 2
if __name__ == '__main__':
unittest.main()
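The op under test is the inverse of stack: split along `axis`, then squeeze that axis away, which is exactly how the expected outputs above are built. A quick numpy check of the round trip:

    import numpy as np

    x = np.random.random((5, 6, 7)).astype('float32')
    axis = 1
    ys = [np.squeeze(p, axis=axis) for p in np.split(x, x.shape[axis], axis=axis)]
    print(len(ys), ys[0].shape)                      # 6 (5, 7)
    assert np.allclose(np.stack(ys, axis=axis), x)   # stack undoes unstack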
@@ -31,7 +31,8 @@ class TestVariable(unittest.TestCase):
         self.assertEqual(DT.INT16, convert("int16"))
         self.assertEqual(DT.INT64, convert("int64"))
         self.assertEqual(DT.BOOL, convert("bool"))
-        self.assertRaises(ValueError, lambda: convert("int8"))
+        self.assertEqual(DT.INT8, convert("int8"))
+        self.assertEqual(DT.UINT8, convert("uint8"))

     def test_var(self):
         b = default_main_program().current_block()
...