diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 78be0749091fb0a617f9fb172cc92b33560a3552..dc6730662f0b888f1981ac9c086320acc52d0a50 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -52,9 +52,8 @@ ExternalProject_Add( extern_anakin ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLML_PROJECT} - # Anakin codes error on Intel(R) Xeon(R) Gold 5117 CPU, temporary do not compile avx512 related code. - GIT_REPOSITORY "https://github.com/luotao1/Anakin" - GIT_TAG "211d1fc5d813d70c0c14072f9083cf25f40940ea" + GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin" + GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2" PREFIX ${ANAKIN_SOURCE_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DUSE_GPU_PLACE=YES diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 22f1c5bbb547874c083917c67f2af0d38f1db8c0..601fa9cc0abaf72369bb96eae361a235e054e200 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -113,6 +113,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) +paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) @@ -146,6 +147,7 @@ paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', ' paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)) paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)) +paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)) paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)) paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) @@ -165,6 +167,7 @@ paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 
'name'], varargs=None, ke paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)) paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) @@ -297,6 +300,7 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', ' paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) +paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -379,7 +383,7 @@ paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, a paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]] paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]] -paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None +paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int] diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 2c62d4ed6b0e61d4a36b61cda40fa539285ccb14..0668ff43c8192f53ff7e05abaeb575e2b78b1de4 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -107,11 +107,11 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) endif() if (NOT WIN32) diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 1a9ce746ea840bc088d222cc4e9bc05159d64734..28f3da88fa18021f6b71e458fdb467be86d4dbf0 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -64,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() { RegType(size_t, proto::VarType::SIZE_T); RegType(int16_t, proto::VarType::INT16); RegType(uint8_t, proto::VarType::UINT8); + RegType(int8_t, proto::VarType::INT8); #undef RegType return retv; diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 
f8c72ffc8964e64a10cff04f322f40b39b2fe263..84691a2059124960a3213802fec0863f8abe6df7 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -54,6 +54,9 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { case proto::VarType::INT16: visitor.template operator()(); break; + case proto::VarType::INT8: + visitor.template operator()(); + break; default: PADDLE_THROW("Not supported %d", type); } diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index bc61b0eacbf6c8a1fd4487ad5a442fed1b536345..7722c9401e0e7c071adb7bee9b35306431bb7a11 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -754,17 +754,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, node->Op()->Type()); CreateComputationalOp(result, node, op_dev_id); - if (node->Op()->Type() == "concat") { - ConnectOp(result, result->Get(kGraphOps).back().get(), - "fetch_barrier"); +} + +void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { + auto *op_handle = result->Get(kGraphOps).back().get(); + for (ir::Node *input : node->inputs) { + VarHandle *var = nullptr; + for (int place_offset = 0; place_offset < num_places; ++place_offset) { + auto &var_holders = result->Get(kGraphVars)[place_offset]; + auto &var_holder = var_holders[input->Name()]; + if (!var_holder.empty()) { + var = var_holder.rbegin()->get(); + op_handle->AddInput(var); + } + } } } // Create RPC related op handles that connects its in ops and out ops. void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { - // FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode - // put them into transpiler. int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. @@ -799,8 +808,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - // FIXME(typhoonzero): assume each recv op output one param - // Use the same place as send. if (recv_param_grad.size() == 2U) { op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] @@ -814,34 +821,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, .emplace(varname, op_dev_id); } } else { - // send_barrier and fetch_barrier op can be scheduled on device 0 + // send_barrier, fetch_barrier will run on place 0; op_dev_id = 0; } PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", node->Op()->Type()); - result->Get(kGraphOps).emplace_back(new RPCOpHandle( result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], node->Op()->Type(), places_[op_dev_id])); - // TODO(panyx0718): This might not be needed anymore. - if (node->Op()->Type() == "send_barrier") { - ConnectOp(result, result->Get(kGraphOps).back().get(), "send"); - } else if (node->Op()->Type() == "recv") { - ConnectOp(result, result->Get(kGraphOps).back().get(), - "send_barrier"); - } else if (node->Op()->Type() == "fetch_barrier") { - ConnectOp(result, result->Get(kGraphOps).back().get(), "recv"); - } else if (node->Op()->Type() == "send") { - // do nothing + if (node->Op()->Type() == "send") { + CreateOpHandleIOs(result, node, op_dev_id); } else { - PADDLE_THROW( - "rpc op should be in [" - "send, send_barrier. 
recv, fetch_barrier]"); - } + // send_barrier, recv, fetch_barrier's inputs are deps var, get them from + // all places + auto p = places_[op_dev_id]; + auto *op_handle = result->Get(kGraphOps).back().get(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); - CreateOpHandleIOs(result, node, op_dev_id); + SetOpInputsAllPlaces(result, node, places_.size()); + for (ir::Node *output : node->outputs) { + int outvar_dev_id = op_dev_id; + if (node->Op()->Type() == "fetch_barrier") { + outvar_dev_id = GetVarDeviceID(*result, output->Name()); + PADDLE_ENFORCE_NE(outvar_dev_id, -1); + } + p = places_[outvar_dev_id]; + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id); + } + } } bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 2cf14bd371831ab682166f4256d6966b5ab278c8..c6588435819a982166cf2d2368a82b4402fdc2bc 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -107,6 +107,7 @@ message VarType { // Tensor is used in C++. SIZE_T = 19; UINT8 = 20; + INT8 = 21; // Other types that may need additional descriptions LOD_TENSOR = 7; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index da0955a9a000e0d0bff3fe9d0bc3bd25171be3d2..bfc649017f19d67660bd11d590134cf56772bb27 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -3,14 +3,18 @@ cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) +cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) -cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits) -cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter) +cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) +cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector) +cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass) - +cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) +cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) -cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter) -cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto) +cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS 
graph_pattern_detector) +cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..2876de88f174b1fa4ce0eacb8687e15e723bf1fc --- /dev/null +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -0,0 +1,273 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/api/helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +struct Param { + std::string X = "concat_0.tmp_0"; + std::string C0 = "cell_init"; + std::string H0 = "hidden_init"; + std::string AttentionWeight = "attention_fc.w_0"; + std::string AttentionBias = "attention_fc.b_0"; + std::string AttentionScalar = "attention_output.w_0"; + std::string AttentionScalarBias = "attention_output.b_0"; + std::string LSTMWeight = "attention_w.new"; + std::string LSTMBias = "attention_b.new"; + std::string Hidden = "array_to_lod_tensor_0.tmp_0"; + std::string Cell = "at.cell.new"; + std::string AttentionedX = "at.x.new"; + std::string AttentionFCOut = "at.fc.new"; + std::string LSTMX = "at.lstmx.new"; + std::string LSTMOUT = "at.lstmout.new"; +}; + +void PrepareParameters(Graph* graph, const Param& param); + +void FindWhileOp(Graph* graph) { + GraphPatternDetector gpd; + std::unordered_set fused_external_ops( + {35, 36, 37, 38, 43, 44, 49, 45, 46, 47, 41, 42, 53, 54, 48, + 57, 55, 56, 52, 74, 80, 77, 78, 79, 50, 77, 39, 40, 51}); + + gpd.mutable_pattern()->NewNode( + [&](Node* n) { return fused_external_ops.count(n->id()); }, "while"); + + if (!graph->Has(kGraphvizMarkedNodeAttr)) { + graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t); + } + auto& marked_nodes = + graph->Get(kGraphvizMarkedNodeAttr); + + auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + auto* while_pat_node = gpd.pattern().RetriveNode("while"); + auto* while_node = subgraph.at(while_pat_node); + marked_nodes.insert(while_node); + }; + gpd(graph, handle); + + Param param; + // Add AttentionLSTM node + OpDesc op_desc; + op_desc.SetType("attention_lstm"); + +#define OP_SET_IN(x) op_desc.SetInput(#x, {param.x}); +#define OP_SET_OUT(x) op_desc.SetOutput(#x, {param.x}); + OP_SET_IN(X); + OP_SET_IN(C0); + OP_SET_IN(H0); + OP_SET_IN(AttentionWeight); + OP_SET_IN(AttentionBias); + OP_SET_IN(AttentionScalar); + OP_SET_IN(AttentionScalarBias); + OP_SET_IN(LSTMWeight); + OP_SET_IN(LSTMBias); + + OP_SET_OUT(Hidden); + OP_SET_OUT(Cell); + OP_SET_OUT(AttentionedX); + OP_SET_OUT(AttentionFCOut); + OP_SET_OUT(LSTMX); + 
OP_SET_OUT(LSTMOUT); +#undef OP_SET_IN +#undef OP_SET_OUT + + auto* X = graph->RetriveNode(34); + auto* LSTMOUT = graph->RetriveNode(81); + auto* cell_init = graph->RetriveNode(6); + auto* hidden_init = graph->RetriveNode(8); + +#define LINK_TO(node0, node1) \ + node0->outputs.push_back(node1); \ + node1->inputs.push_back(node0); + + auto* lstm_op = graph->CreateOpNode(&op_desc); + PrepareParameters(graph, param); + + LINK_TO(X, lstm_op); + LINK_TO(cell_init, lstm_op); + LINK_TO(hidden_init, lstm_op); + LINK_TO(lstm_op, LSTMOUT); + + GraphSafeRemoveNodes(graph, marked_nodes); +} + +#define CHECK_P1(x) PADDLE_ENFORCE_NOT_NULL(x); +#define CHECK_P2(x0, x1) \ + CHECK_P1(x0); \ + CHECK_P1(x1); +#define CHECK_P3(x0, x1, x2) \ + CHECK_P2(x0, x1); \ + CHECK_P1(x2); +#define CHECK_P4(x0, x1, x2, x3) \ + CHECK_P3(x0, x1, x2); \ + CHECK_P1(x3); +#define CHECK_P5(x0, x1, x2, x3, x4) \ + CHECK_P4(x0, x1, x2, x3); \ + CHECK_P1(x4); + +void PrepareLSTMWeight(const LoDTensor& W_forget_w0, + const LoDTensor& W_forget_w1, + const LoDTensor& W_input_w0, const LoDTensor& W_input_w1, + const LoDTensor& W_output_w0, + const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0, + const LoDTensor& W_cell_w1, LoDTensor* out); + +void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, + const LoDTensor& B_output, const LoDTensor& B_cell, + LoDTensor* out); + +void PrepareParameters(Graph* graph, const Param& param) { + // Check parameters + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + + // Create new parameters. + scope->Var(param.LSTMWeight)->GetMutable(); + scope->Var(param.LSTMBias)->GetMutable(); + scope->Var(param.Hidden)->GetMutable(); + scope->Var(param.Cell)->GetMutable(); + scope->Var(param.AttentionedX)->GetMutable(); + scope->Var(param.AttentionFCOut)->GetMutable(); + scope->Var(param.LSTMX)->GetMutable(); + scope->Var(param.LSTMOUT)->GetMutable(); + +#define GATE_W(name__) \ + auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \ + auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \ + auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \ + CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \ + VLOG(4) << #name__ "_w0" \ + << " shape: " << W_##name__##_w0->Get().dims(); \ + VLOG(4) << #name__ "_w1" \ + << " shape: " << W_##name__##_w1->Get().dims(); \ + VLOG(4) << #name__ "_b0" \ + << " shape: " << W_##name__##_b0->Get().dims(); \ + auto& W_##name__##_w0_t = W_##name__##_w0->Get(); \ + auto& W_##name__##_w1_t = W_##name__##_w1->Get(); \ + auto& W_##name__##_b0_t = W_##name__##_b0->Get(); + + GATE_W(forget); + GATE_W(input); + GATE_W(output); + GATE_W(c); +#undef GATE_W + + auto* attention_fc_w = scope->FindVar("attention_fc.w_0"); + auto* attention_fc_b = scope->FindVar("attention_fc.b_0"); + auto* attention_output_w = scope->FindVar("attention_output.w_0"); + auto* attention_output_b = scope->FindVar("attention_output.b_0"); + CHECK_P4(attention_fc_w, attention_fc_b, attention_output_w, + attention_output_b); + + auto* lstm_weight = scope->Var(param.LSTMWeight); + auto* lstm_weight_t = lstm_weight->GetMutable(); + auto* lstm_bias = scope->Var(param.LSTMBias); + auto* lstm_bias_t = lstm_bias->GetMutable(); + + // reshape attention_bias + auto* attention_bias_t = + scope->FindVar(param.AttentionBias)->GetMutable(); + PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1); + attention_bias_t->Resize(make_ddim({1, attention_bias_t->dims()[0]})); + + auto* attention_scalar_bias_t = + 
scope->FindVar(param.AttentionScalarBias)->GetMutable(); + attention_scalar_bias_t->Resize( + make_ddim({1, attention_scalar_bias_t->dims()[0]})); + + PrepareLSTMWeight(W_forget_w0_t, W_forget_w1_t, W_input_w0_t, W_input_w1_t, + W_output_w0_t, W_output_w1_t, W_c_w0_t, W_c_w1_t, + lstm_weight_t); + PrepareLSTMBias(W_forget_b0_t, W_input_b0_t, W_output_b0_t, W_c_b0_t, + lstm_bias_t); +} + +// Prepare parameters +void PrepareLSTMWeight(const LoDTensor& W_forget_w0, + const LoDTensor& W_forget_w1, + const LoDTensor& W_input_w0, const LoDTensor& W_input_w1, + const LoDTensor& W_output_w0, + const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0, + const LoDTensor& W_cell_w1, LoDTensor* out) { + int D = W_forget_w0.dims()[0]; + int M = W_forget_w1.dims()[0]; + out->Resize(make_ddim({D + M, 4 * D})); + VLOG(3) << "LSTMWeight resized to " << out->dims(); + + float* out_data = out->mutable_data(platform::CPUPlace()); + std::array tensors( + {W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}); + std::array tensors1( + {W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}); + + for (int row = 0; row < D; row++) { + for (int col = 0; col < 4; col++) { + float* dst = out_data + 4 * D * row + D * col; + const float* src = tensors[col] + D * row; + memcpy(dst, src, D * sizeof(float)); + } + } + + for (int row = 0; row < M; row++) { + for (int col = 0; col < 4; col++) { + float* dst = out_data + 4 * D * (D + row) + D * col; + const float* src = tensors1[col] + D * row; + memcpy(dst, src, D * sizeof(float)); + } + } +} + +void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, + const LoDTensor& B_output, const LoDTensor& B_cell, + LoDTensor* out) { + std::array tensors( + {B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}); + + PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); + int D = B_forget.dims()[0]; + out->Resize(make_ddim({1, 4 * D})); + auto* out_data = out->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < tensors.size(); i++) { + memcpy(out_data + D * i, tensors[i], D * sizeof(float)); + } +} + +// Parameters + +std::unique_ptr AttentionLSTMFusePass::ApplyImpl( + std::unique_ptr graph) const { + PDPattern external_pattern, subblock_pattern; + + FindWhileOp(graph.get()); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(attention_lstm_fuse_pass, + paddle::framework::ir::AttentionLSTMFusePass); diff --git a/paddle/fluid/inference/analysis/dot.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h similarity index 62% rename from paddle/fluid/inference/analysis/dot.cc rename to paddle/fluid/framework/ir/attention_lstm_fuse_pass.h index d5471ffcb594a6915e9e65c0fee5adc5f5bdf40c..a756dfc1b98e1de55c809c73e2c4df1e628950ae 100644 --- a/paddle/fluid/inference/analysis/dot.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,12 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/analysis/dot.h" +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { -namespace inference { -namespace analysis { -size_t Dot::counter = 0; -} // namespace analysis -} // namespace inference +namespace framework { +namespace ir { + +class AttentionLSTMFusePass : public FusePassBase { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index f4327742eac843f27385c165216ce48ceb97ea71..201160f29df1ee5473ba5e6cf434fa246e015a12 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -100,12 +100,10 @@ void BuildFCPattern(PDPattern* pattern) { }, "elementwise_add_out"); - pattern->AddEdge(mul_parameter_var, mul_op); - pattern->AddEdge(mul_tmp_input_var, mul_op); - pattern->AddEdge(mul_op, mul_out_var); - pattern->AddEdge(mul_out_var, elementwise_add_op); - pattern->AddEdge(elementwise_add_tmp_var, elementwise_add_op); - pattern->AddEdge(elementwise_add_op, elementwise_add_out_var); + mul_op->LinksFrom({mul_parameter_var, mul_tmp_input_var}) + .LinksTo({mul_out_var}); + elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var}) + .LinksTo({elementwise_add_out_var}); } // Replace the node `from` in the links to `to` @@ -125,7 +123,7 @@ std::unique_ptr FCFusePass::ApplyImpl( std::unordered_set nodes2delete; - GraphPatternDetecter gpd; + GraphPatternDetector gpd; BuildFCPattern(gpd.mutable_pattern()); #define GET_NODE(id) \ @@ -134,7 +132,7 @@ std::unique_ptr FCFusePass::ApplyImpl( auto* id = subgraph.at(gpd.pattern().RetriveNode(#id)); \ PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); - auto handler = [&](const GraphPatternDetecter::subgraph_t& subgraph, + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { VLOG(4) << "handle FC fuse"; // Currently, there is no FC op available, so I will just simulate the diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index eb43dd4486cda578804fb9f6438c67e9e4a03091..31ed0e362f760319130135ad49fe2bb4e68e6786 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detecter.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..daecf3b407c5b40c0ad6c3a75d7fbad3fe45c664 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr FCLstmFusePass::ApplyImpl( + std::unique_ptr graph) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + std::unordered_set fused_ops({// first lstm + 13, 15, 16, + // second lstm + 23, 25, 26}); + + pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); }, + "any_node"); + + std::unordered_set marked_nodes; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + + auto* id = subgraph.at(gpd.pattern().RetriveNode("any_node")); + marked_nodes.insert(id); + }; + gpd(graph.get(), handler); + + // Create New OpDesc + auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h, + int bias, int hidden, int cell, int xx) { +#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x); + GET_NODE(input); + GET_NODE(weight_x); + GET_NODE(weight_h); + GET_NODE(bias); + GET_NODE(hidden); + GET_NODE(cell); + GET_NODE(xx); + GET_NODE(lstm); + + OpDesc op_desc; + op_desc.SetType("fusion_lstm"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()}); + SET_IN(X, input); + SET_IN(WeightX, weight_x); + SET_IN(WeightH, weight_h); + SET_IN(Bias, bias); +#undef GET_NODE +#undef SET_IN + + LOG(INFO) << "hidden_n: " << hidden_n->Name(); + LOG(INFO) << "cell: " << cell_n->Name(); + LOG(INFO) << "xx: " << xx_n->Name(); + + op_desc.SetInput("H0", {}); + op_desc.SetInput("C0", {}); + op_desc.SetOutput("Hidden", {hidden_n->Name()}); + op_desc.SetOutput("Cell", {cell_n->Name()}); + op_desc.SetOutput("XX", {xx_n->Name()}); + op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"}); + op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"}); + op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse")); + op_desc.SetAttr("use_peepholes", false); + auto* op = graph->CreateOpNode(&op_desc); + +#define LINK_TO(a, b) \ + a->outputs.push_back(b); \ + b->inputs.push_back(a); + LINK_TO(input_n, op); + LINK_TO(weight_x_n, op); + LINK_TO(weight_h_n, op); + LINK_TO(bias_n, op); + LINK_TO(op, hidden_n); +#undef LINK_TO + return op; + + }; + + lstm_creator(16, 12, 14, 18, 17, 22, 21, 19); + lstm_creator(26, 12, 24, 28, 27, 32, 31, 29); + + // remove all the nodes + + for (auto* node : marked_nodes) { + graph->RemoveNode(const_cast(node)); + } + + for (auto* node : graph->Nodes()) { + for (auto it = node->inputs.begin(); it != node->inputs.end();) { + if (marked_nodes.count(*it)) { + it = const_cast(node)->inputs.erase(it); + } else + it++; + } + for (auto it = node->outputs.begin(); it != node->outputs.end();) { + if (marked_nodes.count(*it)) { + it = const_cast(node)->outputs.erase(it); + } else + it++; + } + } + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass); diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..74b08ae558b12c9328db58687cd01edbc37291a8 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class FCLstmFusePass : public Pass { + public: + virtual ~FCLstmFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h new file mode 100644 index 0000000000000000000000000000000000000000..bf6a0ae8274cecc785ffb269b0b574a42ee7d418 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { + +static const char kParamScopeAttr[] = "param_scope"; + +class FusePassBase : public Pass { + public: + void Init(Graph* graph) const { graph_ = graph; } + + Scope* param_scope() const { + PADDLE_ENFORCE(graph_->Has(kParamScopeAttr)); + return graph_->Get(kParamScopeAttr); + } + + virtual ~FusePassBase() {} + + protected: + mutable Graph* graph_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 2a6bf4ac230df81b38751000bf4b663f24984db3..39b0f2f038380b4631728e28031511205c4b40f2 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -132,63 +132,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { } } - std::vector send_ops; - ir::Node *send_bar = nullptr; - std::vector recv_ops; - ir::Node *fetch_bar = nullptr; - for (ir::Node *node : Nodes()) { - if (node->Name() == "send") { - send_ops.push_back(node); - } else if (node->Name() == "send_barrier") { - PADDLE_ENFORCE(!send_bar, "only has one send barrier"); - send_bar = node; - } else if (node->Name() == "recv") { - recv_ops.push_back(node); - } else if (node->Name() == "fetch_barrier") { - PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier"); - fetch_bar = node; - } - } - if (send_bar) { - for (ir::Node *send : send_ops) { - ir::Node *dep_var = CreateControlDepVar(); - send->outputs.push_back(dep_var); - dep_var->inputs.push_back(send); - send_bar->inputs.push_back(dep_var); - dep_var->outputs.push_back(send_bar); - } - for (ir::Node *recv : recv_ops) { - ir::Node *dep_var = CreateControlDepVar(); - recv->inputs.push_back(dep_var); - dep_var->outputs.push_back(recv); - send_bar->outputs.push_back(dep_var); - dep_var->inputs.push_back(send_bar); - } - } - if (fetch_bar) { - for (ir::Node *recv : recv_ops) { - ir::Node *dep_var = CreateControlDepVar(); - recv->outputs.push_back(dep_var); - dep_var->inputs.push_back(recv); - fetch_bar->inputs.push_back(dep_var); - dep_var->outputs.push_back(fetch_bar); - } - } - - std::vector send_vars = FindDistTrainSendVars(send_ops); - std::vector recv_vars = FindDistTrainRecvVars(recv_ops); - for (ir::Node *node : Nodes()) { - if (IsDistTrainOp(node, send_vars, recv_vars)) { - if (fetch_bar && node->Name() == "concat") { - ir::Node *dep_var = CreateControlDepVar(); - fetch_bar->outputs.push_back(dep_var); - dep_var->inputs.push_back(fetch_bar); - node->inputs.push_back(dep_var); - dep_var->outputs.push_back(node); - } - } - } - /** * We should handle write after read(WAR) and write after write(WAW) here. * Because some of the operators of the program can be executed parallelly. diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 0d27be5fc007746d6ca41ff0dbcea5c5f45599ef..b696489565ccaf83947d8ecd730c24c3bf22b5c8 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -99,13 +99,13 @@ class Graph { // Create a normal variable with non-null VarDesc. ir::Node *CreateVarNode(VarDesc *var_desc) { PADDLE_ENFORCE(var_desc); - return AddNode(new ir::Node(var_desc)); + return AddNode(new ir::Node(var_desc, node_count_++)); } // Create a normal runnable operator with OpDesc. 
ir::Node *CreateOpNode(OpDesc *op_desc) { PADDLE_ENFORCE(op_desc); - return AddNode(new ir::Node(op_desc)); + return AddNode(new ir::Node(op_desc, node_count_++)); } // Create a control dependency var that connects 2 operations. The @@ -115,13 +115,14 @@ class Graph { // TODO(panyx0718): control var name should be really unique. const std::string name = string::Sprintf( "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); - return AddNode(new ir::Node(name, ir::Node::Type::kVariable)); + return AddNode( + new ir::Node(name, ir::Node::Type::kVariable, node_count_++)); } // A more free style way of creating a graph node. Mostly use for test // or "copy" from another node. Avoid using it if possible. ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) { - return AddNode(new ir::Node(name, type)); + return AddNode(new ir::Node(name, type, node_count_++)); } // Clear all node information of the graph and return the ownership of the @@ -142,12 +143,20 @@ class Graph { nodes_.erase(node); } + Node *RetriveNode(int id) { + auto it = id2node_.find(id); + if (it != id2node_.end()) return it->second; + return nullptr; + } + private: // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); nodes_[node].reset(node); node_set_.insert(node); + PADDLE_ENFORCE(!id2node_.count(node->id()), "duplicate id %d", node->id()); + id2node_[node->id()] = node; return node; } @@ -157,6 +166,8 @@ class Graph { std::map> attr_dels_; std::map> nodes_; std::unordered_set node_set_; + std::map id2node_; + int node_count_{0}; }; bool IsControlDepVar(const ir::Node &var); diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index dc81a2cac585b50b81f79f8f204ce1145d93eab0..62f94a1c0e5a300438bbe5fea34b9a07df5d9ebf 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -103,10 +103,10 @@ std::map> BuildOperationAdjList( for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); - adj_list[n].insert(adj_n); VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) << " -> " << n->Name() << reinterpret_cast(n) << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); } } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc similarity index 71% rename from paddle/fluid/framework/ir/graph_pattern_detecter.cc rename to paddle/fluid/framework/ir/graph_pattern_detector.cc index e197861251fe5c9f98eaaba2a10b4af371dcbcba..dce4be8ff04204a134441410646c9a01b5dd40a3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detecter.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/platform/enforce.h" @@ -34,7 +34,7 @@ PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) { name); } - nodes_.emplace_back(new PDNode(std::move(teller), name)); + nodes_.emplace_back(new PDNode(std::move(teller), this, name)); auto* cur = nodes_.back().get(); node_map_[name] = cur; return cur; @@ -56,19 +56,22 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) { edges_.emplace_back(a, b); } -void 
GraphPatternDetecter::operator()(Graph* graph, - GraphPatternDetecter::handle_t handler) { +void GraphPatternDetector::operator()(Graph* graph, + GraphPatternDetector::handle_t handler) { if (!MarkPDNodesInGraph(*graph)) return; auto subgraphs = DetectPatterns(); UniquePatterns(&subgraphs); RemoveOverlappedMatch(&subgraphs); + LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; + int id = 0; for (auto& g : subgraphs) { + LOG(INFO) << "optimizing #" << id++ << " subgraph"; handler(g, graph); } } -bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) { +bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { VLOG(4) << "mark pdnodes in graph"; if (graph.Nodes().empty()) return false; @@ -114,13 +117,15 @@ bool IsNodesLink(Node* a, Node* b) { return false; } -std::vector -GraphPatternDetecter::DetectPatterns() { +std::vector +GraphPatternDetector::DetectPatterns() { // Init empty subgraphs. - std::vector result; + std::vector result; std::vector init_groups; - PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed"); - auto* first_pnode = pattern_.edges().front().first; + std::array, 2> bi_records; + // PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed"); + auto* first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() + : pattern_.edges().front().first; if (!pdnodes2nodes_.count(first_pnode)) return result; for (auto* node : pdnodes2nodes_[first_pnode]) { HitGroup group; @@ -129,7 +134,6 @@ GraphPatternDetecter::DetectPatterns() { } int step = 0; - std::array, 2> bi_records; bi_records[0] = std::move(init_groups); // Extend a PDNode to subgraphs by deducing the connection relations defined @@ -141,6 +145,7 @@ GraphPatternDetecter::DetectPatterns() { auto& pre_groups = bi_records[step % 2]; auto& cur_groups = bi_records[1 - (step++ % 2)]; cur_groups.clear(); + if (pre_groups.empty()) break; // source -> target for (Node* source : pdnodes2nodes_[edge.first]) { for (Node* target : pdnodes2nodes_[edge.second]) { @@ -163,7 +168,7 @@ GraphPatternDetecter::DetectPatterns() { } for (auto& group : bi_records[step % 2]) { - GraphPatternDetecter::subgraph_t subgraph; + GraphPatternDetector::subgraph_t subgraph; for (auto& role : group.roles) { subgraph.emplace(role.first, role.second); } @@ -172,10 +177,10 @@ GraphPatternDetecter::DetectPatterns() { return result; } -void GraphPatternDetecter::UniquePatterns( - std::vector* subgraphs) { +void GraphPatternDetector::UniquePatterns( + std::vector* subgraphs) { if (subgraphs->empty()) return; - std::vector result; + std::vector result; std::unordered_set set; for (auto& g : *subgraphs) { @@ -192,7 +197,7 @@ void GraphPatternDetecter::UniquePatterns( *subgraphs = result; } -void GraphPatternDetecter::RemoveOverlappedMatch( +void GraphPatternDetector::RemoveOverlappedMatch( std::vector* subgraphs) { std::vector result; std::unordered_set node_set; @@ -215,6 +220,46 @@ void GraphPatternDetecter::RemoveOverlappedMatch( *subgraphs = result; } +std::string PDPattern::DotString() const { + using inference::analysis::Dot; + Dot dot; + int id = 0; + // Create Nodes + std::unordered_map node2dot; + for (const auto& node : nodes()) { + std::string node_id = "Node" + std::to_string(id++); + dot.AddNode(node_id, {}, node->name()); + node2dot[node.get()] = node_id; + } + // Create Edges + for (const auto& edge : edges()) { + if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) { + LOG(ERROR) << "no node " << edge.first << " " << edge.second; + 
continue; + } + auto& src = node2dot.at(edge.first); + auto& trg = node2dot.at(edge.second); + dot.AddEdge(src, trg, {}); + } + return dot.Build(); +} + +PDNode& PDNode::LinksTo(const std::vector& others) { + // extend outlinks. + for (PDNode* x : others) { + pattern_->AddEdge(this, x); + } + return *this; +} + +PDNode& PDNode::LinksFrom(const std::vector& others) { + // extend outlinks. + for (PDNode* x : others) { + pattern_->AddEdge(x, this); + } + return *this; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.h b/paddle/fluid/framework/ir/graph_pattern_detector.h similarity index 72% rename from paddle/fluid/framework/ir/graph_pattern_detecter.h rename to paddle/fluid/framework/ir/graph_pattern_detector.h index 68c39902b5a79bf25ca7f08529a958274ac64e33..0ac34a57aacdc4fcd3d6bcaa0b72b1d6dabb3abd 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -21,12 +21,14 @@ #include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/inference/analysis/dot.h" namespace paddle { namespace framework { namespace ir { +class PDPattern; -// Some basic torminolygies: +// Some basic terminologies: // - PDPattern: a pattern defined as a data flow graph. // - PDNode: the node in the pattern, each PDNode represents an `ir::Node` // that meets some conditions defined in `PDNode.teller`. @@ -36,30 +38,43 @@ namespace ir { struct PDNode { // tell whether an ir::Node* is a candidation for a PDNode. using teller_t = std::function; + enum class Type { kOp, kVar }; - PDNode(teller_t&& teller, const std::string& name = "") - : teller_(teller), name_(name) { - PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set."); - } - - PDNode(PDNode&& other) = default; - - std::vector inlinks; - std::vector outlinks; + // this link to others + PDNode& LinksTo(const std::vector& others); + PDNode& LinksFrom(const std::vector& others); bool Tell(Node* node) const { PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode"); return teller_(node); } + bool IsOp() const { return type_ == Type::kOp; } + bool IsVar() const { return type_ == Type::kVar; } + const std::string& name() const { return name_; } PDNode(const PDNode&) = delete; PDNode& operator=(const PDNode&) = delete; private: + PDNode(teller_t&& teller, PDPattern* pattern, const std::string& name = "", + Type type = Type::kVar) + : teller_(std::move(teller)), + pattern_(pattern), + name_(name), + type_(type) { + PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set."); + } + + PDNode(PDNode&& other) = default; + + friend class PDPattern; + teller_t teller_; + PDPattern* pattern_; std::string name_; + Type type_; }; /* @@ -102,6 +117,8 @@ class PDPattern { const std::vector>& nodes() const { return nodes_; } const std::vector& edges() const { return edges_; } + std::string DotString() const; + private: #ifdef PADDLE_WITH_TESTING FRIEND_TEST(PDPattern, AddEdge); @@ -117,7 +134,7 @@ class PDPattern { }; /* - * GraphPatternDetecter helps to detect the specific patterns in the graph. + * GraphPatternDetector helps to detect the specific patterns in the graph. * Input a pattern, output a list of the matched subgraphs/nodes. * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). 
* @@ -129,7 +146,7 @@ class PDPattern { * * Usage: * // Create a detector - * GraphPatternDetecter detector; + * GraphPatternDetector detector; * // Define the detector's pattern, by adding PDNode and define the edges. * auto* node0 = detector.mutable_pattern().AddNode(...) * auto* node1 = detector.mutable_pattern().AddNode(...) @@ -138,11 +155,11 @@ class PDPattern { * detector.mutable_pattern().AddEdge(node0, node1); * // Create an handler, to define the behavior of treating the filtered * // subgraphs that comply with the patterns. - * GraphPatternDetecter::handle_t handler = some labmda + * GraphPatternDetector::handle_t handler = some labmda * // Execute the detector. * detector(&graph, handler); */ -class GraphPatternDetecter { +class GraphPatternDetector { public: using subgraph_t = std::unordered_map; @@ -177,10 +194,62 @@ class GraphPatternDetecter { using hit_rcd_t = std::pair; PDPattern pattern_; - std::vector marked_records_; std::unordered_map> pdnodes2nodes_; }; +// some helper methods. + +// Op's input. +static bool VarLinksToOp(Node* node, const std::string& op_type) { + for (auto* out : node->outputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} + +// Op's output. +static bool VarLinksFromOp(Node* node, const std::string& op_type) { + for (auto* out : node->inputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} + +// Check whether a var node is a op node's nth input. +static bool IsNthInput(Node* var, Node* op, const std::string& argument, + size_t nth) { + PADDLE_ENFORCE(var->IsVar()); + PADDLE_ENFORCE(op->IsOp()); + if (op->inputs.size() <= nth) return false; + return var->Name() == op->Op()->Input(argument)[nth]; +} + +static void GraphSafeRemoveNodes(Graph* graph, + const std::unordered_set& nodes) { + for (auto* node : nodes) { + graph->RemoveNode(const_cast(node)); + } + + for (auto* node : graph->Nodes()) { + for (auto it = node->inputs.begin(); it != node->inputs.end();) { + if (nodes.count(*it)) { + it = const_cast(node)->inputs.erase(it); + } else + it++; + } + for (auto it = node->outputs.begin(); it != node->outputs.end();) { + if (nodes.count(*it)) { + it = const_cast(node)->outputs.erase(it); + } else + it++; + } + } +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc similarity index 95% rename from paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc rename to paddle/fluid/framework/ir/graph_pattern_detector_tester.cc index 06f9df5546910f492c9dd1da3e694623898d3d1d..a4d0646230c0fdfb7e1970523799e7db10c75538 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/graph_pattern_detecter.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include @@ -82,7 +82,7 @@ TEST(PDPattern, AddEdge) { } TEST(GraphPatternDetecter, MarkPDNodesInGraph) { - GraphPatternDetecter x; + GraphPatternDetector x; // mark o2, o3, v2 // The pattern is a graph: @@ -131,7 +131,7 @@ TEST(GraphPatternDetecter, MultiSubgraph) { Graph graph(program); BuildGraph(&graph); - GraphPatternDetecter x; + GraphPatternDetector x; // The pattern is a graph: // op -> var @@ -149,8 +149,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) { x.mutable_pattern()->AddEdge(any_var, any_op1); int count = 0; - GraphPatternDetecter::handle_t handle = [&]( - const GraphPatternDetecter::subgraph_t& s, Graph* g) { + GraphPatternDetector::handle_t handle = [&]( + const GraphPatternDetector::subgraph_t& s, Graph* g) { LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> " << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name(); count++; diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..414d8f79b15de091c62af5fe099ffae144156e4e --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr GraphToProgramPass::ApplyImpl( + std::unique_ptr graph) const { + ProgramDesc& program = Get("program"); + + std::unique_ptr program_pb( + new proto::ProgramDesc(*program.Proto())); + + auto block = program_pb->mutable_blocks(kRootBlockIndex); + block->clear_vars(); + std::unordered_set visited_vars; + for (ir::Node* n : graph->Nodes()) { + if (n->NodeType() == ir::Node::Type::kVariable) { + if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) { + visited_vars.insert(n->Var()->Name()); + block->add_vars()->MergeFrom(*n->Var()->Proto()); + } + } + } + + block->clear_ops(); + std::vector nodes = TopologySortOperations(*graph); + for (ir::Node* n : nodes) { + if (!n->Op()) { + continue; + } + block->add_ops()->MergeFrom(*n->Op()->Proto()); + } + + program.CopyFrom(*program_pb); + return graph; +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(graph_to_program_pass, paddle::framework::ir::GraphToProgramPass); diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..124ec5a8e771fb768b31fa2e9f5143db96154490 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class GraphToProgramPass : public Pass { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..88ad17a0c65137d62484ec340f48e1a76ff893fe --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" + +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void BuildNoCircleGraph(Graph* g) { + OpDesc op1; + op1.SetType("op1"); + OpDesc op2; + op2.SetType("op2"); + OpDesc op3; + op3.SetType("op3"); + OpDesc op4; + op4.SetType("op4"); + OpDesc op5; + op5.SetType("op5"); + VarDesc var1("var1"); + VarDesc var2("var2"); + VarDesc var3("var3"); + VarDesc var4("var4"); + + ir::Node* o1 = g->CreateOpNode(&op1); + ir::Node* o2 = g->CreateOpNode(&op2); + ir::Node* o3 = g->CreateOpNode(&op3); + ir::Node* o4 = g->CreateOpNode(&op4); + ir::Node* o5 = g->CreateOpNode(&op5); + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + v2->inputs.push_back(o2); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(GraphToProgramPass, Basic) { + ProgramDesc prog; + std::unique_ptr g(new Graph(prog)); + BuildNoCircleGraph(g.get()); + + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "graph_to_program_pass"); + + ProgramDesc compiled_prog; + pass->SetNotOwned("program", &compiled_prog); + pass->Apply(std::move(g)); + std::vector ops = compiled_prog.Block(0).AllOps(); + EXPECT_EQ(ops[0]->Type(), "op1"); + EXPECT_EQ(ops[1]->Type(), "op2"); + if (ops[2]->Type() == "op3") { + EXPECT_EQ(ops[3]->Type(), "op4"); + } else if (ops[2]->Type() == "op4") { + EXPECT_EQ(ops[3]->Type(), "op3"); + } + EXPECT_EQ(ops[4]->Type(), "op5"); + + std::unordered_set vars; + for (VarDesc* v : compiled_prog.Block(0).AllVars()) { + vars.insert(v->Name()); + } + EXPECT_TRUE(vars.find("var1") != vars.end()); + EXPECT_TRUE(vars.find("var2") != vars.end()); + EXPECT_TRUE(vars.find("var3") != vars.end()); +} +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index e7ff0c1dac134334e3baad88886862ebff0fe367..3a114c6a237ea4411a8c4dd4b3ee6a00b7729d7c 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -16,11 +16,13 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/inference/analysis/dot.h" namespace paddle { namespace framework { namespace ir { static const char kGraphVizPath[] = "graph_viz_path"; +using inference::analysis::Dot; std::unique_ptr GraphVizPass::ApplyImpl( std::unique_ptr graph) const { @@ -30,41 +32,65 @@ std::unique_ptr GraphVizPass::ApplyImpl( PADDLE_ENFORCE(fout->good()); std::ostream& sout = *fout; - size_t var_id = 0; - std::unordered_map vars; - - sout << "digraph G {\n"; - - for (const ir::Node* n : graph->Nodes()) { - if (n->NodeType() != ir::Node::Type::kVariable) continue; - size_t cur_var_id = var_id++; - vars[n] = cur_var_id; - - sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]" - << std::endl; - } - - size_t op_id = 0; - for (const ir::Node* n : graph->Nodes()) { - if (n->NodeType() != ir::Node::Type::kOperation) continue; - std::string op_name = "op_" + std::to_string(op_id++); - sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]" - << std::endl; - for (auto in : n->inputs) { - std::string var_name = "var_" + std::to_string(vars[in]); - sout << var_name << " -> " << op_name << std::endl; + std::unordered_map node2dot; + + Dot dot; + + std::vector op_attrs({Dot::Attr("style", "filled"), + Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "red")}); + std::vector var_attrs({Dot::Attr("style", "filled,rounded"), + // Dot::Attr("shape", "diamond"), + Dot::Attr("fillcolor", "yellow")}); + + std::vector marked_op_attrs({Dot::Attr("style", "filled"), + Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "lightgray")}); + std::vector marked_var_attrs( + {Dot::Attr("style", "filled,rounded"), + // Dot::Attr("shape", "diamond"), + Dot::Attr("fillcolor", "lightgray")}); + + auto marked_nodes = ConsumeMarkedNodes(graph.get()); + // Create nodes + for (const Node* n : graph->Nodes()) { + std::string node_id = n->Name() + "(" + std::to_string(n->id()) + ")"; + if (n->IsOp()) { + decltype(op_attrs) attr = + marked_nodes.count(n) ? marked_op_attrs : op_attrs; + dot.AddNode(node_id, attr, node_id); + } else if (n->IsVar()) { + decltype(op_attrs) attr = + marked_nodes.count(n) ? 
marked_var_attrs : var_attrs; + dot.AddNode(node_id, attr, node_id); } - - for (auto out : n->outputs) { - std::string var_name = "var_" + std::to_string(vars[out]); - sout << op_name << " -> " << var_name << std::endl; + node2dot[n] = node_id; + } + // Create edges + for (const Node* n : graph->Nodes()) { + const auto& src_id = node2dot.at(n); + for (auto* out : n->outputs) { + const auto& trg_id = node2dot.at(out); + dot.AddEdge(src_id, trg_id, {}); } } - sout << "}\n"; + sout << dot.Build(); + return graph; } +GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( + Graph* graph) const { + marked_nodes_t res; + if (graph->Has(kGraphvizMarkedNodeAttr)) { + auto& attr = graph->Get(kGraphvizMarkedNodeAttr); + res = attr; + attr.clear(); + } + return res; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h index 1fd8c8a26e9581ccf605d4271a49ec2e90d8b997..8d885cb9e4ee6e01de386b0f22423988dbe60ca6 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.h +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -27,10 +27,19 @@ namespace paddle { namespace framework { namespace ir { +const char kGraphvizMarkedNodeAttr[] = "__graphviz__marked_node__"; + class GraphVizPass : public Pass { + public: + using marked_nodes_t = std::unordered_set; + protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; + + // Tell whether there are any marked nodes in the graph. Consume the + // corresponding attribute. + marked_nodes_t ConsumeMarkedNodes(Graph* graph) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index aab3180e7e5ece5de5f5227e76f78687700fed87..6d40e3852295a722a4f99946202ecde2bc4b82e9 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -29,20 +29,26 @@ class Node { enum class Type { kOperation, kVariable }; static constexpr char kControlDepVarName[] = "__control_var"; - explicit Node(const std::string& name, Type type) - : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} + explicit Node(const std::string& name, Type type, int id = -1) + : name_(name), + var_desc_(nullptr), + op_desc_(nullptr), + type_(type), + id_(id) {} - explicit Node(VarDesc* var_desc) + explicit Node(VarDesc* var_desc, int id = -1) : name_(var_desc->Name()), var_desc_(new VarDesc(*var_desc)), op_desc_(nullptr), - type_(Type::kVariable) {} + type_(Type::kVariable), + id_(id) {} - explicit Node(OpDesc* op_desc) + explicit Node(OpDesc* op_desc, int id = -1) : name_(op_desc->Type()), var_desc_(nullptr), op_desc_(new OpDesc(*op_desc, op_desc->Block())), - type_(Type::kOperation) {} + type_(Type::kOperation), + id_(id) {} Type NodeType() const { return type_; } @@ -58,6 +64,8 @@ class Node { return op_desc_.get(); } + int id() const { return id_; } + bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } @@ -69,6 +77,7 @@ class Node { std::unique_ptr var_desc_; std::unique_ptr op_desc_; Type type_; + int id_; private: DISABLE_COPY_AND_ASSIGN(Node); diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..9bb5c232e5c2269643ddef7ed9c938e0332f7274 --- /dev/null +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +struct FuseExpr {}; + +// sequence expand, concat fuse pattern, return concat's output +PDNode* BuildSeqExpandConcatPattern(PDPattern* pattern) { + // The following operators will be fused: + // concat + // sequence_expand + // sequence_expand + + // The following variables will be treated as inputs: + // concat mid input, 0th input for fused op + // sequence_expand input, 1st input for fused op + // sequence_expand input, 2nd input for fused op + + // The following variables will be treated as outputs: + // concat output + + // So the following variables will be removed: + // sequence_expand output + // sequence_expand output + + // Three operators + auto* sequence_expand0 = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "sequence_expand"; + }, + "sequence_expand0"); + + auto* sequence_expand1 = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "sequence_expand"; + }, + "sequence_expand1"); + + auto* concat = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "concat" && // basic check + x->Op()->Input("X").size() == 3; // Special case + }, + "concat"); + + auto* sequence_expand0_in = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && VarLinksToOp(x, "sequence_expand"); + }, + "sequence_expand0_in"); + auto* sequence_expand1_in = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && VarLinksToOp(x, "sequence_expand"); + }, + "sequence_expand1_in"); + + // The variables + auto* sequence_expand0_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && + VarLinksFromOp(x, "sequence_expand") && // basic check + VarLinksToOp(x, "concat") && // is concat's input + IsNthInput(x, x->outputs[0], "X", 1); // X[1] + }, + "sequence_expand0_out"); + + auto* sequence_expand1_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && + VarLinksFromOp(x, "sequence_expand") && // basic check + VarLinksToOp(x, "concat") && // is concat's input + IsNthInput(x, x->outputs[0], "X", 2); // X[2] + }, + "sequence_expand1_out"); + + auto* concat_in0 = pattern->NewNode( + [](Node* x) { return x && x->IsVar() && VarLinksToOp(x, "concat"); }, + "concat_in0"); + + auto* concat_out = pattern->NewNode( + [](Node* x) { return x && x->IsVar() && VarLinksFromOp(x, "concat"); }, + "concat_out"); + + // Links + sequence_expand0->LinksFrom({sequence_expand0_in}) + .LinksTo({sequence_expand0_out}); + sequence_expand1->LinksFrom({sequence_expand1_in}) + .LinksTo({sequence_expand1_out}); + concat->LinksFrom({sequence_expand0_out, sequence_expand1_out, concat_in0}) +
.LinksTo({concat_out}); + return concat_out; +} + +PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) { + PDNode* fc_w = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksToOp(x, "mul") && // link + x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_w"); + + PDNode* mul_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksFromOp(x, "mul") && // link + VarLinksToOp(x, "elementwise_add") && // + !x->Var()->Proto()->persistable(); // is not a parameter + }, + "mul_out"); + + PDNode* fc_mul = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "mul"; // basic + }, + "fc_mul"); + + PDNode* fc_bias = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksToOp(x, "elementwise_add") && // link + x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_bias"); + + PDNode* elementwise_add = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "elementwise_add"; + }, + "elementwise_add"); + + PDNode* add_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksFromOp(x, "elementwise_add") && // link + !x->Var()->Proto()->persistable(); // is not a parameter + }, + "add_out"); + + std::set acts({"sigmoid", "tanh", "relu", "identity"}); + PDNode* act = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && acts.count(x->Op()->Type()); + + }, + "act"); + + PDNode* fc_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + !x->Var()->Proto()->persistable(); // is not a parameter + }, + "fc_out"); + + fc_mul->LinksFrom({fc_w, fc_x}).LinksTo({mul_out}); + elementwise_add->LinksFrom({mul_out, fc_bias}).LinksTo({add_out}); + act->LinksFrom({add_out}).LinksTo({fc_out}); + return fc_out; +} + +std::unique_ptr SeqConcatFcFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(graph.get()); + GraphPatternDetector detector; + auto* pattern = detector.mutable_pattern(); + auto* concat_out = BuildSeqExpandConcatPattern(pattern); + BuildFCPattern(pattern, concat_out); + +#define GET_NODE(id, pattern) \ + PADDLE_ENFORCE(subgraph.count(pattern.RetriveNode(#id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(pattern.RetriveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + + detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "get one concat pattern"; + // fc + GET_NODE(fc_w, detector.pattern()); + GET_NODE(fc_bias, detector.pattern()); + GET_NODE(act, detector.pattern()); + GET_NODE(fc_out, detector.pattern()); + + // concat + GET_NODE(concat_in0, detector.pattern()); + GET_NODE(sequence_expand0_in, detector.pattern()); + GET_NODE(sequence_expand1_in, detector.pattern()); + + OpDesc op_desc; + op_desc.SetType("fusion_seqexpand_concat_fc"); + op_desc.SetInput("X", {concat_in0->Name(), sequence_expand0_in->Name(), + sequence_expand1_in->Name()}); + op_desc.SetInput("FCWeight", {fc_w->Name()}); + op_desc.SetInput("FCBias", {fc_bias->Name()}); + const std::string fc_out_tmp = fc_out->Name() + ".tmp"; + param_scope()->Var(fc_out_tmp)->GetMutable(); + op_desc.SetOutput("FCOut", {fc_out_tmp}); + op_desc.SetOutput("Out", {fc_out->Name()}); + op_desc.SetAttr("fc_activation", act->Op()->Type()); + + auto* op_node = graph->CreateOpNode(&op_desc); +// Add links +#define NODE_LINKS(a, b) \ + a->outputs.push_back(b); \ + b->inputs.push_back(a); + NODE_LINKS(fc_w, op_node); + NODE_LINKS(fc_bias,
op_node); + NODE_LINKS(concat_in0, op_node); + NODE_LINKS(sequence_expand0_in, op_node); + NODE_LINKS(sequence_expand1_in, op_node); + NODE_LINKS(op_node, fc_out); + + // Clean nodes. + std::unordered_set marked_nodes; + for (auto& item : subgraph) { + marked_nodes.insert(item.second); + } + marked_nodes.erase(fc_w); + marked_nodes.erase(fc_bias); + marked_nodes.erase(concat_in0); + marked_nodes.erase(sequence_expand0_in); + marked_nodes.erase(sequence_expand1_in); + marked_nodes.erase(fc_out); + + GraphSafeRemoveNodes(graph, marked_nodes); + }); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(seq_concat_fc_fuse_pass, + paddle::framework::ir::SeqConcatFcFusePass); diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..9f5fd1a29adf918806d8f30097d8c7f002f48f3e --- /dev/null +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SeqConcatFcFusePass : public FusePassBase { + public: + virtual ~SeqConcatFcFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 122dc161b41246e5f08bd0ae8b763489e9ee22f9..555faba9624b9c76a9efdf4a62cd319f9682566e 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -95,6 +95,12 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, need_update_ = true; } +OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) { + CopyFrom(other); + block_ = block; + need_update_ = true; +} + void OpDesc::CopyFrom(const OpDesc &op_desc) { desc_.set_type(op_desc.Type()); inputs_ = op_desc.inputs_; @@ -131,8 +137,9 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) for (const proto::OpDesc::Attr &attr : desc_.attrs()) { std::string attr_name = attr.name(); // The sub_block referred to by the BLOCK attr hasn't been added - // to ProgramDesc class yet, we skip setting BLOCK attr here. - if (attr.type() != proto::AttrType::BLOCK) { + // to ProgramDesc class yet, we skip setting BLOCK/BLOCKS attr here. 
+ if (attr.type() != proto::AttrType::BLOCK && + attr.type() != proto::AttrType::BLOCKS) { attrs_[attr_name] = GetAttrValue(attr); } } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 2422392e24d864dc3e7973ab35e038ecf2c0392a..b4205aba83e774fb9c08193124adb93935c00157 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -37,11 +37,7 @@ class OpDesc { explicit OpDesc(BlockDesc *block) : block_(block) {} - OpDesc(const OpDesc &other, BlockDesc *block) { - *this = other; - block_ = block; - need_update_ = true; - } + OpDesc(const OpDesc &other, BlockDesc *block); void CopyFrom(const OpDesc &op_desc); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 344c001a69b53c82967ee983783892a514c2490b..a63944eaee6132c1082947fddcad4e0d72e26df1 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -80,6 +80,12 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { InitFromProto(); } +void ProgramDesc::CopyFrom(const proto::ProgramDesc &desc) { + blocks_.clear(); + desc_ = desc; + InitFromProto(); +} + ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE(desc_.ParseFromString(binary_str), "Fail to parse program_desc from binary string."); @@ -111,10 +117,16 @@ void ProgramDesc::InitFromProto() { const std::vector ProgramDesc::GetFeedTargetNames() { auto &global_block = Block(0); + // The order of feed_target_names must follow the index specified in `col`, + // since the feed operators' order doesn't necessarily follow `col`. std::vector feed_target_names; for (auto *op : global_block.AllOps()) { if (op->Type() == kFeedOpType) { - feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]); + int col = boost::get(op->GetAttr("col")); + if (col >= feed_target_names.size()) { + feed_target_names.resize(col + 1); + } + feed_target_names[col] = op->Output("Out")[0]; } } return feed_target_names; @@ -122,10 +134,16 @@ const std::vector ProgramDesc::GetFeedTargetNames() { const std::vector ProgramDesc::GetFetchTargetNames() { auto &global_block = Block(0); + // The order of fetch_target_names must follow the index specified in `col`, + // since the fetch operators' order doesn't necessarily follow `col`. std::vector fetch_target_names; for (auto *op : global_block.AllOps()) { if (op->Type() == kFetchOpType) { - fetch_target_names.push_back(op->Input("X")[0]); + int col = boost::get(op->GetAttr("col")); + if (col >= fetch_target_names.size()) { + fetch_target_names.resize(col + 1); + } + fetch_target_names[col] = op->Input("X")[0]; } } return fetch_target_names; diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index f3afc85eb924e4b03b7597e043ffd4e267adc977..a0e81cade18c0ca5eb1b98fee8325ae2d917d1a2 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -53,6 +53,8 @@ class ProgramDesc { void Flush(); + void CopyFrom(const proto::ProgramDesc &desc); + proto::ProgramDesc *Proto(); // The output variable of feed_op is referenced as feed_target.
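The GetFeedTargetNames/GetFetchTargetNames rewrite above stops assuming feed/fetch operators are visited in `col` order and instead slots each target name at the index carried by the op's `col` attribute. A small self-contained sketch of that placement logic (names are illustrative, not the Paddle code):

#include <string>
#include <vector>

// Place each output name at the slot named by its op's `col` attribute,
// growing the vector on demand so visiting order no longer matters.
void PlaceByCol(std::vector<std::string>* names, int col,
                const std::string& out_name) {
  if (static_cast<size_t>(col) >= names->size()) names->resize(col + 1);
  (*names)[col] = out_name;
}
// Visiting (col=1, "b") then (col=0, "a") still yields {"a", "b"}.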
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index d61dbb98a235ca9af089d35318b7f4c68cb125cc..b6ba0df033af12d48e88eb57a3b97b559077250d 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -40,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type, "When calling this method, the Tensor's numel must be " "equal or larger than zero. " "Please check Tensor::Resize has been called first."); - size_t size = requested_size ? requested_size : numel() * SizeOfType(type); + size_t size = numel() * SizeOfType(type); + if (requested_size) { + PADDLE_ENFORCE_GE(requested_size, size); + size = requested_size; + } /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 429997c8b89fef7aa164e878095ab3b5c9998e5b..e9550dbfb976bee70741158b94b04084919e8271 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -26,7 +26,7 @@ namespace paddle { namespace framework { template -bool IsType(const std::type_index& type_index) { +inline bool IsType(const std::type_index& type_index) { return type_index == std::type_index(typeid(T)); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index ba7645aa02413f28a648f35e381da7824604a455..a4f6364ae5b7d832096c92e9c6d8b3e865713cff 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? cc_library(paddle_fluid_api SRCS io.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 4feaed2b0d9cdec735bd3fadc98aa2bad715c209..779ede5e460d0ceb6fd404c4a32374f9f9d92088 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,5 +1,8 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) -cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc +set(analysis_deps + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor) + +cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc helper.cc # passes @@ -10,11 +13,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph tensorrt_subgraph_node_mark_pass.cc fluid_to_ir_pass.cc model_store_pass.cc - DEPS framework_proto proto_desc ir_pass_manager graph pass) + DEPS ${analysis_deps}) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) -cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) +cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) @@ -31,7 +34,7 @@ function (inference_analysis_test TARGET) endif() cc_test(${TARGET} SRCS "${analysis_test_SRCS}" - DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detecter pass 
${analysis_test_EXTRA_DEPS} + DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS} ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) @@ -58,20 +61,25 @@ endif() inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis + analysis_predictor # ir fc_fuse_pass + fc_lstm_fuse_pass + seq_concat_fc_fuse_pass graph_viz_pass infer_clean_graph_pass - graph_pattern_detecter - infer_clean_graph_pass + graph_pattern_detector + infer_clean_graph_pass + attention_lstm_fuse_pass + paddle_inference_api pass ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) -inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) -inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) +inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api) +inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 7d16364609463e9c48720e772cebee7731dfd452..05b606cd0fb1802dfe815b3242813f42264f6366 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -72,7 +72,7 @@ class DfgPassManagerImpl final : public DfgPassManager { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax", - "depthwise_conv2d", "batch_norm"}); + "depthwise_conv2d", "batch_norm", "concat"}); if (!node->IsFunction()) return false; const auto* func = static_cast(node); @@ -102,6 +102,19 @@ class DfgPassManagerImpl final : public DfgPassManager { Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } void Analyzer::Run(Argument* argument) { + // Ugly way to support the fluid-to-ir-pass + argument->Set(kFluidToIrPassesAttr, + new std::vector({ + // Manually update the passes here.
+ "graph_viz_pass", // + "infer_clean_graph_pass", "graph_viz_pass", // + "attention_lstm_fuse_pass", "graph_viz_pass", // + "fc_lstm_fuse_pass", "graph_viz_pass", // + "seq_concat_fc_fuse_pass", "graph_viz_pass", // + "fc_fuse_pass", "graph_viz_pass" // + + })); + for (auto& x : data_) { PADDLE_ENFORCE(x->Initialize(argument)); x->RunAll(); diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index baa7600283a9bc0b81833b419a2ea64692ed2203..263fbb044902e886c357835ab298b4f646c7a3ed 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/profiler.h" DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); @@ -264,39 +265,24 @@ void TestDituRNNPrediction(const std::string &model_path, const std::string &data_path, int batch_size, bool use_analysis, bool activate_ir, int num_times = 1) { - FLAGS_IA_enable_ir = activate_ir; - FLAGS_IA_enable_tensorrt_subgraph_engine = false; - FLAGS_IA_output_storage_path = "./analysis.out"; - - std::string model_out; - if (use_analysis) { - Argument argument(model_path); - argument.model_output_store_path.reset(new std::string("./analysis.out")); - - Analyzer analyzer; - analyzer.Run(&argument); - - // Should get the transformed model stored to ./analysis.out - model_out = "./analysis.out"; - ASSERT_TRUE(PathExists(model_out)); - } else { - model_out = FLAGS_infer_ditu_rnn_model; - } - NativeConfig config; - config.prog_file = model_out + "/__model__"; - config.param_file = model_out + "/param"; + config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__"; + config.param_file = FLAGS_infer_ditu_rnn_model + "/param"; config.use_gpu = false; config.device = 0; config.specify_input_name = true; - auto predictor = + auto base_predictor = CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor(config); std::vector input_slots; DataRecord data(data_path, batch_size); // Prepare inputs. 
PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs; + std::vector outputs, base_outputs; + + base_predictor->Run(input_slots, &base_outputs); Timer timer; timer.tic(); @@ -308,37 +294,25 @@ void TestDituRNNPrediction(const std::string &model_path, << ", latency: " << timer.toc() / num_times << "ms"; LOG(INFO) << "====================================="; - for (auto &out : outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &base_out = base_outputs[i]; size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, [](int a, int b) { return a * b; }); + size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), + 1, [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_EQ(size, size1); + PADDLE_ENFORCE_GT(size, 0); float *data = static_cast(out.data.data()); - for (size_t i = 0; - i < std::min(sizeof(ditu_rnn_target_data) / sizeof(float), size); - i++) { - EXPECT_NEAR(data[i], ditu_rnn_target_data[i], 1e-3); + float *base_data = static_cast(base_out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_data[i], 1e-3); } } } -// Turn on the IR pass supportion, run a real inference and check the result. -TEST(Analyzer, SupportIRPass) { - FLAGS_IA_enable_ir = true; - FLAGS_IA_enable_tensorrt_subgraph_engine = false; - FLAGS_IA_output_storage_path = "./analysis.out"; - - Argument argument(FLAGS_inference_model_dir); - argument.model_output_store_path.reset(new std::string("./analysis.out")); - - Analyzer analyzer; - analyzer.Run(&argument); - - // Should get the transformed model stored to ./analysis.out - ASSERT_TRUE(PathExists("./analysis.out")); - - // Inference from this path. - TestWord2vecPrediction("./analysis.out"); -} - // Directly infer with the original model. TEST(Analyzer, DituRNN_without_analysis) { TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, @@ -365,5 +339,8 @@ TEST(Analyzer, DituRNN_with_analysis_with_IR) { } // namespace paddle USE_PASS(fc_fuse_pass); +USE_PASS(seq_concat_fc_fuse_pass); +USE_PASS(fc_lstm_fuse_pass); USE_PASS(graph_viz_pass); USE_PASS(infer_clean_graph_pass); +USE_PASS(attention_lstm_fuse_pass); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a17d6281a2976f0600c7ce94c2d43e65d30de265..4401d5c5a3ca8da1c04336de4be8397334d46d9e 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -26,6 +26,7 @@ #include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace inference { @@ -58,6 +59,46 @@ struct Argument { // The output storage path of ModelStorePass. std::unique_ptr model_output_store_path; + + // Support for any other attributes. 
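// A hedged usage sketch of the attribute store declared below (the key and
// the stored type are hypothetical): Set() takes ownership of the heap
// pointer and frees it in ~Argument(), Get() borrows a reference, and
// Release() hands ownership back to the caller and drops the deleter.
//
//   Argument argument;
//   argument.Set<int>("max_batch", new int(32));     // owned by argument
//   int& batch = argument.Get<int>("max_batch");     // borrowed reference
//   int* owned = argument.Release<int>("max_batch"); // caller must delete
//   delete owned;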
+ template + void Set(const std::string& key, T* data) { + PADDLE_ENFORCE_NOT_NULL(data); + PADDLE_ENFORCE(!attrs_.count(key), "duplicate attr called %s", key); + attrs_[key] = data; + attr_deleters_[key] = [data, key, this]() { + VLOG(3) << "argument delete attr: " << key; + delete data; + }; + } + + bool Has(const std::string& name) const { return attrs_.count(name); } + + template + T* Release(const std::string& key) { + PADDLE_ENFORCE(attrs_.count(key)); + auto* res = boost::any_cast(attrs_.at(key)); + attrs_.erase(key); + attr_deleters_.erase(key); + return res; + } + + template + T& Get(const std::string& key) { + PADDLE_ENFORCE(Has(key)); + return *boost::any_cast(attrs_.at(key)); + } + + ~Argument() { + for (auto& item : attr_deleters_) { + item.second(); + } + } + + private: + std::unordered_map attrs_; + std::unordered_map> attr_deleters_; }; #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 8c7dd146e429a7f5cd28bdd418e457e8ea5680bd..8ca402da31f52f1a68a04b5de368c9c659a3a108 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/io.h" namespace paddle { namespace inference { @@ -65,6 +66,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { } } + if (argument_->Has("param_scope")) { + LOG(WARNING) << "parameter changes in the scope take effect"; + } + PADDLE_ENFORCE(argument_->transformed_program_desc.get()); } diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h index 4bf1840fdda8508b52d7274a338c5b1c95baf354..4693729cb43d7a9df96b11c4bf3064a70d1db4c3 100644 --- a/paddle/fluid/inference/analysis/dot.h +++ b/paddle/fluid/inference/analysis/dot.h @@ -29,13 +29,13 @@ namespace paddle { namespace inference { namespace analysis { +static size_t dot_node_counter{0}; + /* * A Dot template that helps to build a DOT graph definition.
*/ class Dot { public: - static size_t counter; - struct Attr { std::string key; std::string value; @@ -57,7 +57,7 @@ class Dot { Node(const std::string& name, const std::vector& attrs) : name(name), attrs(attrs), - id_("node_" + std::to_string(Dot::counter++)) {} + id_("node_" + std::to_string(dot_node_counter++)) {} std::string id() const { return id_; } @@ -65,6 +65,10 @@ class Dot { std::stringstream ss; CHECK(!name.empty()); ss << id_; + if (attrs.empty()) { + ss << "[label=" << '"' << name << '"' << "]"; + return ss.str(); + } for (size_t i = 0; i < attrs.size(); i++) { if (i == 0) { ss << "[label=" << '"' << name << '"' << " "; @@ -108,9 +112,11 @@ class Dot { explicit Dot(const std::vector& attrs) : attrs_(attrs) {} - void AddNode(const std::string& name, const std::vector& attrs) { - CHECK(!nodes_.count(name)) << "duplicate Node '" << name << "'"; - nodes_.emplace(name, Node{name, attrs}); + void AddNode(const std::string& id, const std::vector& attrs, + std::string label = "") { + CHECK(!nodes_.count(id)) << "duplicate Node '" << id << "'"; + if (label.empty()) label = id; + nodes_.emplace(id, Node{label, attrs}); } void AddEdge(const std::string& source, const std::string& target, diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc index 073f49752872cbb65fddc74be75ec28d4dd0bbaf..5e53fff39213b53bc78e9272a7efd26d7ee91023 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc @@ -13,3 +13,47 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void FluidToIrPass::EnableParamModify(const std::string &model_dir, + const std::string &prog_file, + const std::string ¶m_file) { + PADDLE_ENFORCE(argument_); + argument_->Set("param_scope", new framework::Scope); + // Load parameters. 
+ VLOG(3) << "Loading parameters from " << model_dir; + LoadParams(&argument_->Get("param_scope"), model_dir, + prog_file, param_file); +} + +bool FluidToIrPass::LoadParams(framework::Scope *scope, const std::string &dir, + const std::string &prog_file, + const std::string ¶m_file) { + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + framework::Executor executor(place); + PADDLE_ENFORCE(argument_->origin_program_desc.get()); + framework::ProgramDesc program(*argument_->origin_program_desc); + if ((!prog_file.empty()) && (!param_file.empty())) { + LOG(INFO) << "load single model file from " << prog_file; + Load(&executor, scope, prog_file, param_file); + } else if (!dir.empty()) { + LOG(INFO) << "load from dir " << dir; + Load(&executor, scope, dir); + } else { + LOG(ERROR) << "failed to load parameters"; + return false; + } + return true; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h index fa3f8d313bbdd6733fa3878dd7023e125b6ced36..29008105f82989f5797116e78990853880708936 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h @@ -21,12 +21,17 @@ namespace paddle { namespace inference { namespace analysis { +static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__"; + class FluidToIrPass final : public DataFlowGraphPass { public: FluidToIrPass() = default; bool Initialize(Argument *argument) override { ANALYSIS_ARGUMENT_CHECK_FIELD(argument); + PADDLE_ENFORCE(argument->Has(kFluidToIrPassesAttr), + "argument need the attr %s", kFluidToIrPassesAttr); + argument_ = argument; if (argument->origin_program_desc) { LOG(WARNING) << "argument's origin_program_desc is already set, might " "duplicate called"; @@ -46,12 +51,21 @@ class FluidToIrPass final : public DataFlowGraphPass { if (!argument->main_dfg) { argument->main_dfg.reset(new DataFlowGraph); } - // Persist the ProgramDesc in graph's attribute. The IR graph just keep the - // address, will segfault if the original ProgramDesc destroys. - auto &ir_program_p = argument->main_dfg->Attr("ir_program_desc").Pointer(); - ir_program_p = new framework::ProgramDesc(program); + argument->Set("ir_program_desc", new framework::ProgramDesc(program)); + + LOG(INFO) << "Loading parameters"; + // Load parameters to argument if needed. + if (argument->fluid_model_dir || (argument->fluid_model_program_path && + argument->fluid_model_param_path)) { +#define SAFE_GET(ATTR) std::string ATTR = argument->ATTR ? *argument->ATTR : ""; + SAFE_GET(fluid_model_dir); + SAFE_GET(fluid_model_program_path); + SAFE_GET(fluid_model_param_path); +#undef SAFE_GET + EnableParamModify(fluid_model_dir, fluid_model_program_path, + fluid_model_param_path); + } - argument_ = argument; return true; } @@ -59,20 +73,36 @@ class FluidToIrPass final : public DataFlowGraphPass { void Run(DataFlowGraph *graph) override { // Call all the IR Passes - IRPassManager ir_passes(*static_cast( - argument_->main_dfg->Attr("ir_program_desc").Pointer())); - ir_passes.Apply(std::vector( - {// Manual update the passes here. - "graph_viz_pass", "infer_clean_graph_pass", "graph_viz_pass", - "fc_fuse_pass", "graph_viz_pass"})); + IRPassManager ir_passes( + argument_->Get("ir_program_desc"), nullptr); + // Pass the scope from analysis to IR if needed. 
+ if (argument_->Has("param_scope")) { + // Here the address is passed; note that IR doesn't own the scope, so + // the real scope in analysis should live during the IR phase. + ir_passes.graph().Set( + "param_scope", new framework::Scope *( + &argument_->Get("param_scope"))); + } + + const auto &ir_passes_to_apply = + argument_->Get>(kFluidToIrPassesAttr); + ir_passes.Apply(ir_passes_to_apply); PADDLE_ENFORCE(argument_->main_dfg.get()); argument_->main_dfg->Build(ir_passes.graph()); - // PADDLE_ENFORCE(argument_->main_dfg->IsFullyConnected()); } + void EnableParamModify(const std::string &model_dir, + const std::string &prog_file, + const std::string &param_file); + std::string repr() const override { return "fluid-to-ir-pass"; } + private: + // Load parameters from a single file or from a directory. + bool LoadParams(framework::Scope *scope, const std::string &dir, + const std::string &prog_file, const std::string &param_file); + private: Argument *argument_{nullptr}; }; diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc index af934f261baa3807059ce6ab036545594630df58..6a13c60e7b2ebf645b12d5ddf83ef6ab3a2e83bd 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc @@ -24,6 +24,8 @@ namespace analysis { TEST(FluidToIrPass, Test) { FluidToIrPass pass; Argument argument(FLAGS_inference_model_dir); + argument.Set(kFluidToIrPassesAttr, + new std::vector({"infer_clean_graph_pass"})); pass.Initialize(&argument); pass.Run(argument.main_dfg.get()); } @@ -32,6 +34,9 @@ TEST(FluidToIrPass, Test) { } // namespace inference } // namespace paddle -USE_PASS(fc_fuse_pass); USE_PASS(graph_viz_pass); USE_PASS(infer_clean_graph_pass); +USE_PASS(attention_lstm_fuse_pass); +USE_PASS(fc_lstm_fuse_pass); +USE_PASS(seq_concat_fc_fuse_pass); +USE_PASS(fc_fuse_pass); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index d849b637bcf3fe3944ad11680bbe041e19a71e24..5da5241e49a2f7c8c0951e1a3c31784b8af65134 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -14,20 +14,24 @@ #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace inference { namespace analysis { -IRPassManager::IRPassManager(const ProgramDesc& program) { +IRPassManager::IRPassManager(const ProgramDesc &program, + framework::Scope *scope) + : program_(program) { graph_.reset(new framework::ir::Graph(program)); + if (scope) graph_->Set("param_scope", new framework::Scope *(scope)); } -void IRPassManager::Apply(const std::vector& passes) { - graph_->Set("graph_viz_path", new std::string("./1.dot")); +void IRPassManager::Apply(const std::vector &passes) { // Apply all the passes std::string pre_pass; - for (const std::string& pass_name : passes) { + for (const std::string &pass_name : passes) { LOG(WARNING) << "Running IR pass [" << pass_name << "]"; auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); if (pass_name == "graph_viz_pass") { diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 3338e37ecf1c591a631fd829a05b07e562af703e..bb230283b7c2cc783d0b68ea0aa3cca1cabc75e6 100644 ---
b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace inference { @@ -31,14 +32,15 @@ using framework::ProgramDesc; class IRPassManager final { public: - IRPassManager(const ProgramDesc& program); + IRPassManager(const ProgramDesc &program, framework::Scope *scope); - void Apply(const std::vector& passes); + void Apply(const std::vector &passes); - framework::ir::Graph& graph() const { return *graph_; } + framework::ir::Graph &graph() const { return *graph_; } private: std::unique_ptr graph_; + ProgramDesc program_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index cfdca33882ea00a28e3ea51ca5fd77ec9605bf3a..ff5ec94265a4f05c1294ad6c8ac5f86c249b84b6 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -33,9 +33,9 @@ bool PassManager::Initialize(Argument* argument) { void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); - LOG(INFO) << "Total " << data_.size() << " passes"; + LOG(INFO) << "Total " << data_.size() << " Analysis passes"; for (auto& pass : data_) { - LOG(WARNING) << "Running pass [" << pass->repr() << "]"; + LOG(WARNING) << "Running Analysis pass [" << pass->repr() << "]"; pass->Run(argument_->main_dfg.get()); } } diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 0ca1af455ca10fa6995ad3a1c33825108a3fd7ad..adfe4392448557a30cd834022b9a5d21d9086b95 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -20,7 +20,7 @@ endif(APPLE) set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager graph_viz_pass fc_fuse_pass - infer_clean_graph_pass + infer_clean_graph_pass ) if(WITH_GPU AND TENSORRT_FOUND) @@ -46,7 +46,8 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api) cc_test(test_paddle_inference_api SRCS api_tester.cc diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b29b233822330e3c1441793ce036b9b9278721b --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
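The AnalysisPredictor defined below is reached through the existing CreatePaddlePredictor factory by selecting the new kAnalysis engine kind. A rough usage sketch, assuming a CPU model saved to a placeholder directory:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Sketch: build an analysis-enabled predictor; error handling elided.
int main() {
  paddle::NativeConfig config;
  config.model_dir = "/path/to/model";  // hypothetical location
  config.use_gpu = false;
  config.device = 0;
  config.specify_input_name = true;
  auto predictor = paddle::CreatePaddlePredictor<
      paddle::NativeConfig, paddle::PaddleEngineKind::kAnalysis>(config);
  // predictor->Run(inputs, &outputs) then executes the IR-optimized program.
  return predictor ? 0 : 1;
}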
+ +#include +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { + +using inference::analysis::Argument; +using inference::Singleton; +using inference::analysis::Analyzer; +using framework::proto::ProgramDesc; + +/* This predictor is based on the original native predictor with IR and Analysis + * support. It will optimize the IR and parameters at runtime. + * TODO(Superjomn) Replace the Native predictor? + */ +class AnalysisPredictor : public NativePaddlePredictor { + public: + explicit AnalysisPredictor(const NativeConfig& config) + : NativePaddlePredictor(config), config_(config) {} + + bool Init(const std::shared_ptr& parent_scope) { + VLOG(3) << "Predictor::init()"; + if (config_.use_gpu) { + place_ = paddle::platform::CUDAPlace(config_.device); + } else { + place_ = paddle::platform::CPUPlace(); + } + PADDLE_ENFORCE(!parent_scope); + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + + executor_.reset(new paddle::framework::Executor(place_)); + + // Initialize the inference program + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files located in + // the specified `dirname`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << "failed to load inference model."; + return false; + } + + OptimizeInferenceProgram(); + ctx_ = executor_->Prepare(*inference_program_, 0); + + VLOG(5) << "to create variables"; + PADDLE_ENFORCE(scope_.get()); + executor_->CreateVariables(*inference_program_, + sub_scope_ ? sub_scope_ : scope_.get(), 0); + + // Get the feed_target_names and fetch_target_names + feed_target_names_ = inference_program_->GetFeedTargetNames(); + fetch_target_names_ = inference_program_->GetFetchTargetNames(); + return true; + } + + bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) override { + return NativePaddlePredictor::Run(inputs, output_data, batch_size); + } + + void OptimizeInferenceProgram() { + LOG(INFO) << "optimize begin"; + FLAGS_IA_enable_ir = true; + FLAGS_IA_enable_tensorrt_subgraph_engine = false; + FLAGS_IA_output_storage_path = ""; // Don't output the model.
+ // Analyze inference_program + Argument argument; + if (!config_.model_dir.empty()) { + argument.fluid_model_dir.reset(new std::string(config_.model_dir)); + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); + argument.fluid_model_program_path.reset( + new std::string(config_.prog_file)); + argument.fluid_model_param_path.reset( + new std::string(config_.param_file)); + } + argument.origin_program_desc.reset( + new ProgramDesc(*inference_program_->Proto())); + Singleton::Global().Run(&argument); + CHECK(argument.transformed_program_desc); + VLOG(5) << "to prepare executor"; + // LOG(INFO) << "transformed_program_desc " << + // argument.transformed_program_desc->DebugString(); + inference_program_.reset( + new framework::ProgramDesc(*argument.transformed_program_desc)); + PADDLE_ENFORCE(argument.Has("param_scope")); + // Update scope. + scope_.reset(argument.Release("param_scope")); + LOG(INFO) << "optimize end =="; + } + + private: + NativeConfig config_; +}; + +template <> +std::unique_ptr CreatePaddlePredictor< + NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) { + VLOG(3) << "create AnalysisPredictor"; + if (config.use_gpu) { + // 1. GPU memory + PADDLE_ENFORCE_GT( + config.fraction_of_gpu_memory, 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); + PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + std::vector flags; + if (config.fraction_of_gpu_memory >= 0.0f || + config.fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + std::to_string(config.fraction_of_gpu_memory); + flags.push_back(flag); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } + } + + std::unique_ptr predictor(new AnalysisPredictor(config)); + if (!dynamic_cast(predictor.get())->Init(nullptr)) { + return nullptr; + } + return predictor; +} + +} // namespace paddle + +USE_PASS(fc_fuse_pass); +USE_PASS(graph_viz_pass); +USE_PASS(infer_clean_graph_pass); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 9ac037297167fe7de29925ffe36f4d39efb65313..93de7a5209e7dc289b4b02e73ef3bb20bfc8c774 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -32,6 +32,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { : NativePaddlePredictor(config), config_(config) {} bool Init(const std::shared_ptr& parent_scope) { + FLAGS_IA_enable_tensorrt_subgraph_engine = true; VLOG(3) << "Predictor::init()"; FLAGS_tensorrt_max_batch_size = config_.max_batch_size; FLAGS_tensorrt_workspace_size = config_.workspace_size; @@ -161,3 +162,4 @@ USE_TRT_CONVERTER(fc); USE_TRT_CONVERTER(pool2d); USE_TRT_CONVERTER(softmax); USE_TRT_CONVERTER(batch_norm); +USE_TRT_CONVERTER(concat); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 8f1a72316d6c146ebc9a86ced739ef088a3b4267..9e7425eddd2df07ffe897f908aad360abe42117a 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -37,6 +37,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { config1.use_gpu = true; config1.fraction_of_gpu_memory
= 0.3; config1.device = 0; + config1.max_batch_size = 10; auto predictor0 = CreatePaddlePredictor(config0); diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..9cc491e10d691a206dd903b78c0ea570741da44c --- /dev/null +++ b/paddle/fluid/inference/api/helper.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/helper.h" + +namespace paddle { +namespace inference { + +template <> +std::string to_string>( + const std::vector> &vec) { + std::stringstream ss; + for (const auto &piece : vec) { + ss << to_string(piece) << "\n"; + } + return ss.str(); +} + +template <> +std::string to_string>>( + const std::vector>> &vec) { + std::stringstream ss; + for (const auto &line : vec) { + for (const auto &rcd : line) { + ss << to_string(rcd) << ";\t"; + } + ss << '\n'; + } + return ss.str(); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 2c166cc0622f68e6d527005795c21236ccf43c33..e44b1b74bc385c015fa6efcebac05359a810cbc1 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -44,7 +44,8 @@ class Timer { } }; -void split(const std::string &str, char sep, std::vector *pieces) { +static void split(const std::string &str, char sep, + std::vector *pieces) { pieces->clear(); if (str.empty()) { return; @@ -60,7 +61,8 @@ void split(const std::string &str, char sep, std::vector *pieces) { pieces->push_back(str.substr(pos)); } } -void split_to_float(const std::string &str, char sep, std::vector *fs) { +static void split_to_float(const std::string &str, char sep, + std::vector *fs) { std::vector pieces; split(str, sep, &pieces); std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs), @@ -76,27 +78,14 @@ std::string to_string(const std::vector &vec) { } template <> std::string to_string>( - const std::vector> &vec) { - std::stringstream ss; - for (const auto &piece : vec) { - ss << to_string(piece) << "\n"; - } - return ss.str(); -} + const std::vector> &vec); + template <> std::string to_string>>( - const std::vector>> &vec) { - std::stringstream ss; - for (const auto &line : vec) { - for (const auto &rcd : line) { - ss << to_string(rcd) << ";\t"; - } - ss << '\n'; - } - return ss.str(); -} + const std::vector>> &vec); + // clang-format off -void TensorAssignData(PaddleTensor *tensor, const std::vector> &data) { +static void TensorAssignData(PaddleTensor *tensor, const std::vector> &data) { // Assign buffer int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; }); tensor->data.Resize(sizeof(float) * dim); diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 
36fd0727aa7beef4a06a5f2e63ec0c43583ddf84..1baa64c249f291ec1bc874be5031abe6d4368274 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -77,6 +77,7 @@ enum class PaddleEngineKind {
  kNative = 0,         // Use the native Fluid facility.
  kAnakin,             // Use Anakin for inference.
  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
+  kAnalysis
  // TODO(Superjomn) support following engines later.
  // kTensorRT,           // Use TensorRT for inference.
  // kAutoMixedAnakin,    // Automatically mix Fluid with Anakin.
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 181868977dd8f2568486ed0c4e1f260a69795896..cef7b2a7e3a29da05628d7540f5545dc9adda27e 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -143,5 +143,21 @@ std::unique_ptr<framework::ProgramDesc> Load(
  return main_program;
 }
+void SaveVars(const framework::Scope& scope,
+              const std::vector<std::string>& vars, const std::string& dirname,
+              bool predicate) {
+  framework::ProgramDesc prog;
+  auto* block = prog.MutableBlock(0);
+  auto* op = block->AppendOp();
+  op->SetType("save_combine");
+  op->SetInput("X", vars);
+  op->SetAttr("file_path", dirname + "/param");
+  op->CheckAttrs();
+
+  platform::CPUPlace place;
+  framework::Executor exe(place);
+  exe.Run(prog, const_cast<framework::Scope*>(&scope), 0, true, true);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index 01b50b3670cb9da2e0be232a61ea6129dd83aa20..ab492577c1476abee30d6dd1c740394391e5a93a 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -41,5 +41,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                             const std::string& prog_filename,
                                             const std::string& param_filename);
+// Save the variables from a scope to disk.
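+// The io.cc implementation above builds a one-op program holding a single
+// save_combine op and runs it on CPU, so every listed variable is written
+// into one file, <dirname>/param. A minimal, hypothetical call site
+// (variable names are illustrative only):
+//
+//   framework::Scope scope;
+//   // ... fill the scope, e.g. by running the startup/main program ...
+//   paddle::inference::SaveVars(scope, {"fc_0.w_0", "fc_0.b_0"},
+//                               "/tmp/my_model");  // writes /tmp/my_model/param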
+void SaveVars(const framework::Scope& scope,
+              const std::vector<std::string>& vars, const std::string& dirname,
+              bool predicate = true);
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 2a449eb95e3537a11962912a6a3f29e89958fbd8..9d7be2d03cf7bb12afe7e52d9630f184d689dc25 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,7 +1,7 @@ # Add TRT tests
 nv_library(tensorrt_converter
 SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc
+batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc
 DEPS tensorrt_engine operator scope framework_proto op_registry)
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -18,12 +18,12 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
 nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
-
 nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
-
 nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
-
 nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
+
+nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb9627bf957b63993b2c8d23e7ec8122eb004eaf
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * ConcatOp, IConcatenationLayer in TRT. This layer has no weights.
+ */
+class ConcatOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid concat op to tensorrt concat layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    std::vector<nvinfer1::ITensor*> itensors;
+    for (auto& input_name : op_desc.Input("X")) {
+      itensors.push_back(engine_->GetITensor(input_name));
+    }
+    int axis = boost::get<int>(op_desc.GetAttr("axis"));
+    PADDLE_ENFORCE(axis > 0,
+                   "The axis attr of Concat op should be larger than 0 for trt");
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
+                                       itensors.size());
+    // TensorRT shapes exclude the batch dimension while the fluid axis
+    // counts it, so shift the axis by one.
+    axis = axis - 1;  // Remove batch dim
+    layer->setAxis(axis);
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 41faaf7212accaaec238062b1340e8da8fa6be33..d309d94c560f2b484fac6b6cd40cc2704d641069 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -79,6 +79,14 @@ class OpConverter {
      it = Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
    }
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                            op_desc.Type());
+  }
+
+  if (op_desc.Type() == "depthwise_conv2d") {
+    it = Registry<OpConverter>::Lookup("conv2d");
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                            op_desc.Type());
    }
    if (!it) {
diff --git a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f284a4db5758e072915d7fd0f16115b8a36ba8b
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(concat_op, test) { + std::unordered_set parameters({""}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1)); + validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1)); + validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1)); + validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("concat"); + desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"}); + desc.SetOutput("Out", {"concat_out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(5); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(concat); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 695790a37dce889e838462b401ca4e89f09271d5..94f0550df57e79fa68c135f5c9c4b7effe6ac156 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/profiler.h" @@ -135,6 +136,15 @@ std::vector> GetFeedTargetShapes( return feed_target_shapes; } +void Compile(paddle::framework::ProgramDesc* program) { + std::unique_ptr g( + new paddle::framework::ir::Graph(*program)); + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "graph_to_program_pass"); + pass->SetNotOwned("program", program); + pass->Apply(std::move(g)); +} + template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, @@ -172,6 +182,8 @@ void TestInference(const std::string& dirname, paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); } + Compile(inference_program.get()); + // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, "load_program_profiler"); @@ -249,3 +261,5 @@ void TestInference(const std::string& dirname, delete scope; } + +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8da0aaaafeb151e8f1900bc66f06e771c857fc00..2f8fadcefe5d5b6f428092914c7e999ec7524862 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -291,6 +291,8 @@ op_library(unsqueeze_op DEPS reshape_op) op_library(squeeze_op DEPS reshape_op) op_library(extract_rows_op DEPS memory) op_library(flatten_op DEPS reshape_op) +op_library(sequence_pad_op DEPS sequence_padding) +op_library(unstack_op DEPS stack_op) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 8bab37c5830dfdcd5d6ccf1cc049387b496b0d04..a02128c5a54c80ca7ccf9db347cd53f28bbb50f8 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -56,7 +56,7 @@ void 
AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { const int D = w_dims[1] / 4; PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2."); PADDLE_ENFORCE_EQ(w_dims[0], D + M, - "LSTMWeight dims should be (%d + %d) * %d.", D + M, 4 * D); + "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D); auto b_dims = ctx->GetInputDim("LSTMBias"); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index 0651203286c0fa17866d333edffaea1b56f23005..0a18585edb54a76aff5ae72ecc71e0eebb9f9361 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -60,20 +60,6 @@ class AucKernel : public framework::OpKernel { const T* inference_data = predict->data(); const auto* label_data = label->data(); - // check if states are inited. - auto* tp_in = ctx.Input("TP"); - auto* fp_in = ctx.Input("FP"); - auto* tn_in = ctx.Input("TN"); - auto* fn_in = ctx.Input("FN"); - PADDLE_ENFORCE(tp_in->IsInitialized(), "true_positive is not inited!"); - PADDLE_ENFORCE(fp_in->IsInitialized(), "false_negative is not inited!"); - PADDLE_ENFORCE(tn_in->IsInitialized(), "true_negative is not inited!"); - PADDLE_ENFORCE(fn_in->IsInitialized(), "false_positive is not inited!"); - PADDLE_ENFORCE_EQ(tp_in->numel(), num_thresholds, ""); - PADDLE_ENFORCE_EQ(fp_in->numel(), num_thresholds, ""); - PADDLE_ENFORCE_EQ(tn_in->numel(), num_thresholds, ""); - PADDLE_ENFORCE_EQ(fn_in->numel(), num_thresholds, ""); - auto* tp_data = true_positive->mutable_data(ctx.GetPlace()); auto* fn_data = false_negative->mutable_data(ctx.GetPlace()); auto* tn_data = true_negative->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 9ab2179b5fe689762704039c5f67dd080e530aa5..de641cb08e4cc3322cc8387d873f2aaab279e1dd 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -37,6 +37,95 @@ struct bn_type_traits { using op_prim = typename op_type::primitive_desc; }; +class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { + public: + BatchNormMKLDNNHandler( + std::shared_ptr batch_norm_pd, + const platform::MKLDNNDeviceContext &dev_ctx, mkldnn::engine engine, + const std::string &base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) { + batch_norm_pd_ = batch_norm_pd; + } + + std::shared_ptr AcquireScaleshiftMemoryFromPrimitive(void *ptr) { + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->weights_primitive_desc(), ptr, "@scaleshift_mem_p"); + } + + std::shared_ptr AcquireMeanMemoryFromPrimitive(void *ptr) { + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->mean_primitive_desc(), ptr, "@mean_mem_p"); + } + + std::shared_ptr AcquireVarianceMemoryFromPrimitive(void *ptr) { + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p"); + } + + std::shared_ptr AcquireTestTrainingBatchNormFwd( + std::shared_ptr src_memory, + std::shared_ptr scaleshift_memory, + std::shared_ptr dst_memory, std::shared_ptr mean_memory, + std::shared_ptr variance_memory, bool is_test) { + auto prim_key = key_ + "@batch_norm_p"; + auto batch_norm_p = + std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + + PADDLE_ENFORCE((batch_norm_p != nullptr) || !is_reusing_, + "Fail to find batch norm primitive in device context"); + + if (batch_norm_p == nullptr) { + if (is_test) { + batch_norm_p = 
std::make_shared( + *batch_norm_pd_, *src_memory, + (const mkldnn::primitive::at &)*mean_memory, + (const mkldnn::primitive::at &)*variance_memory, *scaleshift_memory, + *dst_memory); + } else { + batch_norm_p = std::make_shared( + *batch_norm_pd_, *src_memory, *scaleshift_memory, *dst_memory, + *mean_memory, *variance_memory); + } + + dev_ctx_.SetBlob(prim_key, batch_norm_p); + } else { + is_reusing_ = true; + } + + return batch_norm_p; + } + + static std::string GetHash(const memory::dims &input_dims, float epsilon, + unsigned flag, bool is_test, memory::format format, + const std::string &suffix = "") { + auto dims2str = [](const memory::dims &operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + return dims2str(input_dims) + std::to_string(epsilon) + + std::to_string(flag) + std::to_string(is_test) + + std::to_string(format) + suffix; + } + + private: + std::shared_ptr batch_norm_pd_; +}; + +std::shared_ptr UpdateMemoryData( + const platform::MKLDNNDeviceContext &dev_ctx, const std::string &key, + void *new_ptr) { + auto mem = std::static_pointer_cast(dev_ctx.GetBlob(key)); + PADDLE_ENFORCE( + mem != nullptr, + (std::string("Fail to find memory in device context [key: ") + key + "]") + .c_str()); + mem->set_data_handle(new_ptr); + return mem; +} + template void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, Container *c) { @@ -48,15 +137,6 @@ void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end)))); } -template -void run_batch_norm_op(Args &&... args) { - Op batch_norm_op{args...}; - - std::vector pipeline; - pipeline.push_back(batch_norm_op); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); -} - } // namespace template @@ -110,6 +190,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); const unsigned int ic = scale_tz[0]; + // MKLDNN requires a single piece of memory for scale and shift/bias data + const size_t scaleshift_size = 2 * ic; + std::vector scaleshift_data; + scaleshift_data.reserve(scaleshift_size); + + copy_to_weights(scale->data(), scale->data() + ic, shift->data(), + shift->data() + ic, &scaleshift_data); + unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; @@ -118,64 +206,69 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); - auto src_memory = memory( - {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, - to_void_cast(x_data)); + // keys for backward pass + const std::string key = BatchNormMKLDNNHandler::GetHash( + src_tz, epsilon, flags, is_test, input_format, + ctx.op().Output("SavedMean")); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input_format); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; - auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ - propagation, src_memory.get_primitive_desc().desc(), epsilon, flags}; - std::shared_ptr batch_norm_fwd_pd = - std::shared_ptr( - new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc, - mkldnn_engine)); - - 
// Save the pd to be used in backward pass - const std::string key = ctx.op().Output("SavedMean"); - const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + auto batch_norm_fwd_desc = + bn_fwd_types::op_desc{propagation, user_src_md, epsilon, flags}; + auto batch_norm_fwd_pd = std::make_shared( + batch_norm_fwd_desc, mkldnn_engine); + // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd); - // MKLDNN requires a single piece of memory for scale and shift/bias data - const size_t scaleshift_size = 2 * ic; - std::vector scaleshift_data; - scaleshift_data.reserve(scaleshift_size); + BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, + key); - copy_to_weights(scale->data(), scale->data() + ic, shift->data(), - shift->data() + ic, &scaleshift_data); + auto src_memory = + handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data)); // crate mkldnn memory for weights(scale/shift) - auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(), - scaleshift_data.data()); + auto scaleshift_memory = + handler.AcquireScaleshiftMemoryFromPrimitive(scaleshift_data.data()); // create mkldnn memory for output y tensor - auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data); + auto dst_memory = handler.AcquireDstMemory( + batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data); + std::shared_ptr batch_norm_p; if (is_test) { // create mkldnn memory for stats (as input) - auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(), - to_void_cast(mean_data)); - auto variance_memory = - memory(batch_norm_fwd_pd->variance_primitive_desc(), - to_void_cast(variance_data)); - - run_batch_norm_op( - *batch_norm_fwd_pd, src_memory, - (const mkldnn::primitive::at &)mean_memory, - (const mkldnn::primitive::at &)variance_memory, scaleshift_memory, - dst_memory); + std::shared_ptr mean_memory = + handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data)); + std::shared_ptr variance_memory = + handler.AcquireVarianceMemoryFromPrimitive( + to_void_cast(variance_data)); + + batch_norm_p = handler.AcquireTestTrainingBatchNormFwd( + src_memory, scaleshift_memory, dst_memory, mean_memory, + variance_memory, true); } else { // create mkldnn memory for stats (as output) - auto mean_memory = - memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data); - auto variance_memory = memory( - batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data); - - run_batch_norm_op(*batch_norm_fwd_pd, src_memory, - scaleshift_memory, dst_memory, - mean_memory, variance_memory); + std::shared_ptr mean_memory = + handler.AcquireMeanMemoryFromPrimitive(batch_mean_data); + std::shared_ptr variance_memory = + handler.AcquireVarianceMemoryFromPrimitive(batch_variance_data); + + batch_norm_p = handler.AcquireTestTrainingBatchNormFwd( + src_memory, scaleshift_memory, dst_memory, mean_memory, + variance_memory, false); } + y->set_layout(DataLayout::kMKLDNN); + y->set_format(platform::GetMKLDNNFormat(*dst_memory)); + + std::vector pipeline; + pipeline.push_back(*batch_norm_p); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + if (!is_test) { // mkldnn only compute stats for current batch // so we need compute momentum stats via Eigen lib @@ -192,10 +285,6 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { running_variance_e = variance_e * momentum + batch_variance_e * one_minus_momentum; } - - y->set_layout(DataLayout::kMKLDNN); - y->set_format( - 
(memory::format)dst_memory.get_primitive_desc().desc().data.format); } }; @@ -242,61 +331,48 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { const unsigned int ic = scale_tz[0]; - // Retrieve bn_fwd_pd from device context - const std::string key = ctx.op().Input("SavedMean"); - const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; - auto batch_norm_fwd_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_batch_norm_fwd_pd)); - PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, - "Fail to find batch_norm_fwd_pd in device context"); - using bn_bwd_types = bn_type_traits; - // create mkldnn memory from input diff_y tensor - mkldnn::memory::format dst_format = platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); - auto user_diff_dst_memory = memory( - {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, - to_void_cast(diff_y_data)); - - // create mkldnn memory from input x tensor mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); - auto src_memory = memory( - {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, - to_void_cast(x_data)); + unsigned flags = mkldnn::use_scale_shift; - // for diff_dst, try to use same format as dst in forward pass - auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); - auto diff_dst_md = diff_dst_pd.desc(); + // keys from forward pass + const std::string key = BatchNormMKLDNNHandler::GetHash( + src_tz, epsilon, flags, false, input_format, + ctx.op().Input("SavedMean")); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + + // keys for primitives reuse + const std::string key_with_hash = + key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false, + input_format); + const std::string key_batch_norm_bwd_p = + key_with_hash + "@batch_norm_bwd_p"; + const std::string key_batch_norm_src_mem_p = + key_with_hash + "@batch_norm_bwd_src_mem_p"; + const std::string key_batch_norm_mean_mem_p = + key_with_hash + "@batch_norm_bwd_mean_mem_p"; + const std::string key_batch_norm_variance_mem_p = + key_with_hash + "@batch_norm_bwd_variance_mem_p"; + const std::string key_batch_norm_scaleshift_mem_p = + key_with_hash + "@batch_norm_bwd_scaleshift_mem_p"; + const std::string key_batch_norm_diff_scaleshift_mem_p = + key_with_hash + "@batch_norm_bwd_diff_scaleshift_mem_p"; + const std::string key_batch_norm_diff_src_mem_p = + key_with_hash + "@batch_norm_bwd_diff_src_mem_p"; + const std::string key_batch_norm_diff_dst_mem_p = + key_with_hash + "@batch_norm_bwd_diff_dst_mem_p"; - // create primitive descriptor for batch norm backward - unsigned flags = mkldnn::use_scale_shift; - auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ - mkldnn::prop_kind::backward, diff_dst_md, - src_memory.get_primitive_desc().desc(), epsilon, flags}; - auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ - batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; - - // reorder user_diff_dst if it's not in preferred format - auto diff_dst_memory = user_diff_dst_memory; primitive reorder_diff_dst; bool is_diff_dst_reordered = false; - if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { - diff_dst_memory = memory(diff_dst_pd); - reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory); - is_diff_dst_reordered = true; - } - - // create mkldnn memory for input tensors (src/mean/variance) - auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(), - to_void_cast(batch_mean_data)); - auto variance_memory = 
memory(batch_norm_bwd_pd.variance_primitive_desc(), - to_void_cast(batch_variance_data)); + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -306,30 +382,118 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic, &scaleshift_data); - // create mkldnn memory for input tensors (scale/shift) - auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(), - scaleshift_data.data()); - - // create mkldnn memory for output diff weights (combined scale/shift) std::vector diff_scaleshift_data; diff_scaleshift_data.reserve(scaleshift_size); - auto diff_scaleshift_memory = - memory(batch_norm_bwd_pd.diff_weights_primitive_desc(), - diff_scaleshift_data.data()); - // here assume diff_src is in the same format of src - auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data); + auto batch_norm_fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_fwd_pd)); + PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, + "Fail to find batch_norm_fwd_pd in device context"); - // finally create batch_norm backward primitive - auto batch_norm_bwd_prim = - batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory, - variance_memory, diff_dst_memory, scaleshift_memory, - diff_src_memory, diff_scaleshift_memory); + auto batch_norm_bwd_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_bwd_p)); + + if (batch_norm_bwd_p == nullptr) { + auto src_memory = std::shared_ptr(new memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data))); + + // for diff_dst, try to use same format as dst in forward pass + auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); + auto diff_dst_md = diff_dst_pd.desc(); + + // create primitive descriptor for batch norm backward + auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ + mkldnn::prop_kind::backward, diff_dst_md, + src_memory->get_primitive_desc().desc(), epsilon, flags}; + auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ + batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; + + // reorder user_diff_dst if it's not in preferred format + auto diff_dst_memory = std::make_shared(user_diff_dst_memory); + if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = std::make_shared(diff_dst_pd); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + // create mkldnn memory for input tensors (src/mean/variance) + auto mean_memory = + std::make_shared(batch_norm_bwd_pd.mean_primitive_desc(), + to_void_cast(batch_mean_data)); + auto variance_memory = + std::make_shared(batch_norm_bwd_pd.variance_primitive_desc(), + to_void_cast(batch_variance_data)); + + // create mkldnn memory for input tensors (scale/shift) + auto scaleshift_memory = std::make_shared( + batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()); + + // create mkldnn memory for output diff weights (combined scale/shift) + auto diff_scaleshift_memory = std::make_shared( + batch_norm_bwd_pd.diff_weights_primitive_desc(), + diff_scaleshift_data.data()); + + // here assume diff_src is in the same format of src + auto diff_src_memory = std::make_shared( + src_memory->get_primitive_desc(), diff_x_data); + + // finally create batch_norm backward 
primitive + batch_norm_bwd_p = std::make_shared( + batch_norm_bwd_pd, *src_memory, *mean_memory, *variance_memory, + *diff_dst_memory, *scaleshift_memory, *diff_src_memory, + *diff_scaleshift_memory); + + dev_ctx.SetBlob(key_batch_norm_bwd_p, batch_norm_bwd_p); + dev_ctx.SetBlob(key_batch_norm_src_mem_p, src_memory); + dev_ctx.SetBlob(key_batch_norm_mean_mem_p, mean_memory); + dev_ctx.SetBlob(key_batch_norm_variance_mem_p, variance_memory); + dev_ctx.SetBlob(key_batch_norm_scaleshift_mem_p, scaleshift_memory); + dev_ctx.SetBlob(key_batch_norm_diff_scaleshift_mem_p, + diff_scaleshift_memory); + dev_ctx.SetBlob(key_batch_norm_diff_src_mem_p, diff_src_memory); + dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory); + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); + } else { + // primitives already exist + UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data)); + UpdateMemoryData(dev_ctx, key_batch_norm_mean_mem_p, + to_void_cast(batch_mean_data)); + UpdateMemoryData(dev_ctx, key_batch_norm_variance_mem_p, + to_void_cast(batch_variance_data)); + UpdateMemoryData(dev_ctx, key_batch_norm_scaleshift_mem_p, + scaleshift_data.data()); + UpdateMemoryData(dev_ctx, key_batch_norm_diff_scaleshift_mem_p, + diff_scaleshift_data.data()); + auto diff_src_memory = UpdateMemoryData( + dev_ctx, key_batch_norm_diff_src_mem_p, to_void_cast(diff_x_data)); + auto diff_dst_memory = UpdateMemoryData( + dev_ctx, key_batch_norm_diff_dst_mem_p, to_void_cast(diff_y_data)); + + // reorder user_diff_dst if it's not in preferred format + if (diff_dst_memory->get_primitive_desc() != + user_diff_dst_memory.get_primitive_desc()) { + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); + } // execute optional reorder and batch_norm backward primitive std::vector pipeline; if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst); - pipeline.push_back(batch_norm_bwd_prim); + pipeline.push_back(*batch_norm_bwd_p); stream(stream::kind::eager).submit(pipeline).wait(); // copy back diff sacle/shift to output tensors (diff scale/shift) @@ -338,12 +502,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::copy(it, std::next(it, ic), diff_scale_data); std::copy(std::next(it, ic), std::end(diff_scaleshift_data), diff_shift_data); - - // set layout/format of output tensors - diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc() - .desc() - .data.format); } }; } // namespace operators diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index a44d84cd7b99107fef09a6b4dfa60172fabd718b..1301c8ae2b145298b18f68f86dadd3c5cbe4271a 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -29,6 +29,6 @@ target_assign_op.cu) detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) - -# Export local libraries to parent +detection_library(generate_proposals_op SRCS generate_proposals_op.cc) +#Export local libraries to 
parent set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d29b0153389574de8992b93ac6795e91556af870 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -0,0 +1,485 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +struct AppendProposalsFunctor { + LoDTensor *out_; + int64_t offset_; + Tensor *to_add_; + + AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add) + : out_(out), offset_(offset), to_add_(to_add) {} + + template + void operator()() const { + auto *out_data = out_->data(); + auto *to_add_data = to_add_->data(); + memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T)); + } +}; + +class GenerateProposalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Scores"), "Input(Scores) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("BboxDeltas"), + "Input(BboxDeltas) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Anchors"), + "Input(Anchors) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Variances"), + "Input(Variances) shouldn't be null."); + + auto scores_dims = ctx->GetInputDim("Scores"); + auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + auto anchors_dims = ctx->GetInputDim("Anchors"); + auto variances_dims = ctx->GetInputDim("Variances"); + + ctx->SetOutputDim("RpnRois", {-1, 4}); + ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Anchors")->type()), + platform::CPUPlace()); + } +}; + +template +void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, + Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) { + T *proposals_data = proposals->mutable_data(ctx.GetPlace()); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto *bbox_deltas_data = bbox_deltas->data(); + auto *anchor_data = all_anchors->data(); + const T *variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len]; + T anchor_height = 
anchor_data[i * len + 3] - anchor_data[i * len + 1]; + + T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2; + T anchor_center_y = + (anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2]) * + anchor_width; + bbox_height = std::exp(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3]) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; + bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; + } + // return proposals; +} + +template +void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info, + Tensor *boxes) { + T *boxes_data = boxes->mutable_data(ctx.GetPlace()); + const T *im_info_data = im_info.data(); + for (int64_t i = 0; i < boxes->numel(); ++i) { + if (i % 4 == 0) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); + } else if (i % 4 == 1) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); + } else if (i % 4 == 2) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); + } else { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); + } + } +} + +template +void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, + float min_size, const Tensor &im_info, Tensor *keep) { + const T *im_info_data = im_info.data(); + T *boxes_data = boxes->mutable_data(ctx.GetPlace()); + min_size *= im_info_data[2]; + keep->Resize({boxes->dims()[0], 1}); + int *keep_data = keep->mutable_data(ctx.GetPlace()); + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; + if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && + y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } + keep->Resize({keep_len}); +} + +bool SortScorePairDescend(const std::pair &pair1, + const std::pair &pair2) { + return pair1.first > pair2.first; +} + +template +void GetMaxScoreIndex(const std::vector &scores, + std::vector> *sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); +} + +template +T BBoxArea(const T *box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. 
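+    // Illustrative note: for box = {0, 0, 3, 3}, w = h = 3, so the area
+    // computed below is 3 * 3 = 9 when `normalized` is true, but
+    // (3 + 1) * (3 + 1) = 16 in pixel coordinates, where a box covers
+    // w + 1 columns and h + 1 rows of pixels.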
+ return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores, + const T nms_threshold, const float eta) { + PADDLE_ENFORCE_NOT_NULL(bbox); + int64_t num_boxes = bbox->dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores->data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, &sorted_indices); + + std::vector selected_indices; + int selected_num = 0; + T adaptive_threshold = nms_threshold; + const T *bbox_data = bbox->data(); + bool flag; + while (sorted_indices.size() != 0) { + int idx = sorted_indices.front().second; + flag = true; + for (size_t k = 0; k < selected_indices.size(); ++k) { + if (flag) { + const int kept_idx = selected_indices[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + selected_num++; + } + sorted_indices.erase(sorted_indices.begin()); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + Tensor keep_nms; + keep_nms.Resize({selected_num}); + int *keep_data = keep_nms.mutable_data(ctx.GetPlace()); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + + return keep_nms; +} + +template +class GenerateProposalsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto *anchors = context.Input("Anchors"); + auto *variances = context.Input("Variances"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); 
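+    // The outputs are pre-allocated with an upper bound of one box per
+    // anchor position (numel / 4 rows) and trimmed to the real number of
+    // proposals by the Resize calls at the end of Compute. For example,
+    // with N = 1, A = 3 and H = W = 4, BboxDeltas holds 1 * 12 * 4 * 4 =
+    // 192 elements, i.e. at most 48 candidate boxes before filtering/NMS.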
+ rpn_roi_probs->mutable_data({scores->numel() / 4, 1}, + context.GetPlace()); + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + framework::LoD lod; + std::vector lod0(1, 0); + Tensor *anchor = const_cast(anchors); + anchor->Resize({anchors->numel() / 4, 4}); + Tensor *var = const_cast(variances); + var->Resize({var->numel() / 4, 4}); + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair tensor_pair = + ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + Tensor proposals = tensor_pair.first; + Tensor scores = tensor_pair.second; + + framework::VisitDataType( + framework::ToDataType(rpn_rois->type()), + AppendProposalsFunctor(rpn_rois, 4 * num_proposals, &proposals)); + framework::VisitDataType( + framework::ToDataType(rpn_roi_probs->type()), + AppendProposalsFunctor(rpn_roi_probs, num_proposals, &scores)); + + num_proposals += proposals.dims()[0]; + lod0.emplace_back(num_proposals); + } + + lod.emplace_back(lod0); + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } + + std::pair ProposalForOneImage( + const DeviceContext &ctx, const Tensor &im_info_slice, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas_slice, // [M, 4] + const Tensor &scores_slice, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) const { + auto *scores_data = scores_slice.data(); + + // Sort index + Tensor index_t; + index_t.Resize({scores_slice.numel()}); + int *index = index_t.mutable_data(ctx.GetPlace()); + for (int i = 0; i < scores_slice.numel(); ++i) { + index[i] = i; + } + std::function compare = + [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, + index + scores_slice.numel(), compare); + index_t.Resize({pre_nms_top_n}); + } + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_t.numel(), 1}, ctx.GetPlace()); + bbox_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + + CPUGather(ctx, scores_slice, index_t, &scores_sel); + CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + CPUGather(ctx, anchors, index_t, &anchor_sel); + CPUGather(ctx, variances, index_t, &var_sel); + + Tensor proposals; + proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); + + ClipTiledBoxes(ctx, im_info_slice, &proposals); + + Tensor keep; + FilterBoxes(ctx, 
&proposals, min_size, im_info_slice, &keep);
+
+    Tensor scores_filter;
+    bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
+    scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
+    CPUGather<T>(ctx, proposals, keep, &bbox_sel);
+    CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
+    if (nms_thresh <= 0) {
+      // Without NMS, return the filtered boxes and their filtered scores.
+      return std::make_pair(bbox_sel, scores_filter);
+    }
+
+    Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
+
+    if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
+      keep_nms.Resize({post_nms_top_n});
+    }
+
+    proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
+    scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
+    CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
+    CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
+
+    return std::make_pair(proposals, scores_sel);
+  }
+};
+
+class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Scores",
+             "The scores of the anchors, i.e. the probability of each anchor "
+             "being a foreground object.");
+    AddInput("BboxDeltas",
+             "The predicted box deltas relative to the anchors, produced by "
+             "the RPN.");
+    AddInput("ImInfo", "Information for image reshape.");
+    AddInput("Anchors", "All anchors.");
+    AddInput("Variances", "The variances of the anchors.");
+
+    AddOutput("RpnRois", "The generated proposal boxes.");
+    AddOutput("RpnRoiProbs", "The scores of the generated proposals.");
+    AddAttr<int>("pre_nms_topN",
+                 "Number of top scoring boxes kept before NMS.");
+    AddAttr<int>("post_nms_topN",
+                 "Number of top scoring boxes kept after NMS.");
+    AddAttr<float>("nms_thresh", "NMS threshold.");
+    AddAttr<float>("min_size",
+                   "The minimum box size; smaller boxes are filtered out.");
+    AddAttr<float>("eta", "The decay rate of the adaptive NMS threshold.");
+    AddComment(R"DOC(
+Generate Proposals OP
+
+This operator generates RoIs according to each box's probability of being a
+foreground object; the boxes are computed from the anchors and the RPN
+outputs BboxDeltas and Scores. The final proposals can be used to train the
+detection net.
+
+Scores is the probability of each box being an object, in the format
+(N, A, H, W), where N is the batch size, A is the number of anchors, and H
+and W are the height and width of the feature map.
+BboxDeltas is the difference between the predicted box location and the
+anchor location, in the format (N, 4*A, H, W).
+
+For generating proposals, this operator transposes and reshapes Scores and
+BboxDeltas to the sizes (H*W*A, 1) and (H*W*A, 4) and calculates box
+locations as proposal candidates. It then clips the boxes to the image,
+removes predicted boxes with small area, and finally applies NMS to obtain
+the final proposals.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp,
+                  ops::GenerateProposalsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    generate_proposals,
+    ops::GenerateProposalsKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index 43f949111104ee56efc8625bdd609e412ef7f37d..2008e7027524ffd1f80a6eede015801b8a0b0254 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -18,15 +18,32 @@ limitations under the License.
*/ namespace paddle { namespace operators { +template +struct DequantizeFunctor { + void operator()(const platform::CPUDeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor* scale, + T max_range, framework::Tensor* out) { + auto in_e = framework::EigenVector::Flatten(*in); + const T* scale_factor = scale->data(); + auto out_e = framework::EigenVector::Flatten(*out); + + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = (scale_factor[0] / max_range) * in_e; + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; + class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel { public: - FakeDequantizeMaxAbsOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) + FakeDequantizeMaxAbsOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FakeDequantizeMaxAbsOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -42,21 +59,17 @@ class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor) The input with float-32/64 type is the " "low precision tensor."); + AddInput("Scale", "(float) The scale in quantization stage."); AddOutput("Out", "(Tensor) The output is the dequantized high " "precision tensor."); - AddAttr("num_bits", - "(int) `num_bits` is the quantization level bits, " - "such as 2, 5, 8."); - AddAttr("scale", - "(float) The maximum absolute value of low precision tensor." - "It is usually calculated by the fake_quantize_max_abs_op."); + AddAttr("max_range", "(float) The max range in quantization stage."); AddComment(R"DOC( FakeDequantizeMaxAbsOp operator. This calculation is an opposite operation of FakeQuantizeMaxAbsOp: -$$Out = \frac{scale*X}{2^{num_bits} - 1}$$ +$$Out = \frac{scale*X}{ max_range }$$ )DOC"); } diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 1bd38d1bd2c3a6f90d2fbad415d61efaead3afe9..225bcc45bc65bc9268d1e866a4358731eaf0c3ef 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -14,6 +14,42 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fake_dequantize_op.h" +namespace paddle { +namespace operators { + +template +__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num, + T* out) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < num) { + out[idx] = in[idx] * scale[0] / max_range; + } +} + +template +struct DequantizeFunctor { + void operator()(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor* scale, + T max_range, framework::Tensor* out) { + const T* in_data = in->data(); + const T* scale_factor = scale->data(); + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + + int num = in->numel(); + int block = 512; + int grid = (num + block - 1) / block; + + KeDequantize<<>>( + in_data, scale_factor, max_range, num, out_data); + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index 0901e68b3761159c3cc9c6684567bee38ec3f16d..d9923a10daa01ca06ebabb27cf9285b0628634bc 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -19,22 +19,29 @@ limitations under the License. */ namespace paddle { namespace operators { + +template +struct DequantizeFunctor { + void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in, + const framework::Tensor* scale, T max_range, + framework::Tensor* out); +}; + template class FakeDequantizeMaxAbsKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { auto* in = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); auto* out = ctx.Output("Out"); - out->mutable_data(in->place()); - int num_bits = ctx.Attr("num_bits"); - T scale = static_cast(ctx.Attr("scale")); - int range = std::pow(2, num_bits) - 1; + float max_range = ctx.Attr("max_range"); + + auto& dev_ctx = ctx.template device_context(); + out->mutable_data(dev_ctx.GetPlace()); - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = *ctx.template device_context().eigen_device(); - eigen_out.device(dev) = (scale / range) * eigen_in; + DequantizeFunctor()(dev_ctx, in, scale, + static_cast(max_range), out); } }; diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index d9cd956dfdff3d009d38ee5088f5396080580483..9d7ac7ab6194593747548fac3cefc8d4ed3058d8 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase { class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { + AddOutput("Out", "(Any) Dummy outputs, used for control dependency") + .AsDuplicable(); AddComment(R"DOC( SendBarrier operator diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a34aa86b6331e4fe2813eea97cb6644323807c3 --- /dev/null +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -0,0 +1,332 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fusion_gru_op.h" +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), + "Input(WeightX) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Input(WeightH) of GRU should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), + "Output(BatchedGate) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"), + "Output(BatchResetHiddenPrev) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), + "Output(BatchedHidden) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of GRU should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + auto wx_dims = ctx->GetInputDim("WeightX"); + PADDLE_ENFORCE_EQ(wx_dims.size(), 2, + "The rank of Input(WeightX) should be 2."); + PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], + "The first dimension of Input(WeightX) " + "should be %d.", + x_dims[1]); + + int frame_size = wx_dims[1] / 3; + auto wh_dims = ctx->GetInputDim("WeightH"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 3 * frame_size, + "The second dimension of Input(WeightH) " + "should be 3 * %d.", + frame_size); + + if (ctx->HasInput("H0")) { + auto h0_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, + "The width of H0 must be equal to frame_size."); + } + if (ctx->HasInput("Bias")) { + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + PADDLE_ENFORCE_EQ(b_dims[1], frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + } + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchedHidden", out_dims); + ctx->SetOutputDim("BatchResetHiddenPrev", out_dims); + 
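Reviewer note: the checks above pin down the fused-GRU shape contract: X is (T x M), WeightX is (M x 3D), WeightH is (D x 3D) and Bias is (1 x 3D), with the hidden size D recovered as wx_dims[1] / 3. A detached sketch of the same invariants, with plain asserts standing in for PADDLE_ENFORCE (an assumption of the sketch, not the framework's mechanism):

#include <cassert>
#include <cstdint>

// Illustrative mirror of FusionGRUOp::InferShape's core dimension checks.
void CheckFusionGruDims(int64_t M, int64_t wx_rows, int64_t wx_cols,
                        int64_t wh_rows, int64_t wh_cols) {
  assert(wx_rows == M);           // WeightX: (M x 3D)
  assert(wx_cols % 3 == 0);       // the three gate blocks are packed widthwise
  const int64_t D = wx_cols / 3;  // hidden size recovered from WeightX
  assert(wh_rows == D);           // WeightH: (D x 3D)
  assert(wh_cols == 3 * D);
}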
ctx->ShareLoD("X", "Hidden"); + + int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("X", "XX"); +} + +framework::OpKernelType FusionGRUOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void FusionGRUOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("H0", + "(Tensor, optional) The initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size, D is the hidden size.") + .AsDispensable(); + AddInput("WeightX", + "(Tensor) The FC weight with shape (M x 3D)," + "where M is the dim size of x, D is the hidden size. "); + AddInput("WeightH", + "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "); + AddInput("Bias", + "(Tensor, optional) (1 x 3D)." + "Almost same as GRUOp." + "Note: if have FC bias it should be added on this bias.") + .AsDispensable(); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate(); + AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x 3D) Same as GRUOp.") + .AsIntermediate(); + AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.") + .AsIntermediate(); + AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp"); + AddAttr("activation", + "(string, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault("tanh"); + AddAttr( + "gate_activation", + "(string, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault("sigmoid"); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed GRU.") + .SetDefault(false); + AddComment(R"DOC( +The Fusion complete GRU Operator. +This operator fuse the fully-connected operator into GRU, +more details can refer to GRU op. 
+)DOC"); +} + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index_lod, dst, indexed_src); +} + +template +class FusionGRUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* wx = ctx.Input("WeightX"); + auto* wh = ctx.Input("WeightH"); + auto* bias = ctx.Input("Bias"); + auto* h0 = ctx.Input("H0"); + + auto* xx = ctx.Output("XX"); + auto* batched_gate = ctx.Output("BatchedGate"); + auto* batch_reset_hidden_prev = + ctx.Output("BatchResetHiddenPrev"); + auto* batch_hidden = ctx.Output("BatchedHidden"); + auto* hidden_out = ctx.Output("Hidden"); + bool is_reverse = ctx.Attr("is_reverse"); + + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); + batch_reset_hidden_prev->mutable_data(ctx.GetPlace()); + batch_hidden->mutable_data(ctx.GetPlace()); + hidden_out->mutable_data(ctx.GetPlace()); + + const T* x_data = x->data(); + const T* wx_data = wx->data(); + const T* wh_data = wh->data(); + auto x_dims = x->dims(); + auto wx_dims = wx->dims(); + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + math::LoDTensor2BatchFunctor to_batch; + if (x_dims[1] > wx_dims[1]) { + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + x_data, wx_data, xx_data, + bias ? bias->data() : NULL); + to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); + } else { + to_batch(dev_ctx, *x, xx, true, is_reverse); + batched_gate->set_lod(xx->lod()); + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + xx_data, wx_data, batched_gate_data, + bias ? 
bias->data() : NULL); + } + + int frame_size = static_cast(wx_dims[1] / 3); + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(wh_data); + gru_value.state_weight = + const_cast(wh_data + 2 * frame_size * frame_size); + Tensor ordered_h0; + + framework::Vector order(batched_gate->lod()[2]); + + if (h0) { + ReorderInitState( + ctx.template device_context(), *h0, order, &ordered_h0, + true); + gru_value.prev_out_value = ordered_h0.data(); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batched_gate->lod()[0]; + size_t seq_len = batch_starts.size() - 1; + auto active_node = + math::detail::GetActivationType(ctx.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + +#ifdef PADDLE_WITH_MKLML + // use MKL packed to speedup GEMM + if (FLAGS_paddle_num_threads >= 4) { + auto blas = math::GetBlas(dev_ctx); + T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, + frame_size * 2 /*width of weight*/, + frame_size /*height of height*/); + PADDLE_ENFORCE(packed_gate); + blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2, + frame_size, T(1.0), gru_value.gate_weight, frame_size * 2, + packed_gate); + T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, + frame_size /*width of weight*/, + frame_size /*height of height*/); + PADDLE_ENFORCE(packed_state); + blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size, + frame_size, T(1.0), gru_value.state_weight, frame_size, + packed_state); + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batched_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + if (gru_value.prev_out_value) { + blas.GEMM_COMPUTE( + CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2, + frame_size, gru_value.prev_out_value, frame_size, packed_gate, + frame_size * 2, T(1), gru_value.gate_value, frame_size * 3); + } + + math::detail::forward_reset_output( + math::detail::forward::gru_resetOutput(), gru_value, frame_size, + cur_batch_size, active_gate); + + if (gru_value.prev_out_value) { + blas.GEMM_COMPUTE( + CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size, + gru_value.reset_output_value, frame_size, packed_state, + frame_size, T(1), gru_value.gate_value + frame_size * 2, + frame_size * 3); + } + + math::detail::forward_final_output( + math::detail::forward::gru_finalOutput(), gru_value, frame_size, + cur_batch_size, active_node); + + gru_value.prev_out_value = gru_value.output_value; + } + + blas.GEMM_FREE(packed_gate); + blas.GEMM_FREE(packed_state); + } else { +#endif + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batched_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + math::GRUUnitFunctor::compute( + dev_ctx, 
gru_value, frame_size, cur_batch_size, active_node, + active_gate); + + gru_value.prev_out_value = gru_value.output_value; + } +#ifdef PADDLE_WITH_MKLML + } +#endif + math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batched_gate->lod()); + to_seq(dev_ctx, *batch_hidden, hidden_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OP_CPU_KERNEL( + fusion_gru, ops::FusionGRUKernel, + ops::FusionGRUKernel); diff --git a/paddle/fluid/operators/fusion_gru_op.h b/paddle/fluid/operators/fusion_gru_op.h new file mode 100644 index 0000000000000000000000000000000000000000..eaa59cd412f8f2fd0089428f5e25202c70f032c7 --- /dev/null +++ b/paddle/fluid/operators/fusion_gru_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionGRUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionGRUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 3888333ec5626f1d8d35db215085f483c985cf0a..e4e4ac8e333ba423e151dea05e40a0e41042570e 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -15,10 +15,14 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/cpu_info.h" + +DEFINE_bool(seq_mode, true, "Use sequence mode"); namespace paddle { namespace operators { @@ -98,7 +102,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Cell"); - int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + int xx_width; + if (FLAGS_seq_mode) { + xx_width = wx_dims[1]; + } else { + xx_width = x_dims[1] > wx_dims[1] ? 
wx_dims[1] : x_dims[1]; + } ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->ShareLoD("X", "XX"); } @@ -205,10 +214,138 @@ inline void ReorderInitState(const DeviceContext& ctx, row_shuffle(ctx, src, index_lod, dst, indexed_src); } -template +template class FuisonLSTMKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void SeqCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto* x = ctx.Input("X"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + auto* wx = ctx.Input("WeightX"); + auto* wh = ctx.Input("WeightH"); + auto* bias = ctx.Input("Bias"); + + auto* xx = ctx.Output("XX"); + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + bool is_reverse = ctx.Attr("is_reverse"); + + std::function act_gate, act_cell, act_cand; + auto& act_gate_str = ctx.Attr("gate_activation"); + auto& act_cell_str = ctx.Attr("cell_activation"); + auto& act_cand_str = ctx.Attr("candidate_activation"); + if (platform::jit::MayIUse(platform::jit::avx)) { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } else { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } + + auto x_lod = x->lod(); + auto x_dims = x->dims(); // T x M + auto wh_dims = wh->dims(); // D x 4D + const int total_T = x_dims[0]; + const int N = x_lod[0].size() - 1; // batch size + const int M = x_dims[1]; // x frame size + const int D = wh_dims[0]; + const int D2 = D * 2; + const int D3 = D * 3; + const int D4 = wh_dims[1]; + + const T* x_data = x->data(); + const T* h0_data = h0 ? h0->data() : NULL; + const T* c0_data = c0 ? c0->data() : NULL; + const T* wx_data = wx->data(); + const T* wh_data = wh->data(); + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); + T* cell_out_data = cell_out->mutable_data(ctx.GetPlace()); + + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, D4, M, x_data, wx_data, + xx_data, bias->data()); + int xx_offset = D4; + int gate_offset = D; + if (is_reverse) { + const int offset = (total_T - 1) * D; + xx_data = xx_data + offset * 4; + hidden_out_data = hidden_out_data + offset; + cell_out_data = cell_out_data + offset; + xx_offset = -D4; + gate_offset = -D; + } + + auto move_step = [&]() { + xx_data = xx_data + xx_offset; + hidden_out_data = hidden_out_data + gate_offset; + cell_out_data = cell_out_data + gate_offset; + }; + + for (int i = 0; i < N; ++i) { + int bid = is_reverse ? 
N - 1 - i : i; + int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; + const T* prev_cell_data = NULL; + const T* prev_hidden_data = NULL; + int tstart = 0; + if (h0_data) { + prev_hidden_data = h0_data + bid * D; + prev_cell_data = c0_data + bid * D; + } else { + // W_ch, W_ih, W_fh, W_oh + act_gate(D3, xx_data + D, xx_data + D); + act_cand(D, xx_data, xx_data); + // cell out= input*tilde + blas.VMUL(D, xx_data, xx_data + D, cell_out_data); + // hidden out= act_state(cellout) * outgate + act_cell(D, cell_out_data, xx_data + D2); + blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); + + // prev + prev_hidden_data = hidden_out_data; + prev_cell_data = cell_out_data; + tstart = 1; + + move_step(); + } + for (int step = tstart; step < seq_len; ++step) { + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast(1), + prev_hidden_data, D, wh_data, D4, static_cast(1), xx_data, + D4); + + // W_ch, W_ih, W_fh, W_oh + act_gate(D3, xx_data + D, xx_data + D); + act_cand(D, xx_data, xx_data); + + // a = forget * prev_cell + blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2); + + // b = input * tilde + blas.VMUL(D, xx_data, xx_data + D, xx_data + D); + + // cell out= a+b + blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data); + + // hidden out= act_state(cellout) * outgate + act_cell(D, cell_out_data, xx_data + D2); + blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); + + // prev + prev_hidden_data = hidden_out_data; + prev_cell_data = cell_out_data; + + move_step(); + } + } + } + + void BatchCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = platform::CPUDeviceContext; auto* x = ctx.Input("X"); auto* wx = ctx.Input("WeightX"); auto* wh = ctx.Input("WeightH"); @@ -339,6 +476,13 @@ class FuisonLSTMKernel : public framework::OpKernel { // restore the output cell state in LoDTensor from the batch cell to_seq(dev_ctx, batch_cell, cell_out); } + void Compute(const framework::ExecutionContext& ctx) const override { + if (FLAGS_seq_mode) { + SeqCompute(ctx); + } else { + BatchCompute(ctx); + } + } }; } // namespace operators @@ -348,7 +492,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OP_CPU_KERNEL( - fusion_lstm, - ops::FuisonLSTMKernel, - ops::FuisonLSTMKernel); +REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel, + ops::FuisonLSTMKernel); diff --git a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cd3d3887cf5167c779a8b20442fdb458cd7eab4 --- /dev/null +++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc @@ -0,0 +1,206 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
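Reviewer note: SeqCompute above walks each sequence in time order in its original LoD layout, so the batch reordering done by BatchCompute is skipped entirely. A scalar sketch of one step as the vectorized code performs it, assuming sigmoid gates and tanh activations (the kernel actually dispatches through VecActivations), with the xx layout [candidate | input | forget | output] taken from the W_ch/W_ih/W_fh/W_oh comment:

#include <cmath>

// One LSTM step for a single hidden unit, mirroring SeqCompute's inner loop.
void LstmStep(float cand, float gi, float gf, float go, float prev_c,
              float* c_out, float* h_out) {
  auto sigmoid = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };
  gi = sigmoid(gi);                  // act_gate over the three gates
  gf = sigmoid(gf);
  go = sigmoid(go);
  cand = std::tanh(cand);            // act_cand over the candidate block
  *c_out = gf * prev_c + gi * cand;  // the VMUL / VMUL / VADD sequence above
  *h_out = go * std::tanh(*c_out);   // act_cell(cell) * output gate
}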
*/ + +#include "paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void FusionSeqExpandConcatFCOp::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE_GT( + ctx->Inputs("X").size(), 1UL, + "Inputs(X) of FusionSeqExpandConcatFCOp should larger than 1."); + PADDLE_ENFORCE( + ctx->HasInput("FCWeight"), + "Input(FCWeight) of FusionSeqExpandConcatFCOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FusionSeqExpandConcatFCOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("FCOut"), + "Output(FCOut) of FusionSeqExpandConcatFCOp should not be null."); + + auto ins_dims = ctx->GetInputsDim("X"); + auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D + PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2."); + const int D = w_dims[1]; + int sum = ins_dims[0][1]; + for (size_t i = 1; i < ins_dims.size(); ++i) { + sum += ins_dims[i][1]; + } + PADDLE_ENFORCE_EQ(sum, w_dims[0], + "FC height should be sum of all inputs width."); + if (ctx->HasInput("FCBias")) { + auto b_dims = ctx->GetInputDim("FCBias"); + PADDLE_ENFORCE(b_dims.size() == 1 || b_dims.size() == 2, + "b_dims should be 1 or 2, get %d", b_dims.size()); + if (b_dims.size() == 1) { + PADDLE_ENFORCE_EQ(b_dims[0], D, "FCBias shapes must be %d.", D); + } else { + PADDLE_ENFORCE_EQ(b_dims[0], 1, "FCBias shapes must be 1x%d.", D); + PADDLE_ENFORCE_EQ(b_dims[1], D, "FCBias shapes must be 1x%d.", D); + } + } + + ctx->SetOutputDim("Out", {ins_dims[0][0], D}); + // fcout should be reshape when run since can not get lod in infershape + // explicit share the ref lod + ctx->ShareLoD("X", "Out", 0); +} + +framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.MultiInput("X")[0]->type()), + ctx.device_context()); +} + +void FusionSeqExpandConcatFCOpMaker::Make() { + AddInput("X", + "(LoDTensor) input LodDTensors, the first one must be have ref lod " + "for sequence expand, and the rest input should have same lod.") + .AsDuplicable(); + AddInput("FCWeight", "(Tensor) the weights of fc."); + AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable(); + AddOutput("Out", "(LoDTensor) Output LodTensor."); + AddOutput( + "FCOut", + "(Tensor) the intermediate tensor to keep the result of fc." + "Shape is (N x D), where N is the batch size, D is the output dim of fc") + .AsIntermediate(); + AddAttr("fc_activation", + "(string, default: identity)" + "The activation for the result of fc." + "`identity` by default.") + .SetDefault("identity") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Sequence expand + concat + fc Operator. + +All below conditions should be meet: + +The ref_level of seq_expand should be 0. + +The ref lod of seq_expand level is the first input of concat. + +The other inputs should have same lod and same batch size of ref lod. + +The seq len of other inputs should be 1. + +The concat axis should be 1. 
+ +)DOC"); +} + +template +class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto ins = ctx.MultiInput("X"); + auto* w = ctx.Input("FCWeight"); + auto* b = ctx.Input("FCBias"); + auto* out = ctx.Output("Out"); + auto* fc_out = ctx.Output("FCOut"); + + auto* ref_in = ins[0]; + auto ref_lod = ref_in->lod(); + auto in1_lod = ins[1]->lod(); + auto ref_dims = ref_in->dims(); // T x M0 + auto in1_dims = ins[1]->dims(); // N x M1 + auto w_dims = w->dims(); + const int N = ref_lod[0].size() - 1; + const int total_T = ref_dims[0]; + const int M0 = ref_dims[1]; + const int M1 = in1_dims[1]; + const int D = w_dims[1]; + + // some check and fcout should be reshape here + // since infershape can not get lod info + PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1."); + PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1."); + PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N, + "Batch size of all inputs should be equal."); + PADDLE_ENFORCE_EQ(in1_lod[0][N], N, + "Seq_length of other inputs should be 1."); + PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size."); + for (size_t i = 2; i < ins.size(); ++i) { + PADDLE_ENFORCE_EQ(ins[i]->dims()[0], N, + "All other inputs height should be equal"); + PADDLE_ENFORCE_EQ(ins[i]->lod(), in1_lod, + "All other inputs should have same lod"); + } + fc_out->Resize({N, D}); + + std::function fc_act; + auto& fc_act_str = ctx.Attr("fc_activation"); + if (platform::jit::MayIUse(platform::jit::avx)) { + math::VecActivations act_functor; + fc_act = act_functor(fc_act_str); + } else { + math::VecActivations act_functor; + fc_act = act_functor(fc_act_str); + } + + const T* ref_in_data = ref_in->data(); + const T* in1_data = ins[1]->data(); + const T* w_data = w->data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + T* fc_out_data = fc_out->mutable_data(ctx.GetPlace()); + + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, D, M0, ref_in_data, w_data, + out_data, b ? 
b->data() : NULL); + w_data = w_data + M0 * D; + // first write on + blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data); + w_data = w_data + M1 * D; + for (size_t i = 2; i < ins.size(); ++i) { + // add on + const T* in_data = ins[i]->data(); + const int K = ins[i]->dims()[1]; + blas.GEMM(CblasNoTrans, CblasNoTrans, N, D, K, static_cast(1), in_data, + K, w_data, D, static_cast(1), fc_out_data, D); + w_data = w_data + K * D; + } + T* cur_out_data = out_data; + for (int i = 0; i < N; ++i) { + int seq_len = ref_lod[0][i + 1] - ref_lod[0][i]; + T* src = fc_out_data + i * D; + for (int step = 0; step < seq_len; ++step) { + blas.VADD(D, cur_out_data, src, cur_out_data); + cur_out_data = cur_out_data + D; + } + } + fc_act(total_T * D, out_data, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp, + ops::FusionSeqExpandConcatFCOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc, + ops::FusionSeqExpandConcatFCOpKernel, + ops::FusionSeqExpandConcatFCOpKernel); diff --git a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f78e820f603354944bd7fc23aff2d1d72e5ba750 --- /dev/null +++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
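Reviewer note: functionally the fused op computes Out = fc_act(concat(x0, expand(x1), ..., expand(xn)) * W + b), where every input after the first contributes one row per sequence that is broadcast along the ref lod. The kernel never materializes the expanded concat; it splits W row-wise, computes the per-step and per-sequence partial products, and adds each per-sequence row onto every step of that sequence, which is what the FCCompute / MatMul / GEMM / VADD chain above does. A reduced sketch of that broadcast-add for one sequence (the pre-computed products and names are assumptions of the sketch):

#include <cstddef>
#include <vector>

// step_rows[t] is assumed to already hold x0[t] * W0 (+ bias); seq_row holds
// x1 * W1 for this sequence. The broadcast-add replaces an explicit
// seq_expand followed by concat and a single large FC.
void BroadcastAddPerSequence(std::vector<std::vector<float>>* step_rows,
                             const std::vector<float>& seq_row) {
  for (std::size_t t = 0; t < step_rows->size(); ++t) {
    for (std::size_t d = 0; d < seq_row.size(); ++d) {
      (*step_rows)[t][d] += seq_row[d];  // same effect as blas.VADD per step
    }
  }
}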
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionSeqExpandConcatFCOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 820e73e779720e4f76168e0a84a254ef645784ee..342379268be36cc5b532363e664f6e73990333e1 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -177,6 +177,9 @@ class ConcatFunctor { dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), out_row, out_col, output->data()); } + // Wait() must be called because `inputs_data` may be destructed before + // kernel ends + context.Wait(); } }; @@ -252,6 +255,9 @@ class ConcatGradFunctor { input.data(), in_row, in_col, dev_outs_col_data, static_cast(outputs_cols.size()), dev_out_gpu_data); } + // Wait() must be called because `outputs_data` may be destructed before + // kernel ends + context.Wait(); } }; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 0bae926e9892986b59c6dfc7fa9b8778da1dfcb7..5693761e9ffd96b40040223b5498b63b0274bf0f 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -14,6 +14,7 @@ limitations under the License. 
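Reviewer note: the two context.Wait() additions in concat.cu fix a lifetime hazard: the column-offset arrays are staged in host-side temporaries, copied to the device asynchronously, and the functor could return (destroying the temporaries) while the copy or kernel is still in flight. A generic sketch of the hazard with plain CUDA runtime calls (not Paddle's allocator or wrappers):

#include <cuda_runtime.h>

__global__ void ReadTable(const int* table, int n) { /* pretend to read table[0..n) */ }

void CopyLaunchAndWait(cudaStream_t stream, int* dev, int n) {
  int* pinned = nullptr;
  cudaMallocHost(reinterpret_cast<void**>(&pinned), n * sizeof(int));
  for (int i = 0; i < n; ++i) pinned[i] = i;
  // An async copy from pinned memory returns immediately; the DMA may still
  // be reading `pinned` after this call.
  cudaMemcpyAsync(dev, pinned, n * sizeof(int), cudaMemcpyHostToDevice,
                  stream);
  ReadTable<<<1, 32, 0, stream>>>(dev, n);
  // Without this wait, freeing `pinned` could race with the in-flight copy
  // and kernel -- the same reasoning behind the new context.Wait() calls.
  cudaStreamSynchronize(stream);
  cudaFreeHost(pinned);
}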
*/ #pragma once #include +#include #include #include "paddle/fluid/platform/cpu_info.h" #ifdef __AVX__ diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index c3387be6daa3bd34a6e3410ced23fce5d65f2cf7..9a6e646b28fdec78734eb4e7b98c8acf688b2645 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -41,7 +41,8 @@ template struct SetConstant; template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ - template struct Transpose; + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index d5af718723e8d44da0971ea7756b8c36e771cca2..12d1baa8fb544a8b9684e43204c61ba410d1b295 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -33,10 +33,11 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h new file mode 100644 index 0000000000000000000000000000000000000000..3ae25eae98b25bca015ec4383c7126eb81e52b8a --- /dev/null +++ b/paddle/fluid/operators/math/padding.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
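Reviewer note: the math_function changes above just add one more element type to the DEFINE_CPU_TRANS / DEFINE_GPU_TRANS instantiation macros (the extracted patch text has dropped the template argument lists, so the exact type does not survive here). The pattern itself is macro-driven explicit instantiation; a reduced, self-contained illustration with made-up types:

#include <cstdint>

template <typename T, int Rank>
struct Transpose {
  void operator()(const T* in, T* out, int64_t n) const {
    for (int64_t i = 0; i < n; ++i) out[i] = in[i];  // placeholder body
  }
};

// Each DEFINE_TRANS(RANK) stamps out one explicit instantiation per element
// type; supporting a new type means adding a single line to the macro body,
// which is all the diff above does.
#define DEFINE_TRANS(RANK)                 \
  template struct Transpose<float, RANK>;  \
  template struct Transpose<double, RANK>; \
  template struct Transpose<int64_t, RANK>;

DEFINE_TRANS(1);
DEFINE_TRANS(2);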
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenTensor = framework::EigenTensor; + +template +void PadFunction(const framework::ExecutionContext& context, + const std::vector& pads, const framework::Tensor& src, + T pad_value, framework::Tensor* out) { + Eigen::array, D> paddings; + + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = pads[i * 2]; + paddings[i].second = pads[i * 2 + 1]; + } + + auto src_tensor = EigenTensor::From(src); + auto out_tensor = EigenTensor::From(*out); + + auto& place = + *context.template device_context().eigen_device(); + out_tensor.device(place) = src_tensor.pad(paddings, pad_value); +} + +template +void PadGradFunction(const framework::ExecutionContext& context, + const std::vector& pads, const framework::Tensor& src, + framework::Tensor* d_out) { + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = -pads[i * 2]; + paddings[i].second = -pads[i * 2 + 1]; + } + + auto d_out_tensor = EigenTensor::From(*d_out); + auto src_tensor = EigenTensor::From(src); + auto& place = + *context.template device_context().eigen_device(); + d_out_tensor.device(place) = src_tensor.pad(paddings, 0); +} + +template +void PaddingFunctor(int rank, const framework::ExecutionContext& context, + const std::vector& pads, T pad_value, + const framework::Tensor& src, framework::Tensor* out) { + switch (rank) { + case 1: + PadFunction(context, pads, src, pad_value, out); + break; + case 2: + PadFunction(context, pads, src, pad_value, out); + break; + case 3: + PadFunction(context, pads, src, pad_value, out); + break; + case 4: + PadFunction(context, pads, src, pad_value, out); + break; + case 5: + PadFunction(context, pads, src, pad_value, out); + break; + case 6: + PadFunction(context, pads, src, pad_value, out); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } +} + +template +void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, + const std::vector& pads, + const framework::Tensor& src, framework::Tensor* out) { + switch (rank) { + case 1: + PadGradFunction(context, pads, src, out); + break; + case 2: + PadGradFunction(context, pads, src, out); + break; + case 3: + PadGradFunction(context, pads, src, out); + break; + case 4: + PadGradFunction(context, pads, src, out); + break; + case 5: + PadGradFunction(context, pads, src, out); + break; + case 6: + PadGradFunction(context, pads, src, out); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc index b546b8728217ed6013247555dcd5d7180ddeae74..e4ffeedb5a0061dd60ca3a30aa9928ef8b05887c 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -38,13 +38,14 @@ class CopyMatrixRowsFunctor { auto width = dst_dims[1]; auto* src_data = src.data(); auto* dst_data = dst->data(); - for (int i = 0; i < height; ++i) { - if (is_src_index) { - memcpy(dst_data + i * width, src_data + index[i] * width, - width * sizeof(T)); - } else { - memcpy(dst_data + index[i] * width, src_data + i * width, - width * sizeof(T)); + const int sz = width * sizeof(T); + if (is_src_index) { + for (int i = 0; i < height; ++i) { + 
memcpy(dst_data + i * width, src_data + index[i] * width, sz); + } + } else { + for (int i = 0; i < height; ++i) { + memcpy(dst_data + index[i] * width, src_data + i * width, sz); } } } diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index d63c6c4ed55331235188c1c750468d4e75b9b7f2..25f06a25a0638cbb394df58d35f88307941d117f 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -18,65 +18,86 @@ namespace paddle { namespace operators { namespace math { +template +void CopyValidData(framework::Tensor* dst_tensor, + const framework::Tensor* src_tensor, + const framework::Vector& seq_offsets, + int pad_seq_len, int step_width, bool norm_by_len, + CopyType type, PadLayout layout) { + int seq_num = seq_offsets.size() - 1; + const T* src_data = src_tensor->data(); + T* dst_data = dst_tensor->data(); + + int seq_cpy_gap = step_width; + int pad_cpy_gap = + layout == kBatchLengthWidth ? step_width : seq_num * step_width; + for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) { + int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + PADDLE_ENFORCE_GE( + pad_seq_len, valid_seq_len, + "The padded sequence length can not be less than its original length."); + int seq_data_offset = seq_offsets[seq_idx] * step_width; + int pad_data_offset = layout == kBatchLengthWidth + ? seq_idx * pad_seq_len * step_width + : seq_idx * step_width; + float scale = 1.0f / static_cast(valid_seq_len); + + for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) { + const T* src = + src_data + (type == kSeqToPad ? seq_data_offset : pad_data_offset); + T* dst = + dst_data + (type == kSeqToPad ? pad_data_offset : seq_data_offset); + memcpy(dst, src, step_width * sizeof(T)); + if (norm_by_len) { + for (int i = 0; i < step_width; ++i) { + *(dst + i) *= scale; + } + } + seq_data_offset += seq_cpy_gap; + pad_data_offset += pad_cpy_gap; + } + } +} + template class PaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor* padding, - bool norm_by_times) { - auto lod = seq.lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The LoD of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq.dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding->dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequence_length, num_sequences, sequence_width]."); - - const int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be the " - "maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be the " - "number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq.numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - const T* seq_data = seq.data(); - T* padding_data = padding->data(); - for 
(int64_t i = 0; i < max_sequence_length; ++i) { - for (int64_t j = 0; j < num_sequences; ++j) { - int64_t start_pos = abs_offset_lod[level][j]; - int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos; - if (i < sequence_length) { - // i > 0 => sequence_length > 0 - T scale = - norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; - for (int64_t k = 0; k < sequence_width; ++k) { - padding_data[(i * num_sequences + j) * sequence_width + k] = - seq_data[(start_pos + i) * sequence_width + k] * scale; - } - } else { - memset(padding_data + (i * num_sequences + j) * sequence_width, 0, - sequence_width * sizeof(T)); - } + const framework::LoDTensor& seq_tensor, + framework::LoDTensor* pad_tensor, + const framework::LoDTensor& pad_value, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_lod = seq_tensor.lod(); + const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; + const auto& seq_tensor_dims = seq_tensor.dims(); + const auto& pad_tensor_dims = pad_tensor->dims(); + if (pad_seq_len == -1) { + pad_seq_len = MaximumSequenceLength(seq_offsets); + } + int step_width = seq_tensor.numel() / seq_tensor_dims[0]; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, + "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width'."); + + // fill padding value + T* pad_data = pad_tensor->data(); + const T* pad_value_data = pad_value.data(); + if (pad_value.numel() == 1) { + for (int i = 0; i < pad_tensor->numel(); ++i) { + pad_data[i] = *pad_value_data; + } + } else { + for (int i = 0; i < pad_tensor->numel(); i += step_width) { + memcpy(pad_data + i, pad_value_data, step_width * sizeof(T)); } } + + CopyValidData(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len, + step_width, norm_by_times, kSeqToPad, layout); } }; @@ -84,62 +105,35 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - framework::LoDTensor* seq, const framework::Tensor& padding, - bool norm_by_times) { - auto lod = seq->lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The LoD of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq->dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding.dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequnece_length, num_sequences, sequence_width]."); - - const int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be " - "the maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be " - "the number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq->numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - const T* padding_data = padding.data(); - T* 
seq_data = seq->data(); - for (int64_t i = 0; i < num_sequences; ++i) { - int64_t start_pos = abs_offset_lod[level][i]; - int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; - for (int64_t j = 0; j < sequence_length; ++j) { - // sequence_width > j > 0 - T scale = - norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; - for (int64_t k = 0; k < sequence_width; ++k) { - seq_data[(start_pos + j) * sequence_width + k] = - padding_data[(j * num_sequences + i) * sequence_width + k] * - scale; - } - } + const framework::LoDTensor& pad_tensor, + framework::LoDTensor* seq_tensor, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; + const auto& seq_tensor_dims = seq_tensor->dims(); + const auto& pad_tensor_dims = pad_tensor.dims(); + if (pad_seq_len == -1) { + pad_seq_len = MaximumSequenceLength(seq_offsets); } + int step_width = seq_tensor->numel() / seq_tensor_dims[0]; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + + CopyValidData(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len, + step_width, norm_by_times, kPadToSeq, layout); } }; +template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; + +template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 0956a0c17d387f4a174c7ed4e9b1b1f816dcf4ae..035e10dcbe4e2083723e47d7dda75ce267a9f141 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -19,41 +19,32 @@ namespace paddle { namespace operators { namespace math { -template -__global__ void SequencePaddingKernel(T* padding, T* sequence, - const size_t* sequence_start_positions, - const size_t sequence_width, - const size_t max_sequence_length, - const size_t num_sequences) { - size_t padding_idx = blockIdx.y; - size_t start_pos = sequence_start_positions[padding_idx]; - size_t sequence_length = - sequence_start_positions[padding_idx + 1] - start_pos; - - size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; - size_t padding_base_idx = - (sequence_idx * num_sequences + padding_idx) * sequence_width; - size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; - - if (sequence_idx < sequence_length) { - T scale = NormByTimes ? 
(1.0f / static_cast(sequence_length)) : 1.0f; - if (Padding) { - /* sequence -> padding */ - for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { - padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; - } - } else { - /* padding -> sequence */ - for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { - sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i]; - } +template +__global__ void SequencePaddingKernel( + T* dst, const T* src, const T* pad_value, bool is_constant_pad, + const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len, + const size_t step_width, bool norm_by_len, const PadLayout layout) { + size_t seq_idx = blockIdx.y; + size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + + size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width; + size_t pad_data_offset = layout == kBatchLengthWidth + ? (seq_idx * pad_seq_len + step_idx) * step_width + : (step_idx * seq_num + seq_idx) * step_width; + + T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset); + const T* src_data = + src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset); + + if (step_idx < seq_len) { + float scale = norm_by_len ? (1.0f / static_cast(seq_len)) : 1.0f; + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = scale * src_data[i]; } - } else if (sequence_idx < max_sequence_length) { - if (Padding) { - /* sequence -> padding */ - for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { - padding[padding_base_idx + i] = 0; - } + } else if (step_idx < pad_seq_len && Type == kSeqToPad) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = is_constant_pad ? 
pad_value[0] : pad_value[i]; } } } @@ -62,74 +53,59 @@ template class PaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor* padding, - bool norm_by_times) { - auto lod = seq.lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The lod of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq.dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding->dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequence_length, num_sequences, sequence_width]."); - - int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be the " - "maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be the " - "number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq.numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(seq, context.GetPlace(), context, padding); - padding->Resize(padding_dims); + const framework::LoDTensor& seq_tensor, + framework::LoDTensor* pad_tensor, + const framework::LoDTensor& pad_value, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_lod = seq_tensor.lod(); + const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; + const auto& seq_tensor_dims = seq_tensor.dims(); + const auto& pad_tensor_dims = pad_tensor->dims(); + int max_seq_len = MaximumSequenceLength(seq_offsets); + if (pad_seq_len == -1) { + pad_seq_len = max_seq_len; + } + PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len, + "The pad_seq_len must be equal to or greater than the " + "original max sequence length."); + int step_width = seq_tensor.numel() / seq_tensor_dims[0]; + int seq_num = seq_offsets.size() - 1; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, + "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width'."); + + if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { + TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor); + pad_tensor->Resize(pad_tensor_dims); return; } - const int64_t kBlockSize = 512; + const int kBlockSize = 512; /* At least use 32 threads to copy sequence_width elements, * and at least 8 elements for each thread. 
*/ size_t block_dim_x = - std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); size_t block_dim_y = kBlockSize / block_dim_x; dim3 threads(block_dim_x, block_dim_y); - size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; - size_t grid_dim_y = num_sequences; + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; dim3 grid(grid_dim_x, grid_dim_y); - const T* seq_data = seq.data(); - T* padding_data = padding->data(); - if (norm_by_times) { - SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } else { - SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } + const T* seq_data = seq_tensor.data(); + T* pad_data = pad_tensor->data(); + const T* pad_value_data = pad_value.data(); + + SequencePaddingKernel<<>>( + pad_data, seq_data, pad_value_data, pad_value.numel() == 1, + seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, + step_width, norm_by_times, layout); } }; @@ -137,79 +113,62 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - framework::LoDTensor* seq, const framework::Tensor& padding, - bool norm_by_times) { - auto lod = seq->lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The lod of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq->dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding.dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequnece_length, num_sequences, sequence_width]."); - - int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be " - "the maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be " - "the number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq->numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(padding, context.GetPlace(), context, seq); - seq->Resize(seq_dims); + const framework::LoDTensor& pad_tensor, + framework::LoDTensor* seq_tensor, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; + const auto& seq_tensor_dims = seq_tensor->dims(); + const auto& pad_tensor_dims = pad_tensor.dims(); + int max_seq_len = MaximumSequenceLength(seq_offsets); + if (pad_seq_len == -1) { + pad_seq_len = max_seq_len; + } + int step_width = seq_tensor->numel() / seq_tensor_dims[0]; + int seq_num = seq_offsets.size() - 1; + + 
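Reviewer note: the block-shape arithmetic reused above is dense enough to deserve a restatement. ((((step_width + 7) >> 3) + 31) >> 5) << 5 is ceil(step_width / 8) rounded up to a whole warp (32 threads), clamped to the 512-thread block; block_dim_y then takes whatever remains of the block to cover time steps. The same computation spelled out:

#include <algorithm>
#include <cstddef>

// Equivalent to the shift-based expression in the launch code above.
void PaddingBlockShape(std::size_t step_width, std::size_t* dim_x,
                       std::size_t* dim_y) {
  const std::size_t kBlockSize = 512;
  std::size_t per_thread = (step_width + 7) / 8;  // >= 8 elements per thread
  std::size_t warps = (per_thread + 31) / 32;     // round up to whole warps
  *dim_x = std::min(warps * 32, kBlockSize);      // x covers one step's width
  *dim_y = kBlockSize / *dim_x;                   // y covers time steps
}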
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + + if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { + TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor); + seq_tensor->Resize(seq_tensor_dims); return; } - const int64_t kBlockSize = 512; + const int kBlockSize = 512; /* At least use 32 threads to copy sequence_width elements, * and at least 8 elements for each thread. */ size_t block_dim_x = - std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); size_t block_dim_y = kBlockSize / block_dim_x; dim3 threads(block_dim_x, block_dim_y); - size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; - size_t grid_dim_y = num_sequences; + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; dim3 grid(grid_dim_x, grid_dim_y); - const T* padding_data = padding.data(); - T* seq_data = seq->data(); - if (norm_by_times) { - SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } else { - SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } + const T* pad_data = pad_tensor.data(); + T* seq_data = seq_tensor->data(); + + SequencePaddingKernel<<>>( + seq_data, pad_data, nullptr, false, + seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, + step_width, norm_by_times, layout); } }; +template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; + +template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index b56e6db1ebdac1a00561c07845c03bb8fbd8d35a..e752aa58979dddba4d010071d2c4b5dc3e0c6756 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -15,6 +15,7 @@ limitations under the License. 
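Reviewer note: the PadLayout value threaded through these kernels (and defined in sequence_padding.h just below) is what lets one kernel serve both the (batch, length, width) layout used by the new sequence_pad op and the (length, batch, width) layout kept for WarpCTC-style callers; only the offset arithmetic differs. A sketch of the two index maps, matching the offsets computed in CopyValidData and SequencePaddingKernel:

#include <cstddef>

enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };

// Flat offset of (sequence seq_idx, time step step_idx, feature 0) inside the
// padded tensor, for either layout.
std::size_t PadOffset(PadLayout layout, std::size_t seq_idx,
                      std::size_t step_idx, std::size_t seq_num,
                      std::size_t pad_seq_len, std::size_t step_width) {
  if (layout == kBatchLengthWidth) {
    // shape: [seq_num, pad_seq_len, step_width]
    return (seq_idx * pad_seq_len + step_idx) * step_width;
  }
  // kLengthBatchWidth, shape: [pad_seq_len, seq_num, step_width]
  return (step_idx * seq_num + seq_idx) * step_width;
}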
*/ #pragma once #include +#include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -22,17 +23,33 @@ namespace paddle { namespace operators { namespace math { -inline static size_t MaximumSequenceLength(const framework::LoD& lod, - const size_t level) { - const size_t num_sequences = lod[level].size() - 1; - size_t max_sequence_length = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - for (size_t i = 0; i < num_sequences; ++i) { - max_sequence_length = - std::max(max_sequence_length, - abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); +enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; + +enum CopyType { kSeqToPad, kPadToSeq }; + +inline static size_t MaximumSequenceLength( + const framework::Vector& seq_offset) { + size_t seq_num = seq_offset.size() - 1; + size_t max_seq_len = 0; + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); } - return max_sequence_length; + return max_seq_len; +} + +inline static void CheckDims(const framework::DDim& seq_tensor_dims, + const framework::DDim& pad_tensor_dims, + const framework::Vector& seq_offset, + int64_t padded_seq_len, int64_t step_width, + const PadLayout& layout) { + PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), seq_offset.back(), + "Value of 1st dimension of the sequence tensor should be " + "equal to sum of lengths of all sequences."); + + PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || + seq_tensor_dims.size() == pad_tensor_dims.size(), + "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it."); } /* @@ -64,15 +81,22 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod, template class PaddingLoDTensorFunctor { public: - void operator()(const DeviceContext& context, const framework::LoDTensor& seq, - framework::Tensor* padding, bool norm_by_times); + void operator()(const DeviceContext& context, + const framework::LoDTensor& seq_tensor, + framework::LoDTensor* pad_tensor, + const framework::LoDTensor& pad_value, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth); }; template class UnpaddingLoDTensorFunctor { public: - void operator()(const DeviceContext& context, framework::LoDTensor* seq, - const framework::Tensor& padding, bool norm_by_times); + void operator()(const DeviceContext& context, + const framework::LoDTensor& pad_tensor, + framework::LoDTensor* seq_tensor, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth); }; } // namespace math diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index b0c201db0ccbe81d8f57cd984d2cdfd2f6a48f25..4f61b1029c65aedaf4fce771866964fe1d0d6112 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -23,7 +23,9 @@ void TestSequencePadding(const paddle::framework::LoD& lod, paddle::framework::LoDTensor cpu_seq_back; paddle::framework::LoDTensor seq; paddle::framework::LoDTensor seq_back; - paddle::framework::Tensor padding; + paddle::framework::LoDTensor padding; + paddle::framework::LoDTensor cpu_pad_value; + paddle::framework::LoDTensor pad_value; const size_t level = lod.size() - 1; auto seq_dims = @@ -46,20 +48,33 @@ void TestSequencePadding(const paddle::framework::LoD& lod, } const size_t 
max_sequence_length = - paddle::operators::math::MaximumSequenceLength(lod, level); + paddle::operators::math::MaximumSequenceLength(lod[level]); const size_t num_sequences = lod[level].size() - 1; auto padding_dims = paddle::framework::make_ddim({static_cast<int64_t>(max_sequence_length), static_cast<int64_t>(num_sequences), static_cast<int64_t>(sequence_width)}); + padding.mutable_data<T>(padding_dims, *place); + + T* pad_value_data = + cpu_pad_value.mutable_data<T>({1}, paddle::platform::CPUPlace()); + *pad_value_data = static_cast<T>(0); + if (paddle::platform::is_cpu_place(*place)) { + pad_value = cpu_pad_value; + } else { + TensorCopySync(cpu_pad_value, *place, &pad_value); + } + paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()( - *context, seq, &padding, false); + *context, seq, &padding, pad_value, -1, 0, false, + paddle::operators::math::kLengthBatchWidth); seq_back.set_lod(lod); seq_back.mutable_data<T>(seq_dims, *place); paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()( - *context, &seq_back, padding, false); + *context, padding, &seq_back, -1, 0, false, + paddle::operators::math::kLengthBatchWidth); if (paddle::platform::is_cpu_place(*place)) { cpu_seq_back = seq_back; diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..37646c7b4c50fc7409002aca56e5462bde93cc30 --- /dev/null +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -0,0 +1,212 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pad_constant_like_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class PadConstantLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PadConstantLikeOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of PadConstantLikeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PadConstantLikeOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(), + "The dimension of X and Y should be the same."); + + for (int i = 0; i < x_dim.size(); ++i) { + PADDLE_ENFORCE_GE(x_dim[i], y_dim[i]); + } + ctx->SetOutputDim("Out", x_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input<Tensor>("Y")->type()), + ctx.device_context()); + } +}; + +class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input of pad_constant_like op. 
" + "The input should be a k-D tensor(k > 0 and k < 7)"); + AddInput("Y", + "The input of pad_constant_like op. " + "The input should be a k-D tensor(k > 0 and k < 7)"); + AddOutput("Out", + "The output of pad_constant_like op. " + "A tensor with the same shape as X."); + AddAttr<float>("pad_value", + "(float, default 0.0) " + "The value to fill the padded areas.") + .SetDefault(0.0f); + AddComment(R"DOC( +PadConstantLikeOp Operator. + +Pad input(Y) with a pad_value; the number of values padded to the edges of each +axis is specified by the difference between the shapes of X and Y. +((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) gives the unique pad +widths for each axis. +The input should be a k-D tensor(k > 0 and k < 7). As an example: + +case1: + Given: + X = [[1, 2], + [3, 4], + [1, 2], + [3, 4]], + X.shape = (4, 2) + + Y = [[5, 6], + [7, 8]], + Y.shape = (2, 2) + + And + pad_value = 0, + + Return: + Out = [[5, 6], + [7, 8], + [0, 0], + [0, 0]] + Out.shape = (4, 2) + +case2: + Given: + X = [[[[ 0, 1, 2], + [ 3, 4, 5]], + [[ 6, 7, 8], + [ 9, 10, 11]], + [[12, 13, 14], + [15, 16, 17]]], + [[[18, 19, 20], + [21, 22, 23]], + [[24, 25, 26], + [27, 28, 29]], + [[30, 31, 32], + [33, 34, 35]]]] + X.shape = (2, 3, 2, 3) + + Y = [[[[35, 36, 37]], + [[38, 39, 40]], + [[41, 42, 43]]]] + Y.shape = (1, 3, 1, 3) + + And + pad_value = -1, + + Return: + + Out = [[[[35, 36, 37], + [-1, -1, -1]], + [[38, 39, 40], + [-1, -1, -1]], + [[41, 42, 43], + [-1, -1, -1]]], + [[[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]]]] + Out.shape = (2, 3, 2, 3) +)DOC"); + } +}; + +class PadConstantLikeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto y_dim = ctx->GetInputDim("Y"); + auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(), + "The dimension of Out@GRAD and Y should be the same."); + + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dim); + ctx->ShareLoD("Y", /*->*/ y_grad_name); + + for (int i = 0; i < y_dim.size(); ++i) { + PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i]); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input<Tensor>("Y")->type()), + ctx.device_context()); + } +}; + +class PadConstantLikeOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + auto *bind = new framework::OpDesc(); + bind->SetType("pad_constant_like_grad"); + bind->SetInput("Y", Input("Y")); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + bind->SetAttrMap(Attrs()); + return std::unique_ptr<framework::OpDesc>(bind); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOp, + ops::PadConstantLikeOpMaker, ops::PadConstantLikeOpGradMaker); +REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad); 
+REGISTER_OP_CPU_KERNEL( + pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); +REGISTER_OP_CPU_KERNEL( + pad_constant_like_grad, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.cu b/paddle/fluid/operators/pad_constant_like_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ea69577904577de353b63491973bf74b7724e18e --- /dev/null +++ b/paddle/fluid/operators/pad_constant_like_op.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/pad_constant_like_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); +REGISTER_OP_CUDA_KERNEL( + pad_constant_like_grad, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..01d66901afc49a487c344b039b65f547967e95ff --- /dev/null +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/padding.h" + +namespace paddle { +namespace operators { + +template +class PadConstantLikeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto in_x = context.Input("X"); + auto in_y = context.Input("Y"); + auto* out = context.Output("Out"); + + if (in_x->dims() == in_y->dims()) { + // TensorCopy(in_y, context.GetPlace(), context, out); + out->ShareDataWith(*in_y); + return; + } + + T pad_value = context.Attr("pad_value"); + out->mutable_data(context.GetPlace()); + + int rank = context.Input("X")->dims().size(); + + std::vector pads(rank * 2, 0); + + for (int j = 0; j < rank; ++j) { + pads[j * 2] = 0; + pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); + } + + math::PaddingFunctor(rank, context, pads, pad_value, + *in_y, out); + } +}; + +template +class PadConstantLikeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto in_y = context.Input("Y"); + auto in_dout = + context.Input(framework::GradVarName("Out")); + auto* d_y = context.Output(framework::GradVarName("Y")); + + if (d_y == nullptr) { + return; + } + + if (in_dout->dims() == in_y->dims()) { + // TensorCopy(in_dout, context.GetPlace(), context, d_y); + d_y->ShareDataWith(*in_dout); + return; + } + + d_y->mutable_data(context.GetPlace()); + int rank = in_dout->dims().size(); + + std::vector pads(static_cast(rank) * 2, 0); + for (int j = 0; j < rank; ++j) { + pads[j * 2] = 0; + pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); + } + + math::PaddingGradFunctor(rank, context, pads, *in_dout, + d_y); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h index c93c096575a30dd9344894ead4b81acc16930e21..32698dac4917e183cfe36c831787b049985b19b3 100644 --- a/paddle/fluid/operators/pad_op.h +++ b/paddle/fluid/operators/pad_op.h @@ -18,117 +18,44 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/padding.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -template -void PadFunction(const framework::ExecutionContext& context) { - auto pads = context.Attr>("paddings"); - Eigen::array, D> paddings; - for (size_t i = 0; i < paddings.size(); ++i) { - paddings[i].first = pads[i * 2]; - paddings[i].second = pads[i * 2 + 1]; - } - T pad_value = context.Attr("pad_value"); - - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - auto x_tensor = EigenTensor::From(*x); - auto out_tensor = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - out_tensor.device(place) = x_tensor.pad(paddings, pad_value); -} - template class PadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - int rank = context.Input("X")->dims().size(); - switch (rank) { - case 1: - PadFunction(context); - break; - case 2: - PadFunction(context); - break; - case 3: - PadFunction(context); - break; - case 4: - PadFunction(context); - break; - case 5: - PadFunction(context); - break; - case 6: - PadFunction(context); - break; - default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); - } + auto pads = context.Attr>("paddings"); + T pad_value = context.Attr("pad_value"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + int rank = x->dims().size(); + math::PaddingFunctor(rank, context, pads, pad_value, *x, + out); } }; -template -void PadGradFunction(const framework::ExecutionContext& context) { - auto pads = context.Attr>("paddings"); - Eigen::array, D> paddings; - for (size_t i = 0; i < paddings.size(); ++i) { - paddings[i].first = -pads[i * 2]; - paddings[i].second = -pads[i * 2 + 1]; - } - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - if (d_x != nullptr) { - d_x->mutable_data(context.GetPlace()); - auto d_x_tensor = EigenTensor::From(*d_x); - auto d_out_tensor = EigenTensor::From(*d_out); - auto& place = - *context.template device_context().eigen_device(); - d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); - } -} - template class PadGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - size_t rank = - context.Input(framework::GradVarName("Out"))->dims().size(); - switch (rank) { - case 1: - PadGradFunction(context); - break; - case 2: - PadGradFunction(context); - break; - case 3: - PadGradFunction(context); - break; - case 4: - PadGradFunction(context); - break; - case 5: - PadGradFunction(context); - break; - case 6: - PadGradFunction(context); - break; - default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); + auto pads = context.Attr>("paddings"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x == nullptr) { + return; } + + d_x->mutable_data(context.GetPlace()); + int rank = d_out->dims().size(); + math::PaddingGradFunctor(rank, context, pads, *d_out, + d_x); } }; diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 
eb09470f37eabb5524f774bc289fc68f5884c540..97c36a83fc5eff421725d05f66fca05f5169d1bb 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -355,6 +355,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { grad->SetInput(framework::GradVarName(output_param), og_names); } } + grad->SetInput("Communicator", {"nccl_com__do_not_change_"}); grad->SetAttrMap(this->Attrs()); grad->SetBlockAttr(kParallelBlock, grad_block_[0]); diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index cceac402951ae6bf3fe0b4c96af5b7ce9ca1ba0e..e7f1caf4d3a81dc7633139933c6a4c3d51a4e2a0 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -13,14 +13,12 @@ limitations under the License. */ #include -#include - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/framework/variable.h" namespace paddle { namespace operators { +using framework::GradVarName; #define CLOG std::cout @@ -35,7 +33,7 @@ struct Formater { std::type_index dtype{typeid(const char)}; framework::LoD lod; int summarize; - void* data{nullptr}; + void *data{nullptr}; void operator()(size_t size) { PrintMessage(); @@ -101,7 +99,7 @@ struct Formater { template void Display(size_t size) { - auto* d = reinterpret_cast(data); + auto *d = reinterpret_cast(data); CLOG << "\tdata: "; if (summarize != -1) { summarize = std::min(size, (size_t)summarize); @@ -120,51 +118,36 @@ struct Formater { // TODO(ChunweiYan) there should be some other printers for TensorArray class TensorPrintOp : public framework::OperatorBase { public: - TensorPrintOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + TensorPrintOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} - TensorPrintOp(const TensorPrintOp& o) + TensorPrintOp(const TensorPrintOp &o) : framework::OperatorBase( - static_cast(o)) { + static_cast(o)) { PADDLE_THROW("Not implemented."); } private: - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - const framework::Variable* in_var_ptr = nullptr; - std::string phase(kForward); + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + const framework::Variable *in_var_ptr = nullptr; std::string printed_var_name = ""; - auto& inputs = Inputs(); - if (inputs.find("In") != inputs.end() && !Inputs("In").empty()) { - in_var_ptr = scope.FindVar(Input("In")); - printed_var_name = Inputs("In").front(); - } else if (inputs.find("In@GRAD") != inputs.end() && - !Inputs("In@GRAD").empty()) { - in_var_ptr = scope.FindVar(Input("In@GRAD")); - printed_var_name = Inputs("In@GRAD").front(); - phase = std::string(kBackward); - } else { - PADDLE_THROW("Unknown phase, should be forward or backward."); - } + in_var_ptr = scope.FindVar(Input("In")); + printed_var_name = Inputs("In").front(); PADDLE_ENFORCE_NOT_NULL(in_var_ptr); - auto& in_tensor = in_var_ptr->Get(); - auto* out_var_ptr = scope.FindVar(Output("Out")); - auto& out_tensor = *out_var_ptr->GetMutable(); - - // Just copy data from input tensor to output tensor - // output tensor share same memory with input tensor - out_tensor.ShareDataWith(in_tensor); - 
out_tensor.set_lod(in_tensor.lod()); + auto &in_tensor = in_var_ptr->Get<framework::LoDTensor>(); std::string print_phase = Attr<std::string>("print_phase"); - if (print_phase != phase && print_phase != std::string(kBoth)) { + bool is_forward = Attr<bool>("is_forward"); + + if ((is_forward && print_phase == kBackward) || + (!is_forward && print_phase == kForward)) { return; } @@ -192,7 +175,7 @@ class TensorPrintOp : public framework::OperatorBase { formater.dtype = printed_tensor.type(); } if (Attr<bool>("print_tensor_shape")) { - auto& dims = printed_tensor.dims(); + auto &dims = printed_tensor.dims(); formater.dims.resize(dims.size()); for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i]; } @@ -200,7 +183,7 @@ class TensorPrintOp : public framework::OperatorBase { formater.lod = printed_tensor.lod(); } formater.summarize = Attr<int>("summarize"); - formater.data = reinterpret_cast<void*>(printed_tensor.data()); + formater.data = reinterpret_cast<void *>(printed_tensor.data()); formater(printed_tensor.numel()); } @@ -219,14 +202,14 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { AddAttr<bool>("print_tensor_type", "Whether to print the tensor's dtype."); AddAttr<bool>("print_tensor_shape", "Whether to print the tensor's shape."); AddAttr<bool>("print_tensor_lod", "Whether to print the tensor's lod."); - AddAttr<std::string>( - "print_phase", - "(string, default 'BOTH') Which phase to display including 'FORWARD' " - "'BACKWARD' and 'BOTH'.") + AddAttr<std::string>("print_phase", + "(string, default 'FORWARD') Which phase to display " + "including 'FORWARD', " + "'BACKWARD' and 'BOTH'.") .SetDefault(std::string(kBoth)) .InEnum({std::string(kForward), std::string(kBackward), std::string(kBoth)}); - AddOutput("Out", "Output tensor with same data as input tensor."); + AddAttr<bool>("is_forward", "Whether it is forward or not.").SetDefault(true); AddComment(R"DOC( Creates a print op that will print when a tensor is accessed.
@@ -238,40 +221,21 @@ tensor `t`.)DOC"); class InferShapeForward : public framework::InferShapeBase { public: - void operator()(framework::InferShapeContext* context) const override { + void operator()(framework::InferShapeContext *context) const override { PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null."); - context->ShareLoD("In", /*->*/ "Out"); - context->SetOutputDim("Out", context->GetInputDim("In")); - } -}; - -class InferShapeBackward : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* context) const override { - PADDLE_ENFORCE(context->HasInput("In@GRAD"), - "Input(In@GRAD) should not be null."); - context->ShareLoD("In@GRAD", /*->*/ "Out"); - context->SetOutputDim("Out", context->GetInputDim("In@GRAD")); } }; -class InferVarType : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override {} -}; - -class PrintOpProtoAndCheckGradOpMaker - : public framework::SingleGradOpDescMaker { +class PrintOpGradientMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; std::unique_ptr Apply() const override { - auto* op_desc_ptr = new framework::OpDesc(); - op_desc_ptr->SetType("print_grad"); - op_desc_ptr->SetInput("In@GRAD", OutputGrad("Out")); - op_desc_ptr->SetOutput("Out", InputGrad("In")); + auto *op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("print"); + op_desc_ptr->SetInput("In", InputGrad("In")); op_desc_ptr->SetAttrMap(Attrs()); + op_desc_ptr->SetAttr("is_forward", false); return std::unique_ptr(op_desc_ptr); } }; @@ -282,6 +246,4 @@ class PrintOpProtoAndCheckGradOpMaker namespace ops = paddle::operators; REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker, - ops::PrintOpProtoAndCheckGradOpMaker, ops::InferShapeForward, - ops::InferVarType); -REGISTER_OPERATOR(print_grad, ops::TensorPrintOp, ops::InferShapeBackward); + ops::PrintOpGradientMaker, ops::InferShapeForward); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 7f8822e40053b5bcd394f446138a2292d80b69bf..c614de2eac143b3a545c60226aefa93dd72dea4f 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/scale_op.h" + #include +#include "paddle/fluid/operators/detail/safe_ref.h" + namespace paddle { namespace operators { @@ -52,6 +55,21 @@ $$Out = scale*X$$ } }; +class ScaleOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto &in_var_name = op_desc.Input("X").front(); + auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name)); + + auto out_var_name = op_desc.Output("Out").front(); + auto *out_var = block->FindVarRecursive(out_var_name); + + out_var->SetType(in_var.GetType()); + out_var->SetDataType(in_var.GetDataType()); + } +}; + class ScaleGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; @@ -71,7 +89,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); +REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, + ops::ScaleOpVarTypeInference); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index c6a59b76adcd6b4d3e7db5e7c7185f266f46841f..fe035aba81dd74d21539974beed255275be3013b 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -22,17 +22,29 @@ namespace operators { template class ScaleKernel : public framework::OpKernel { public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* tensor = context.Output("Out"); - auto* in = context.Input("X"); - tensor->mutable_data(in->place()); + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in_var = ctx.InputVar("X"); + auto* in = ctx.Input("X"); - auto scale = static_cast(context.Attr("scale")); + auto* out_var = ctx.OutputVar("Out"); + auto* out = ctx.Output("Out"); + out->mutable_data(in->place()); - auto eigen_out = framework::EigenVector::Flatten(*tensor); + PADDLE_ENFORCE_EQ(in->dims(), out->dims(), + "in and out should have the same dim"); + + auto scale = static_cast(ctx.Attr("scale")); + + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->set_rows(in_slr.rows()); + out_slr->set_height(in_slr.height()); + } + + auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = - *context.template device_context().eigen_device(); + auto& dev = *ctx.template device_context().eigen_device(); eigen_out.device(dev) = scale * eigen_in; } }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 14b07649c416ff1b671fc9b5ee4eb956b44570c5..40404295266899c6ac2f7b1e08fdf7db40958794 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase { class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { + AddInput("X", "(Any) Dummy inputs, used for control dependency") + .AsDuplicable(); + AddOutput("Out", "(Any) Dummy outputs, used for control dependency") + .AsDuplicable(); AddComment(R"DOC( SendBarrier operator diff --git a/paddle/fluid/operators/sequence_pad_op.cc b/paddle/fluid/operators/sequence_pad_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..44d73aa4076abfe15c906478702ac7c4a55303d4 --- /dev/null +++ b/paddle/fluid/operators/sequence_pad_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_pad_op.h" + +namespace paddle { +namespace operators { + +class SequencePadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PadValue"), + "Input(PadValue) of SequencePadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePadOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The rank of Input(X) can't be less than 2."); + auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size()); + auto pad_value_dims = ctx->GetInputDim("PadValue"); + PADDLE_ENFORCE(pad_value_dims == framework::make_ddim({1}) || + pad_value_dims == time_step_dims, + "The Input(PadValue) must be a scalar or a tensor whose " + "shape equals the time steps in sequences"); + + int out_dim_0 = -1; + int out_dim_1 = -1; + + if (ctx->IsRuntime()) { + // run time + framework::Variable* x_var = + boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get<framework::LoDTensor>().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast<int64_t>(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + int seq_num = x_lod_0.size() - 1; + int max_seq_len = math::MaximumSequenceLength(x_lod_0); + int padded_length = ctx->Attrs().Get<int>("padded_length"); + if (padded_length == -1) { + padded_length = max_seq_len; + } + PADDLE_ENFORCE_GE(padded_length, max_seq_len, + "The Attr(padded_length) must be -1 or an int no less " + "than the length of the longest original sequence."); + out_dim_0 = seq_num; + out_dim_1 = padded_length; + } else { + // compile time + framework::VarDesc* x_desc = + boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + } + + std::vector<int> out_dims_vec{out_dim_0, out_dim_1}; + auto time_step_dims_vec = framework::vectorize2int(time_step_dims); + out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(), + time_step_dims_vec.end()); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + } +}; + +class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor<float>) Input variable which " + "should contain lod information."); + AddInput("PadValue", + "(LoDTensor), this Tensor holds values that will be filled into " + "padded 
steps. It can be a scalar or a tensor whose shape equals " + "that of the time steps in sequences. If it's a scalar, it will be " + "automatically broadcast to the shape of a time step."); + AddOutput( + "Out", + "(LoDTensor) The output variable, which contains padded sequences."); + AddAttr<int>( + "padded_length", + "The length of padded sequences. It can be set to -1 or " + "any positive int. When it is -1, all sequences will be padded up to " + "the length of the longest one among them; when it is a certain " + "positive value, it must be no less than the length of the longest " + "original sequence.") + .SetDefault(-1); + AddComment(R"DOC( + Sequence Pad Operator + + This operator pads sequences in the same batch to a consistent length. + The length is specified by attribute 'padded_length'. New elements, + whose values are specified by input 'PadValue', will be appended to + the end of each sequence, to make their final lengths consistent. + + Following are cases to better explain how this works: + + Case 1: + + Given a 1-level LoDTensor input(X): + X.lod = [[0, 2, 5]] + X.data = [a, b, c, d, e] + and Input(PadValue): + PadValue.data = [0] + and attribute 'padded_length' = 4, + then we get LoDTensor: + Out.data = [[a, b, 0, 0], + [c, d, e, 0]] + + Case 2: + + Given a 1-level LoDTensor input(X): + X.lod = [[0, 2, 5]] + X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] + and Input(PadValue): + PadValue.data = [0] + and attribute 'padded_length' = -1, which means using the length + of the longest input sequence (3 in this case), + then we get LoDTensor: + Out.data = [[[a1, a2], [b1, b2], [0, 0]], + [[c1, c2], [d1, d2], [e1, e2]]] + + Case 3: + + Given a 1-level LoDTensor input(X): + X.lod = [[0, 2, 5]] + X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] + and Input(PadValue): + PadValue.data = [p1, p2] + and attribute 'padded_length' = -1, which means using the length + of the longest input sequence (3 in this case), + then we get LoDTensor: + Out.data = [[[a1, a2], [b1, b2], [p1, p2]], + [[c1, c2], [d1, d2], [e1, e2]]] + + )DOC"); + } +}; + +class SequencePadGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker, + paddle::framework::DefaultGradOpDescMaker<true>); +REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_pad, + ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, float>, + ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, double>, + ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int>, + ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int64_t>); +REGISTER_OP_CPU_KERNEL( + sequence_pad_grad, + ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, float>, + ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, double>, + ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int>, + ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>); diff --git a/paddle/fluid/operators/sequence_pad_op.cu b/paddle/fluid/operators/sequence_pad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ff8f81a2f0ec4a72befc3be2a5fc48c3a586c824 --- /dev/null +++ 
b/paddle/fluid/operators/sequence_pad_op.cu @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_pad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_pad_op.h b/paddle/fluid/operators/sequence_pad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5fc9da69d787ff3aeffa716689d44772ad8f7bd2 --- /dev/null +++ b/paddle/fluid/operators/sequence_pad_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +class SequencePadOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto* pad_value = ctx.Input("PadValue"); + + int padded_length = ctx.Attr("padded_length"); + + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *x, out, *pad_value, + padded_length, 0, false, math::kBatchLengthWidth); + } +}; + +template +class SequencePadGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_x = ctx.Output(framework::GradVarName("X")); + if (d_x) { + const auto* d_out = ctx.Input(framework::GradVarName("Out")); + d_x->mutable_data(ctx.GetPlace()); + + int padded_length = ctx.Attr("padded_length"); + + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *d_out, d_x, + padded_length, 0, false, math::kBatchLengthWidth); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ff3249cc333231a0624cd5aab9603a6a75f4480 --- /dev/null +++ b/paddle/fluid/operators/unstack_op.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/unstack_op.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +USE_OP(stack); + +REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker, + ops::UnStackOpInferShape, ops::UnStackGradOpDescMaker); + +REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp, + ops::UnStackOpGradInferShape); diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h new file mode 100644 index 0000000000000000000000000000000000000000..348a1038804ccb2551e5f729cc1a38bcef1511f5 --- /dev/null +++ b/paddle/fluid/operators/unstack_op.h @@ -0,0 +1,135 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class UnStackOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist."); + + int axis = ctx->Attrs().Get("axis"); + int num = ctx->Attrs().Get("num"); + auto x_dim = ctx->GetInputDim("X"); + int rank = x_dim.size(); + PADDLE_ENFORCE(axis >= -rank && axis < rank, + "Attr(axis) must be inside [-rank, rank), where rank = %d", + rank); + if (axis < 0) axis += rank; + + PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast(num), + "Number of Outputs(Y) is wrong"); + if (x_dim[axis] > 0) { + PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong"); + } + auto vec = framework::vectorize2int(x_dim); + vec.erase(vec.begin() + axis); + ctx->SetOutputsDim("Y", std::vector( // NOLINT + x_dim[axis], framework::make_ddim(vec))); + } +}; + +class UnStackOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of unstack op."); + AddOutput("Y", "The output of unstack op.").AsDuplicable(); + AddAttr("axis", "The axis along which Input(X) should be unstacked.") + .SetDefault(0); + AddAttr("num", "The number of outputs(Y).").GreaterThan(0); + AddComment(R"DOC( + UnStack Operator. + + UnStack Input(X) into several tensors along Attr(axis). + )DOC"); + } +}; + +class UnStackOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto stack_grad_op = framework::OpRegistry::CreateOp( + "stack_grad", {{framework::GradVarName("Y"), {Input("X")}}}, + {{framework::GradVarName("X"), Outputs("Y")}}, Attrs()); + stack_grad_op->Run(scope, place); + } +}; + +class UnStackOpGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0, + "Number of Inputs(Y@Grad) must be larger than 0"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) must exist."); + + auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y")); + for (size_t i = 1; i < input_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], + "Dims of all Inputs(Y@Grad) must be the same"); + } + + int axis = ctx->Attrs().Get("axis"); + int rank = input_dims[0].size(); + PADDLE_ENFORCE( + axis >= -(rank + 1) && axis < rank + 1, + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank); + if (axis < 0) axis += (rank + 1); + + auto vec = framework::vectorize2int(input_dims[0]); + vec.insert(vec.begin() + axis, input_dims.size()); + ctx->SetOutputDim(framework::GradVarName("X"), framework::make_ddim(vec)); + } +}; + +class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("unstack_grad"); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class UnStackGradOp : public 
framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto stack_op = framework::OpRegistry::CreateOp( + "stack", {{"X", Inputs(framework::GradVarName("Y"))}}, + {{"Y", {Output(framework::GradVarName("X"))}}}, Attrs()); + stack_op->Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index ab70c1f0592d122ba248a101db487e64c0bdae6f..444265f58de732f07c5db2abd87811a063016866 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -153,17 +153,29 @@ class WarpCTCKernel : public framework::OpKernel { framework::make_ddim({static_cast(num_sequences), 1}); // warpctc needs sequences data stored in transposed padding format - Tensor warpctc_logits; + LoDTensor warpctc_logits; const size_t max_sequence_length = - math::MaximumSequenceLength(logits_lod, level); + math::MaximumSequenceLength(logits_lod[level]); auto warpctc_logits_dims = framework::make_ddim({static_cast(max_sequence_length), static_cast(num_sequences), static_cast(sequence_width)}); warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); + + LoDTensor cpu_pad_value; + T* pad_value_data = + cpu_pad_value.mutable_data({1}, platform::CPUPlace()); + *pad_value_data = static_cast(0); + LoDTensor pad_value; + if (platform::is_cpu_place(ctx.GetPlace())) { + pad_value = cpu_pad_value; + } else { + TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); + } + math::PaddingLoDTensorFunctor()( ctx.template device_context(), *logits, &warpctc_logits, - false); + pad_value, -1, 0, false /* norm_by_times */, math::kLengthBatchWidth); const T* warpctc_logits_data = warpctc_logits.data(); std::vector warpctc_label_lengths(num_sequences); @@ -209,15 +221,15 @@ template class WarpCTCGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* warpctc_grad = ctx.Input("WarpCTCGrad"); + auto* warpctc_grad = ctx.Input("WarpCTCGrad"); auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); logits_grad->mutable_data(ctx.GetPlace()); bool norm_by_times = ctx.Attr("norm_by_times"); math::UnpaddingLoDTensorFunctor()( - ctx.template device_context(), logits_grad, - *warpctc_grad, norm_by_times); + ctx.template device_context(), *warpctc_grad, + logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth); const T* loss_grad_data = loss_grad->data(); math::ScaleLoDTensorFunctor()( diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 75d3856d0dda8b5bf6a4fa11954a611bf140c9bc..e25efebe6c3555958f4f75e2b87b7dc45d4a4177 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,3 +1,4 @@ +if (NOT WIN32) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) py_proto_compile(profiler_py_proto SRCS profiler.proto) @@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif(NOT WIN32) if(WITH_GPU) nv_library(enforce SRCS enforce.cc) @@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) + +if (NOT WIN32) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) +endif(NOT WIN32) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index fcd658d67cf4551dbdb9696ef49b5ab3cc58bf95..2880c09263f10e9c624e11b77188171f48d9db28 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -22,9 +22,13 @@ limitations under the License. */ #ifdef __APPLE__ #include <sys/types.h> #include <sys/sysctl.h> + +#elif defined(_WIN32) +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#include <windows.h> #else #include <unistd.h> -#endif +#endif // _WIN32 #include <algorithm> #include "gflags/gflags.h" @@ -32,16 +36,20 @@ limitations under the License. */ DEFINE_double(fraction_of_cpu_memory_to_use, 1, "Default use 100% of CPU memory for PaddlePaddle," "reserve the rest for page tables, etc"); - +#if !defined(_WIN32) DEFINE_uint64(initial_cpu_memory_in_mb, #ifdef PADDLE_WITH_MKLDNN /* Aligned with mozga-intel, MKLDNN needs at least 5000 MB * to obtain the best performance*/ - 5000, + 5000ul, #else - 500, + 500ul, #endif "Initial CPU memory for PaddlePaddle, in MB unit."); +#else +DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, + "Initial CPU memory for PaddlePaddle, in MB unit."); +#endif // !defined(_WIN32) DEFINE_double( fraction_of_cuda_pinned_memory_to_use, 0.5, @@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() { size_t len = sizeof(size); if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; return 0L; +#elif defined(_WIN32) + MEMORYSTATUSEX sMeminfo; + sMeminfo.dwLength = sizeof(sMeminfo); + GlobalMemoryStatusEx(&sMeminfo); + return sMeminfo.ullTotalPhys; #else int64_t pages = sysconf(_SC_PHYS_PAGES); int64_t page_size = sysconf(_SC_PAGE_SIZE); diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 322996fb4f54d34ebbb034a6e1de420e9c532545..f59fc40b71699a790978e22fd7e26da8d4d94c5f 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -13,7 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#if !defined(_WIN32) #include <sys/time.h> +#else +#include +#endif // !_WIN32 + #include #include // NOLINT #include @@ -27,12 +32,15 @@ namespace paddle { namespace platform { /////////////////////// // WARN: Under Development. Don't depend on it yet. ////////////////////// - +#if !defined(_WIN32) inline uint64_t PosixInNsec() { struct timeval tv; gettimeofday(&tv, nullptr); return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec); } +#else +inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); } +#endif // !_WIN32 // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
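(The gettimeofday()-based helper above folds seconds and microseconds into one nanosecond count by scaling microseconds up by 1000; the Windows build simply returns 0 until a timer is ported. A small Python sketch of the same arithmetic, illustrative only — the function name is hypothetical.)

import time

def posix_in_nsec():
    # Mirror 1000 * (tv_sec * 1000000 + tv_usec) from PosixInNsec() above.
    now = time.time()
    sec = int(now)
    usec = int((now - sec) * 1000000)
    return 1000 * (sec * 1000000 + usec)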
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 07159d4a12ef4b628f7705ed206d3334be46dfc8..5939c500c946c44579d1de645ac9700c7701a4e9 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -16,7 +16,9 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) +if (NOT WIN32) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +endif(NOT WIN32) if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 93bf7c13516ffa4baca6a30f1daf946939726d85..4fbfa6354ab45fed4839227a2a4be8fe147e5fd9 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include - #include #include // NOLINT #include @@ -23,6 +21,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/dynload/cupti_lib_path.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/port.h" DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. For instance, " diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a76ba75f9eeb8c3f42fbf7254f629b0960a8f2d8..61a653d9313daff96d39c08e80f17d7e33acceb1 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -18,6 +18,11 @@ limitations under the License. */ #include // for __cxa_demangle #endif // __GNUC__ +#if defined(_WIN32) +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#endif + #ifdef PADDLE_WITH_CUDA #include #include @@ -117,7 +122,12 @@ struct EOFException : public std::exception { // always forces branch prediction of true. // This generates faster binary code. __builtin_expect is since C++11. // For more details, please check https://stackoverflow.com/a/43870188/724872. +#if !defined(_WIN32) #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#else +// there is no equivalent intrinsics in msvc. +#define UNLIKELY(condition) (condition == 0) +#endif template inline typename std::enable_if::type throw_on_error( @@ -230,6 +240,7 @@ inline void throw_on_error(T e) { throw_on_error(e, ""); } +#if !defined(_WIN32) #define PADDLE_THROW(...) \ do { \ throw ::paddle::platform::EnforceNotMet( \ @@ -248,15 +259,28 @@ inline void throw_on_error(T e) { __FILE__, __LINE__); \ } \ } while (false) -#else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); -#endif #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ __LINE__); \ } while (false) + +#else +#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__) +#endif // REPLACE_ENFORCE_GLOG + +#else // !_WIN32 +// disable enforce, caused by the varardic macro exception error +#define PADDLE_THROW(x) \ + do { \ + throw std::make_exception_ptr( \ + std::runtime_error("Windows disable the enforce.")); \ + } while (false) + +#define PADDLE_ENFORCE(x, ...) 
x +#endif // !_WIN32 + /* * Some enforce helpers here, usage: * int a = 1; diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 020ce4d6f59412490657767a096f1ce185287864..4c99f4be321160caf0ee2f89a655bdfb933408e3 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -85,9 +85,6 @@ void InitDevices(bool init_p2p) { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } -#else - LOG(WARNING) - << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; #endif InitDevices(init_p2p, devices); } @@ -101,9 +98,6 @@ void InitDevices(bool init_p2p, const std::vector devices) { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } -#else - LOG(WARNING) - << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; #endif for (size_t i = 0; i < devices.size(); ++i) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index c99d9c807d1bfb45d1ce0725b84b9fff09049511..38630686f7cf3c669373f941d989adf11ba6cfe6 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx); +#if !defined(_WIN32) struct RecordEvent { RecordEvent(const std::string& name, const DeviceContext* dev_ctx); @@ -94,6 +95,15 @@ struct RecordBlock { std::string name_; uint64_t start_ns_; }; +#else +// windows do not support profiler temporarily. +struct RecordEvent { + RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {} +}; +struct RecordBlock { + explicit RecordBlock(int block_id) {} +}; +#endif // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. 
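(The Windows stubs above compile RecordEvent/RecordBlock down to no-ops, so profiling call sites need no #ifdef. On other platforms these events feed the profiler, which is usually driven from Python; a hedged usage sketch assuming the fluid.profiler context manager — the network and feed data are illustrative, not part of this diff.)

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

x = fluid.layers.data(name='x', shape=[8], dtype='float32')
y = fluid.layers.fc(input=x, size=4)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

# Ops executed inside the context are timed via RecordEvent; a summary
# sorted by total time is printed when the context exits.
with profiler.profiler('CPU', 'total'):
    exe.run(feed={'x': np.random.rand(2, 8).astype('float32')}, fetch_list=[y])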
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index c2137ec6d7df24251432a4dfb8fffc3d3f77194e..f21f8d23f99c27529b2ed1995c92fd4eee4a5807 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -234,6 +234,7 @@ void BindVarDsec(pybind11::module *m) { pybind11::enum_(var_desc, "VarType", "") .value("BOOL", pd::proto::VarType::BOOL) .value("UINT8", pd::proto::VarType::UINT8) + .value("INT8", pd::proto::VarType::INT8) .value("INT16", pd::proto::VarType::INT16) .value("INT32", pd::proto::VarType::INT32) .value("INT64", pd::proto::VarType::INT64) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 67734659233515ca8110f4212a2b1553fe4e9d24..5b20b87174e42f4dfdd22214e8f9dd20c7296374 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -130,6 +130,7 @@ PYBIND11_PLUGIN(core) { .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) + .def("set", PyCPUTensorSetFromArray) #ifdef PADDLE_WITH_CUDA .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) @@ -138,6 +139,7 @@ PYBIND11_PLUGIN(core) { .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) @@ -145,6 +147,7 @@ PYBIND11_PLUGIN(core) { .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) + .def("set", PyCUDAPinnedTensorSetFromArray) #endif .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("_set_float_element", TensorSetElement) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 3e2ea1ef88b03f5b2576c1cee2b5d26a439943da..51614a6a3dd2f7f830cf533fc365b56a99d3b918 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -97,7 +97,7 @@ struct CastToPyBufferImpl { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { auto buffer_info = details::CastToPyBufferImpl()(tensor); + uint8_t, int8_t, platform::float16>()(tensor); return buffer_info; } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a643e0ded01975277ecd5eb9b2174a7a1d040a76..7199424b4709fbe9fc962cf98aea6223b9f3e51d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -335,12 +335,18 @@ function assert_api_not_changed() { fi python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate +} + +function assert_api_spec_approvals() { + if [ -z ${BRANCH} ]; then + BRANCH="develop" + fi - API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true` + API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/API.spec" || true` echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then - # TODO: curl -H 'Authorization: token ${TOKEN}' - APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews | \ + # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. 
+ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then @@ -622,11 +628,12 @@ function main() { cicheck) cmake_gen ${PYTHON_ABI:-""} build + assert_api_not_changed ${PYTHON_ABI:-""} run_test gen_capi_package gen_fluid_inference_lib test_fluid_inference_lib - assert_api_not_changed ${PYTHON_ABI:-""} + assert_api_spec_approvals ;; *) print_usage diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index b32736ee7c265e3a94207afc04673eec4fcf1c6e..920dbf3b4ebb0bc3d98c9ea986d7d039deed4a4c 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -203,7 +203,7 @@ def resize_short(im, size): h_new = size * h // w else: w_new = size * w // h - im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC) return im @@ -345,7 +345,6 @@ def simple_transform(im, if np.random.randint(2) == 0: im = left_right_flip(im, is_color) else: - im = center_crop(im, crop_size, is_color) im = center_crop(im, crop_size, is_color=is_color) if len(im.shape) == 3: im = to_chw(im) diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py index c98e0019f7ab5fc2723e8df919257a59af7c9e5d..64bf7414819ad74365744adbd760b73d4adaff7c 100644 --- a/python/paddle/dataset/movielens.py +++ b/python/paddle/dataset/movielens.py @@ -24,6 +24,7 @@ set and test set into paddle reader creators. from __future__ import print_function +import numpy as np import zipfile import paddle.dataset.common import re @@ -150,12 +151,12 @@ def __initialize_meta_info__(): def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): fn = __initialize_meta_info__() - rand = random.Random(x=rand_seed) + np.random.seed(rand_seed) with zipfile.ZipFile(file=fn) as package: with package.open('ml-1m/ratings.dat') as rating: for line in rating: line = cpt.to_text(line, encoding='latin') - if (rand.random() < test_ratio) == is_test: + if (np.random.random() < test_ratio) == is_test: uid, mov_id, rating, _ = line.strip().split("::") uid = int(uid) mov_id = int(mov_id) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index febb750ee1af26c71e6c1ae1e4c97fb02fb27a04..fbe766336b19719b7a3eac41ad5e877ef3ec4181 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -95,6 +95,8 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 + elif dtype == np.int8: + return core.VarDesc.VarType.INT8 else: raise ValueError("Not supported numpy dtype %s" % dtype) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index d2954c4c22069627bac5188cf1d485e50b68c8e4..c9a2f8a0abf9c811074e3fbadec0c61cb6dbf681 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -189,7 +189,6 @@ def Print(input, message="The content of some_layer: ") ''' helper = LayerHelper('print', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) helper.append_op( type='print', inputs={'In': input}, @@ -202,9 +201,7 @@ def Print(input, 'print_tensor_shape': print_tensor_shape, 'print_tensor_lod': print_tensor_lod, 
'print_phase': print_phase.upper() - }, - outputs={'Out': out}) - return out + }) class BlockGuard(object): diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 72071478845df444ce72ce946787b2d0ce5f0d23..a5bc1fa8f801066a8281a828eb87dbccb4bb0eff 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -39,6 +39,7 @@ __all__ = [ 'detection_map', 'rpn_target_assign', 'anchor_generator', + 'generate_proposals', ] __auto__ = [ @@ -1253,3 +1254,73 @@ def anchor_generator(input, anchor.stop_gradient = True var.stop_gradient = True return anchor, var + + +def generate_proposals(scores, + bbox_deltas, + im_info, + anchors, + variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0, + name=None): + """ + **Generate Proposals for Faster-RCNN** + + This operation proposes RoIs according to each box's probability of being a foreground object, where + the boxes are computed from the anchors. bbox_deltas and the objectness scores are the outputs of the RPN. The final proposals + can be used to train the detection net. + + For generating proposals, this operation performs the following steps: + + 1. Transpose and resize scores and bbox_deltas to shapes (H*W*A, 1) and (H*W*A, 4) respectively. + 2. Calculate box locations as proposal candidates. + 3. Clip boxes to the image. + 4. Remove predicted boxes with small area. + 5. Apply NMS to get the final proposals as output. + + + Args: + scores(Variable): A 4-D Tensor with shape [N, A, H, W] representing the probability for each box to be an object. + N is the batch size, A is the number of anchors, H and W are the height and width of the feature map. + bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] representing the difference between the predicted box location and the anchor location. + im_info(Variable): A 2-D Tensor with shape [N, 3] representing the original image information for the N images: height, width and the scale + between the original image size and the size of the feature map. + anchors(Variable): A 4-D Tensor representing the anchors with a layout of [H, W, A, 4]. H and W are the height and width of the feature map, + A is the box count at each position. Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized. + variances(Variable): The expanded variances of the anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. + pre_nms_top_n(int): Number of total bboxes to be kept per image before NMS. 6000 by default. + post_nms_top_n(int): Number of total bboxes to be kept per image after NMS. 1000 by default. + nms_thresh(float): Threshold in NMS, 0.5 by default. + min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. + eta(float): Used in adaptive NMS: if adaptive_threshold > 0.5, adaptive_threshold = adaptive_threshold * eta on each iteration.
+ """ + helper = LayerHelper('generate_proposals', **locals()) + + rpn_rois = helper.create_tmp_variable(dtype=bbox_deltas.dtype) + rpn_roi_probs = helper.create_tmp_variable(dtype=scores.dtype) + helper.append_op( + type="generate_proposals", + inputs={ + 'Scores': scores, + 'BboxDeltas': bbox_deltas, + 'ImInfo': im_info, + 'Anchors': anchors, + 'Variances': variances + }, + attrs={ + 'pre_nms_topN': pre_nms_top_n, + 'post_nms_topN': post_nms_top_n, + 'nms_thresh': nms_thresh, + 'min_size': min_size, + 'eta': eta + }, + outputs={'RpnRois': rpn_rois, + 'RpnRoiProbs': rpn_roi_probs}) + rpn_rois.stop_gradient = True + rpn_roi_probs.stop_gradient = True + + return rpn_rois, rpn_roi_probs diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index b03ee514f50f9a8c1425bd5b1d409b58ed62351a..0cf7aaef4ab75ca6976465d1b404004a9f2f64c5 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True): rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC }) if sync: - helper.append_op(type="send_barrier", attrs={"endpoints": endpoints}) + helper.append_op( + type="send_barrier", + inputs={"X": dummy_output}, + outputs={"Out": []}, + attrs={"endpoints": endpoints}) def Recv(endpoints, get_vars, dummy_input=None, sync=True): @@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True): attrs={"endpoints": endpoints, "epmap": epmap}) if sync: - helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints}) + helper.append_op( + type="fetch_barrier", + outputs={"Out": get_vars}, + attrs={"endpoints": endpoints}) return get_vars diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ffb9858108a6843a35a886e402a8d23fe77a1238..5642c8c8ef99cb6f0d635acfb8507396e080ea23 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -17,6 +17,7 @@ All layers just related to the neural network. from __future__ import print_function +import numpy as np from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable @@ -24,7 +25,6 @@ from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc from .tensor import concat from . import utils -import random from .. import unique_name from functools import reduce @@ -54,6 +54,7 @@ __all__ = [ 'conv2d_transpose', 'conv3d_transpose', 'sequence_expand', + 'sequence_pad', 'lstm_unit', 'reduce_sum', 'reduce_mean', @@ -87,6 +88,7 @@ __all__ = [ 'lod_reset', 'lrn', 'pad', + 'pad_constant_like', 'label_smooth', 'roi_pool', 'dice_loss', @@ -105,6 +107,7 @@ __all__ = [ 'flatten', 'sequence_mask', 'stack', + 'unstack', 'sequence_enumerate', ] @@ -2656,6 +2659,51 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp +@templatedoc() +def sequence_pad(x, pad_value, maxlen=None): + """ + ${comment} + + Args: + x(Variable): Input variable which should contain lod information. + pad_value(Variable): The Variable that holds values that will be fill + into padded steps. It can be a scalar or a tensor whose shape + equals to time steps in sequences. If it's a scalar, it will be + automatically broadcasted to the shape of time step. + maxlen(int, default None): The length of padded sequences. It can be + None or any positive int. 
When it is None, all sequences will be + padded up to the length of the longest one among them; when it a + certain positive value, it must be greater than the length of the + longest original sequence." + + Returns: + Variable: The padded sequence batch. All sequences has the same length. + + Examples: + .. code-block:: python + + import numpy + + x = fluid.layers.data(name='y', shape=[10, 5], + dtype='float32', lod_level=1) + pad_value = fluid.layers.assign(input=numpy.array([0])) + out = fluid.layers.sequence_pad(x=x, pad_value=pad_value) + """ + + helper = LayerHelper('sequence_pad', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + if maxlen is None: + maxlen = -1 + helper.append_op( + type='sequence_pad', + inputs={'X': x, + 'PadValue': pad_value}, + outputs={'Out': out}, + attrs={'padded_length': maxlen}) + return out + + def beam_search(pre_ids, pre_scores, ids, @@ -4709,6 +4757,86 @@ def pad(x, paddings, pad_value=0., name=None): return out +def pad_constant_like(x, y, pad_value=0., name=None): + """ + Pad input(Y) with :attr:`pad_value`, the number of values padded to + the edges of each axis is specified by the difference of the shape + of X and Y. ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) + unique pad widths for each axis. The input should be a k-D + tensor(k > 0 and k < 7). + + See below for an example. + + .. code-block:: text + + Given: + X = [[[[ 0, 1, 2], + [ 3, 4, 5]], + [[ 6, 7, 8], + [ 9, 10, 11]], + [[12, 13, 14], + [15, 16, 17]]], + [[[18, 19, 20], + [21, 22, 23]], + [[24, 25, 26], + [27, 28, 29]], + [[30, 31, 32], + [33, 34, 35]]]] + X.shape = (2, 3, 2, 3) + + Y = [[[[35, 36, 37]], + [[38, 39, 40]], + [[41, 42, 43]]]] + Y.shape = (1, 3, 1, 3) + + And + pad_value = -1, + + Return: + Out = [[[[35, 36, 37], + [-1, -1, -1]], + [[38, 39, 40], + [-1, -1, -1]], + [[41, 42, 43], + [-1, -1, -1]]], + [[[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]]]] + Out.shape = (2, 3, 2, 3) + + Args: + x (Variable): The input tensor variable. + y (Variable): The input tensor variable. + pad_value (float): The constant value used to pad. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The padded tensor variable. + + Examples: + .. code-block:: python + + # x is a rank 4 tensor variable, x.shape = (2, 3, 2, 3) + # y is a rank 4 tensor variable, y.shape = (1, 3, 1, 3) + out = fluid.layers.pad_constant_like(x=x, y=y, pad_value=0.) 
+ # out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3] + """ + helper = LayerHelper('pad_constant_like', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='pad_constant_like', + inputs={'X': x, + 'Y': y}, + outputs={'Out': out}, + attrs={'pad_value': float(pad_value)}) + return out + + def label_smooth(label, prior_dist=None, epsilon=0.1, @@ -5103,7 +5231,7 @@ def random_crop(x, shape, seed=None): dtype = x.dtype out = helper.create_tmp_variable(dtype) if seed is None: - seed = random.randint(-65536, 65535) + seed = np.random.randint(-65536, 65536) op_attrs = {"shape": shape} if isinstance(seed, int): op_attrs["startup_seed"] = seed @@ -5305,7 +5433,7 @@ def crop(x, shape=None, offsets=None, name=None): helper = LayerHelper('crop', **locals()) if not (isinstance(shape, list) or isinstance(shape, tuple) or \ - isinstance(shape, Variable)): + isinstance(shape, Variable)): raise ValueError("The shape should be a list, tuple or Variable.") if offsets is None: @@ -5417,7 +5545,7 @@ def prelu(x, mode, param_attr=None, name=None): channel:elements in a channel share same weight element:each element has a weight name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Returns: Variable: The output tensor with the same shape as input. @@ -5576,23 +5704,23 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: - + .. math:: - + y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n)) Args: - x (Variable): Input tensor of sequence_mask layer, + x (Variable): Input tensor of sequence_mask layer, whose elements are integers less than :code:`maxlen`. maxlen (int|None): Maximum length of the sequence. If :code:`maxlen` is None, it would be replace with :math:`max(x)`. dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output. - name (str|None): A name for this layer(optional). If set None, the - layer will be named automatically. - + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + Returns: Variable: The output sequence mask. - + """ helper = LayerHelper('sequence_mask', **locals()) @@ -5617,23 +5745,23 @@ def stack(x, axis=0): **Stack Layer** This layer stacks all of the input :code:`x` along axis. - - Input :code:`x` can be a single variable, a :code:`list` of variables, - or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or - :code:`tuple`, the shapes of all these variables must be the same. - Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`, - the shape of the output variable would be - :math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`. + + Input :code:`x` can be a single variable, a :code:`list` of variables, + or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or + :code:`tuple`, the shapes of all these variables must be the same. + Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`, + the shape of the output variable would be + :math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`. If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`. - If :code:`axis` is None, it would be replaced with 0. + If :code:`axis` is None, it would be replaced with 0. Args: - x (Variable|list(Variable)|tuple(Variable)): Input variables. 
+ x (Variable|list(Variable)|tuple(Variable)): Input variables. axis (int|None): The axis along which all inputs are stacked. - + Returns: Variable: The stacked variable. - + """ helper = LayerHelper('stack', **locals()) @@ -5648,3 +5776,44 @@ def stack(x, axis=0): attrs={'axis': axis}) return out + + +def unstack(x, axis=0, num=None): + """ + **UnStack Layer** + + This layer unstacks input :code:`x` into several tensors along axis. + + If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`. + If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`, + and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is + raised. + + Args: + x (Variable): Input variable. + axis (int): The axis along which the input is unstacked. + num (int|None): The number of output variables. + + Returns: + list(Variable): The unstacked variables. + + """ + + helper = LayerHelper('unstack', **locals()) + if num is None: + if axis is None or x.shape[axis] <= 0: + raise ValueError('unknown unstack number') + else: + num = x.shape[axis] + + outs = [] + for _ in range(num): + outs.append(helper.create_tmp_variable(x.dtype)) + + helper.append_op( + type='unstack', + inputs={'X': [x]}, + outputs={'Y': outs}, + attrs={'axis': axis, + 'num': num}) + return outs diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 031ddd09a0b27b050b6ac651e4d8c46854092b2f..6b9749a5799ecc0b26babbf088614d6b5de2a5dd 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -46,10 +46,12 @@ class Optimizer(object): def __init__(self, learning_rate, regularization=None, - LARS_weight_decay=0.0): + LARS_weight_decay=0.0, + name=None): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError("learning rate should be float or Variable") + self._name = name self.regularization = regularization self._learning_rate = learning_rate # the learning rate type should be inferred from loss @@ -153,6 +155,8 @@ class Optimizer(object): dtype: data type of the accumulator variable fill_value: value to initialize the accumulator variable """ + if self._name is not None: + name = self._name + "_" + name if (name in self._accumulators and param.name in self._accumulators[name]): raise Exception("Accumulator {} already exists for parameter {}". format(name, param.name)) @@ -181,6 +185,8 @@ class Optimizer(object): Returns: accumulator variable for the parameter """ + if self._name is not None: + name = self._name + "_" + name if (name not in self._accumulators or param.name not in self._accumulators[name]): raise Exception("Accumulator {} does not exist for parameter {}". format(name, param.name))
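A quick usage sketch for the new `unstack`/`stack` pair (illustrative variable names, not part of the patch): `unstack` splits a tensor into `num` slices along `axis`, and `stack` is its inverse.

.. code-block:: python

    import paddle.fluid as fluid

    # x: a [5, 10] tensor; unstack along axis 0 into five [10] tensors.
    x = fluid.layers.data(
        name='x', shape=[5, 10], append_batch_size=False, dtype='float32')
    ys = fluid.layers.unstack(x, axis=0, num=5)  # list of 5 Variables
    x_again = fluid.layers.stack(ys, axis=0)     # back to shape [5, 10]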
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index 3951e7b8ca649b63eea4b311f6205a6c7d761804..a231bbfbc8d5712275c92b4d27580016825ea91b 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -125,8 +125,8 @@ opts = optimizer.minimize(avg_cost) batch_size = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size) -# fluid.memory_optimize(fluid.default_main_program(), level=0) -fluid.release_memory(fluid.default_main_program()) +fluid.memory_optimize(fluid.default_main_program(), level=0) +# fluid.release_memory(fluid.default_main_program()) BATCH_SIZE = 16 PASS_NUM = 1 diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index 1ad51936b5b8f7c5149452d6033754a570c72654..e520c8965089263d1ba10a6057acda1a53cc34a9 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -92,8 +92,8 @@ def main(): optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) - # fluid.memory_optimize(fluid.default_main_program()) - fluid.release_memory(fluid.default_main_program()) + fluid.memory_optimize(fluid.default_main_program()) + # fluid.release_memory(fluid.default_main_program()) # fix the order of training data train_data = paddle.batch( diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 1467e72caac26a3ea2a0c770d665141988696630..b71b440d3c9155e63220f5f2d7e849e8332bfb16 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -201,5 +201,44 @@ class TestDetectionMAP(unittest.TestCase): print(str(program)) +class TestGenerateProposals(unittest.TestCase): + def test_generate_proposals(self): + data_shape = [20, 64, 64] + images = fluid.layers.data( + name='images', shape=data_shape, dtype='float32') + im_info = fluid.layers.data( + name='im_info', shape=[1, 3], dtype='float32') + anchors, variances = fluid.layers.anchor_generator( + name='anchor_generator', + input=images, + anchor_sizes=[32, 64], + aspect_ratios=[1.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + num_anchors = anchors.shape[2] + scores = fluid.layers.data( + name='scores', shape=[1, num_anchors, 8, 8], dtype='float32') + bbox_deltas = fluid.layers.data( + name='bbox_deltas', + shape=[1, num_anchors * 4, 8, 8], + dtype='float32') + rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( + name='generate_proposals', + scores=scores, + bbox_deltas=bbox_deltas, + im_info=im_info, + anchors=anchors, + variances=variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0) + self.assertIsNotNone(rpn_rois) + self.assertIsNotNone(rpn_roi_probs) + print(rpn_rois.shape) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 
0387e911880256ea6b8efb6f2311bbf4c4f8c0f2..a4ffe7d40c40501ebd43fec0b664159227ea34bd 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -134,7 +134,7 @@ class SE_ResNeXt(): size=class_dim, act='softmax', param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.2))) + initializer=fluid.initializer.Constant(value=0.05))) return out def shortcut(self, input, ch_out, stride): @@ -184,7 +184,7 @@ class SE_ResNeXt(): act=None, # avoid pserver CPU init differs from GPU param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.2)), + initializer=fluid.initializer.Constant(value=0.05)), bias_attr=False) return fluid.layers.batch_norm(input=conv, act=act) @@ -192,13 +192,19 @@ class SE_ResNeXt(): pool = fluid.layers.pool2d( input=input, pool_size=0, pool_type='avg', global_pooling=True) stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) - squeeze = fluid.layers.fc(input=pool, - size=num_channels // reduction_ratio, - act='relu') + squeeze = fluid.layers.fc( + input=pool, + size=num_channels // reduction_ratio, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='relu') stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) - excitation = fluid.layers.fc(input=squeeze, - size=num_channels, - act='sigmoid') + excitation = fluid.layers.fc( + input=squeeze, + size=num_channels, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='sigmoid') scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) return scale diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 239adcb9d5900d4073a6c07cb189ab7503aea86e..179c2540f812feac32632540e9070b58451c9a90 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -18,54 +18,129 @@ import numpy as np import argparse import time import math +import os +import sys +import six +import argparse +import ast +import multiprocessing +import time +from functools import partial +from os.path import expanduser +import glob +import random +import tarfile import paddle import paddle.fluid as fluid +import paddle.fluid.layers as layers from paddle.fluid import core -import os -import sys -import six -import transformer_model -import paddle.dataset.wmt16 as wmt16 +from test_dist_base import TestDistRunnerBase, runtime_main +from paddle.compat import long_type + +import hashlib + +from paddle.fluid.transpiler.details import program_to_code + +const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001)) +const_bias_attr = const_para_attr # Fix seed for test fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 -WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio" +#from transformer_config import ModelHyperParams, TrainTaskConfig, merge_cfg_from_list +class TrainTaskConfig(object): + # only support GPU currently + use_gpu = True + # the epoch number to train. + pass_num = 1 + # the number of sequences contained in a mini-batch. + # deprecated, set batch_size in args. + batch_size = 20 + # the hyper parameters for Adam optimizer. + # This static learning_rate will be multiplied by the LearningRateScheduler- + # derived learning rate to get the final learning rate. + learning_rate = 1 + beta1 = 0.9 + beta2 = 0.98 + eps = 1e-9 + # the parameters for learning rate scheduling.
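+    # (The LearningRateScheduler defined further below computes, per step:
+    #  lr = static_lr * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5).)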
+ warmup_steps = 4000 + # the weight used to mix up the ground-truth distribution and the fixed + # uniform distribution in label smoothing when training. + # Set this as zero if label smoothing is not wanted. + label_smooth_eps = 0.1 + # the directory for saving trained models. + model_dir = "trained_models" + # the directory for saving checkpoints. + ckpt_dir = "trained_ckpts" + # the directory for loading checkpoint. + # If provided, continue training from the checkpoint. + ckpt_path = None + # the parameter to initialize the learning rate scheduler. + # It should be provided if using checkpoints, since the checkpoint doesn't + # include the training step counter currently. + start_step = 0 -class ModelHyperParams(object): - # Dictionary size for source and target language. This model directly uses - # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has - # alreay been added, but the <pad> token is not added. Transformer requires - # sequences in a mini-batch are padded to have the same length. A <pad> token is - # added into the original dictionary in paddle.dateset.wmt16. + check_acc = True - # size of source word dictionary. - src_vocab_size = 10000 - # index for <pad> token in source language. - src_pad_idx = src_vocab_size + data_path = expanduser("~") + ( + "/.cache/paddle/dataset/test_dist_transformer/") + src_vocab_fpath = data_path + "vocab.bpe.32000" + trg_vocab_fpath = data_path + "vocab.bpe.32000" + train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de" + val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de" + pool_size = 2000 + sort_type = None + local = True + shuffle = False + shuffle_batch = False + special_token = ['<s>', '<e>', '<unk>'] + token_delimiter = ' ' + use_token_batch = False - # size of target word dictionay - trg_vocab_size = 10000 - # index for <pad> token in target language. - trg_pad_idx = trg_vocab_size - # position value corresponding to the <pad> token. - pos_pad_idx = 0 +class InferTaskConfig(object): + use_gpu = True + # the number of examples in one run for sequence generation. + batch_size = 10 + # the parameters for beam search. + beam_size = 5 + max_out_len = 256 + # the number of decoded sentences to output. + n_best = 1 + # the flags indicating whether to output the special tokens. + output_bos = False + output_eos = False + output_unk = True + # the directory for loading the trained model. + model_path = "trained_models/pass_1.infer.model" - # max length of sequences. It should plus 1 to include position - # padding token for position encoding. - max_length = 50 +class ModelHyperParams(object): + # The following five vocabulary-related configurations will be set + # automatically according to the passed vocabulary path and special tokens. + # size of source word dictionary. + src_vocab_size = 10000 + # size of target word dictionary. + trg_vocab_size = 10000 + # index for <bos> token + bos_idx = 0 + # index for <eos> token + eos_idx = 1 + # index for <unk> token + unk_idx = 2 + # max length of sequences deciding the size of position encoding table. + # Start from 1 and count start and end tokens in. + max_length = 256 # the dimension for word embeddings, which is also the last dimension of # the input and output of multi-head attention, position-wise feed-forward # networks, encoder and decoder. - d_model = 512 # size of the hidden layer in position-wise feed-forward networks. - d_inner_hid = 1024 + d_inner_hid = 2048 # the dimension that keys are projected to for dot-product attention. d_key = 64 # the dimension that values are projected to for dot-product attention. @@ -75,95 +150,1521 @@ class ModelHyperParams(object): # number of sub-layers to be stacked in the encoder and decoder. n_layer = 6 # dropout rate used by all dropout layers. - dropout = 0.1 + dropout = 0.0 # no random + # random seed used in dropout for CE. + dropout_seed = None + # the flag indicating whether to share embedding and softmax weights. + # vocabularies in source and target should be same for weight sharing. + weight_sharing = True -def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): +def merge_cfg_from_list(cfg_list, g_cfgs): + """ + Set the above global configurations using the cfg_list. + """ + assert len(cfg_list) % 2 == 0 + for key, value in zip(cfg_list[0::2], cfg_list[1::2]): + for g_cfg in g_cfgs: + if hasattr(g_cfg, key): + try: + value = eval(value) + except Exception: # for file path + pass + setattr(g_cfg, key, value) + break + + +# The placeholder for batch_size in compile time. Must be -1 currently to be +# consistent with some ops' infer-shape output in compile time, such as the +# sequence_expand op used in beamsearch decoder. +batch_size = -1 +# The placeholder for sequence length in compile time. +seq_len = ModelHyperParams.max_length +# Here list the data shapes and data types of all inputs. +# The shapes here act as placeholder and are set to pass the infer-shape in +# compile time. +input_descs = { + # The actual data shape of src_word is: + # [batch_size * max_src_len_in_batch, 1] + "src_word": [(batch_size, seq_len, long_type(1)), "int64", 2], + # The actual data shape of src_pos is: + # [batch_size * max_src_len_in_batch, 1] + "src_pos": [(batch_size, seq_len, long_type(1)), "int64"], + # This input is used to remove attention weights on paddings in the + # encoder. + # The actual data shape of src_slf_attn_bias is: + # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch] + "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # The actual data shape of trg_word is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_word": [(batch_size, seq_len, long_type(1)), "int64", + 2], # lod_level is only used in fast decoder. + # The actual data shape of trg_pos is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_pos": [(batch_size, seq_len, long_type(1)), "int64"], + # This input is used to remove attention weights on paddings and + # subsequent words in the decoder. + # The actual data shape of trg_slf_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch] + "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # This input is used to remove attention weights on paddings of the source + # input in the encoder-decoder attention. + # The actual data shape of trg_src_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch] + "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # This input is used in independent decoder program for inference. + # The actual data shape of enc_output is: + # [batch_size, max_src_len_in_batch, d_model] + "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"], + # The actual data shape of label_word is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_word": [(batch_size * seq_len, long_type(1)), "int64"], + # This input is used to mask out the loss of padding tokens. + # The actual data shape of label_weight is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_weight": [(batch_size * seq_len, long_type(1)), "float32"], + # These inputs are used to change the shape tensor in beam-search decoder. + "trg_slf_attn_pre_softmax_shape_delta": [(long_type(2), ), "int32"], + "trg_slf_attn_post_softmax_shape_delta": [(long_type(4), ), "int32"], + "init_score": [(batch_size, long_type(1)), "float32"], +} + +# Names of word embedding table which might be reused for weight sharing. +word_emb_param_names = ( + "src_word_emb_table", + "trg_word_emb_table", ) +# Names of position encoding table which will be initialized externally. +pos_enc_param_names = ( + "src_pos_enc_table", + "trg_pos_enc_table", ) +# separated inputs for different usages. +encoder_data_input_fields = ( + "src_word", + "src_pos", + "src_slf_attn_bias", ) +decoder_data_input_fields = ( + "trg_word", + "trg_pos", + "trg_slf_attn_bias", + "trg_src_attn_bias", + "enc_output", ) +label_data_input_fields = ( + "lbl_word", + "lbl_weight", ) +# In fast decoder, trg_pos (only containing the current time step) is generated +# by ops and trg_slf_attn_bias is not needed. +fast_decoder_data_input_fields = ( + "trg_word", + "init_score", + "trg_src_attn_bias", ) + +# fast_decoder_util_input_fields = ( +# "trg_slf_attn_pre_softmax_shape_delta", +# "trg_slf_attn_post_softmax_shape_delta", ) + + +#from optim import LearningRateScheduler +class LearningRateScheduler(object): + """ + Wrapper for learning rate scheduling as described in the Transformer paper. + LearningRateScheduler adapts the learning rate externally and the adapted + learning rate will be fed into the main_program as input data. + """ + + def __init__(self, + d_model, + warmup_steps, + learning_rate=0.001, + current_steps=0, + name="learning_rate"): + self.current_steps = current_steps + self.warmup_steps = warmup_steps + self.d_model = d_model + self.static_lr = learning_rate + self.learning_rate = layers.create_global_var( + name=name, + shape=[1], + value=float(learning_rate), + dtype="float32", + persistable=True) + + def update_learning_rate(self): + self.current_steps += 1 + lr_value = np.power(self.d_model, -0.5) * np.min([ + np.power(self.current_steps, -0.5), + np.power(self.warmup_steps, -1.5) * self.current_steps + ]) * self.static_lr + return np.array([lr_value], dtype="float32") + + +#from transformer_train import train_loop +def pad_batch_data(insts, + pad_idx, + n_head, + is_target=False, + is_label=False, + return_attn_bias=True, + return_max_len=True, + return_num_token=False): """ Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. Then, convert the numpy - data to tensors and return a dict mapping names to tensors. + corresponding position data and attention bias. """ + return_list = [] + max_len = max(len(inst) for inst in insts) + num_token = reduce(lambda x, y: x + y, + [len(inst) for inst in insts]) if return_num_token else 0 + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and make no effect on parameter gradients. + inst_data = np.array( + [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, 1])] + if is_label: # label weight + inst_weight = np.array( + [[1.] * len(inst) + [0.]
* (max_len - len(inst)) for inst in insts]) + return_list += [inst_weight.astype("float32").reshape([-1, 1])] + else: # position data + inst_pos = np.array([ + range(1, len(inst) + 1) + [0] * (max_len - len(inst)) + for inst in insts + ]) + return_list += [inst_pos.astype("int64").reshape([-1, 1])] + if return_attn_bias: + if is_target: + # This is used to avoid attention on paddings and subsequent + # words. + slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, + 1).reshape([-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + if return_num_token: + return_list += [num_token] + return return_list if len(return_list) > 1 else return_list[0] + + +def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx, + n_head, d_model): + """ + Put all padded data needed by training into a dict. + """ + src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) + src_word = src_word.reshape(-1, src_max_len, 1) + src_pos = src_pos.reshape(-1, src_max_len, 1) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) + trg_word = trg_word.reshape(-1, trg_max_len, 1) + trg_pos = trg_pos.reshape(-1, trg_max_len, 1) - def __pad_batch_data(insts, - pad_idx, - is_target=False, - return_pos=True, - return_attn_bias=True, - return_max_len=True): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. - """ - return_list = [] - max_len = max(len(inst) for inst in insts) - inst_data = np.array( - [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) - return_list += [inst_data.astype("int64").reshape([-1, 1])] - if return_pos: - inst_pos = np.array([[ - pos_i + 1 if w_i != pad_idx else 0 - for pos_i, w_i in enumerate(inst) - ] for inst in inst_data]) - - return_list += [inst_pos.astype("int64").reshape([-1, 1])] - if return_attn_bias: - if is_target: - # This is used to avoid attention on paddings and subsequent - # words. - slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, - max_len)) - slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( - [-1, 1, max_len, max_len]) - slf_attn_bias_data = np.tile(slf_attn_bias_data, - [1, n_head, 1, 1]) * [-1e9] - else: - # This is used to avoid attention on paddings. 
- slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * - (max_len - len(inst)) - for inst in insts]) - slf_attn_bias_data = np.tile( - slf_attn_bias_data.reshape([-1, 1, 1, max_len]), - [1, n_head, max_len, 1]) - return_list += [slf_attn_bias_data.astype("float32")] - if return_max_len: - return_list += [max_len] - return return_list if len(return_list) > 1 else return_list[0] - - src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( - [inst[0] for inst in insts], src_pad_idx, is_target=False) - trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( - [inst[1] for inst in insts], trg_pad_idx, is_target=True) trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1]).astype("float32") - lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False, - False, False, False) - lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) + lbl_word, lbl_weight, num_token = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False, + return_num_token=True) + + data_input_dict = dict( + zip(data_input_names, [ + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ])) + return data_input_dict, np.asarray([num_token], dtype="float32") + + +def read_multiple(reader, count, clip_last=True): + """ + Stack data from reader for multi-devices. + """ + + def __impl__(): + res = [] + for item in reader(): + res.append(item) + if len(res) == count: + yield res + res = [] + if len(res) == count: + yield res + elif not clip_last: + data = [] + for item in res: + data += item + if len(data) > count: + inst_num_per_part = len(data) // count + yield [ + data[inst_num_per_part * i:inst_num_per_part * (i + 1)] + for i in range(count) + ] + + return __impl__ + + +def split_data(data, num_part): + """ + Split data for each device. + """ + if len(data) == num_part: + return data + data = data[0] + inst_num_per_part = len(data) // num_part return [ - src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, - trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + data[inst_num_per_part * i:inst_num_per_part * (i + 1)] + for i in range(num_part) ] -def transformer(use_feed): - assert not use_feed, "transfomer doesn't support feed yet" - return transformer_model.transformer( - ModelHyperParams.src_vocab_size + 1, - ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, - ModelHyperParams.n_layer, ModelHyperParams.n_head, - ModelHyperParams.d_key, ModelHyperParams.d_value, - ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, - ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) +def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, + sum_cost, token_num): + # Context to do validation. 
+ test_program = train_progm.clone() + with fluid.program_guard(test_program): + test_program = fluid.io.get_inference_program([avg_cost]) + + val_data = DataReader( + src_vocab_fpath=TrainTaskConfig.src_vocab_fpath, + trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath, + fpattern=TrainTaskConfig.val_file_pattern, + token_delimiter=TrainTaskConfig.token_delimiter, + use_token_batch=TrainTaskConfig.use_token_batch, + batch_size=TrainTaskConfig.batch_size * + (1 if TrainTaskConfig.use_token_batch else dev_count), + pool_size=TrainTaskConfig.pool_size, + sort_type=TrainTaskConfig.sort_type, + start_mark=TrainTaskConfig.special_token[0], + end_mark=TrainTaskConfig.special_token[1], + unk_mark=TrainTaskConfig.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False, + shuffle=False, + shuffle_batch=False) + + build_strategy = fluid.BuildStrategy() + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + + test_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + main_program=test_program, + share_vars_from=train_exe, + build_strategy=build_strategy, + exec_strategy=strategy) + + def test(exe=test_exe): + test_total_cost = 0 + test_total_token = 0 + test_data = read_multiple( + reader=val_data.batch_generator, + count=dev_count if TrainTaskConfig.use_token_batch else 1) + for batch_id, data in enumerate(test_data()): + feed_list = [] + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, _ = prepare_batch_input( + data_buffer, data_input_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) + feed_list.append(data_input_dict) + + outs = exe.run(feed=feed_list, + fetch_list=[sum_cost.name, token_num.name]) + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + test_total_cost += sum_cost_val.sum() + test_total_token += token_num_val.sum() + test_avg_cost = test_total_cost / test_total_token + test_ppl = np.exp([min(test_avg_cost, 100)]) + return test_avg_cost, test_ppl + + return test + + +def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, + token_num, predict): + # Initialize the parameters. + if TrainTaskConfig.ckpt_path: + lr_scheduler.current_steps = TrainTaskConfig.start_step + else: + exe.run(fluid.framework.default_startup_program()) + + train_data = DataReader( + src_vocab_fpath=TrainTaskConfig.src_vocab_fpath, + trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath, + fpattern=TrainTaskConfig.train_file_pattern, + token_delimiter=TrainTaskConfig.token_delimiter, + use_token_batch=TrainTaskConfig.use_token_batch, + batch_size=TrainTaskConfig.batch_size * + (1 if TrainTaskConfig.use_token_batch else dev_count), + pool_size=TrainTaskConfig.pool_size, + sort_type=TrainTaskConfig.sort_type, + shuffle=TrainTaskConfig.shuffle, + shuffle_batch=TrainTaskConfig.shuffle_batch, + start_mark=TrainTaskConfig.special_token[0], + end_mark=TrainTaskConfig.special_token[1], + unk_mark=TrainTaskConfig.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False) + train_data = read_multiple( + reader=train_data.batch_generator, + count=dev_count if TrainTaskConfig.use_token_batch else 1) + + build_strategy = fluid.BuildStrategy() + # Since the token number differs among devices, customize gradient scale to + # use token average cost among multi-devices. 
and the gradient scale is + # `1 / token_number` for average cost. + build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + + train_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + loss_name=sum_cost.name, + main_program=train_progm, + build_strategy=build_strategy, + exec_strategy=strategy) + + data_input_names = encoder_data_input_fields + decoder_data_input_fields[: + -1] + label_data_input_fields + + if TrainTaskConfig.val_file_pattern is not None: + test = test_context(train_progm, avg_cost, train_exe, dev_count, + data_input_names, sum_cost, token_num) + + # the best cross-entropy value with label smoothing + loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log( + (1. - TrainTaskConfig.label_smooth_eps + )) + TrainTaskConfig.label_smooth_eps * + np.log(TrainTaskConfig.label_smooth_eps / ( + ModelHyperParams.trg_vocab_size - 1) + 1e-20)) + init = False + for pass_id in xrange(TrainTaskConfig.pass_num): + pass_start_time = time.time() + for batch_id, data in enumerate(train_data()): + if batch_id >= 5: + break + + feed_list = [] + total_num_token = 0 + + #if TrainTaskConfig.local: + # lr_rate = lr_scheduler.update_learning_rate() + #for place_id, data_buffer in enumerate( + # split_data( + # data, num_part=dev_count)): + + if TrainTaskConfig.local: + lr_rate = lr_scheduler.update_learning_rate() + + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, num_token = prepare_batch_input( + data_buffer, data_input_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) + total_num_token += num_token + feed_kv_pairs = data_input_dict.items() + if TrainTaskConfig.local: + feed_kv_pairs += { + lr_scheduler.learning_rate.name: lr_rate + }.items() + feed_list.append(dict(feed_kv_pairs)) + + if not init: + for pos_enc_param_name in pos_enc_param_names: + pos_enc = position_encoding_init( + ModelHyperParams.max_length + 1, + ModelHyperParams.d_model) + feed_list[place_id][pos_enc_param_name] = pos_enc + + if not TrainTaskConfig.check_acc: + for feed_dict in feed_list: + feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token + else: + b = 100 * TrainTaskConfig.batch_size + a = np.asarray([b], dtype="float32") + for feed_dict in feed_list: + feed_dict[sum_cost.name + "@GRAD"] = 1. / a + + outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name], + feed=feed_list) + + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + total_sum_cost = sum_cost_val.sum() + total_token_num = token_num_val.sum() + total_avg_cost = total_sum_cost / total_token_num + + init = True + + # Validate and save the model for inference. 
+ if TrainTaskConfig.val_file_pattern is not None: + val_avg_cost, val_ppl = test() + print("[%f]" % val_avg_cost) + else: + assert (False) + + +#import transformer_reader as reader +class SortType(object): + GLOBAL = 'global' + POOL = 'pool' + NONE = "none" + + +class Converter(object): + def __init__(self, vocab, beg, end, unk, delimiter): + self._vocab = vocab + self._beg = beg + self._end = end + self._unk = unk + self._delimiter = delimiter + + def __call__(self, sentence): + return [self._beg] + [ + self._vocab.get(w, self._unk) + for w in sentence.split(self._delimiter) + ] + [self._end] + + +class ComposedConverter(object): + def __init__(self, converters): + self._converters = converters + + def __call__(self, parallel_sentence): + return [ + self._converters[i](parallel_sentence[i]) + for i in range(len(self._converters)) + ] + + +class SentenceBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self._batch_size = batch_size + + def append(self, info): + self.batch.append(info) + if len(self.batch) == self._batch_size: + tmp = self.batch + self.batch = [] + return tmp + + +class TokenBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self.max_len = -1 + self._batch_size = batch_size + + def append(self, info): + cur_len = info.max_len + max_len = max(self.max_len, cur_len) + if max_len * (len(self.batch) + 1) > self._batch_size: + result = self.batch + self.batch = [info] + self.max_len = cur_len + return result + else: + self.max_len = max_len + self.batch.append(info) + + +class SampleInfo(object): + def __init__(self, i, max_len, min_len): + self.i = i + self.min_len = min_len + self.max_len = max_len + + +class MinMaxFilter(object): + def __init__(self, max_len, min_len, underlying_creator): + self._min_len = min_len + self._max_len = max_len + self._creator = underlying_creator + + def append(self, info): + if info.max_len > self._max_len or info.min_len < self._min_len: + return + else: + return self._creator.append(info) + + @property + def batch(self): + return self._creator.batch + + +class DataReader(object): + """ + The data reader loads all data from files and produces batches of data + in the way corresponding to settings. + + An example of returning a generator producing data batches whose data + is shuffled in each pass and sorted in each pool: + + ``` + train_data = DataReader( + src_vocab_fpath='data/src_vocab_file', + trg_vocab_fpath='data/trg_vocab_file', + fpattern='data/part-*', + use_token_batch=True, + batch_size=2000, + pool_size=10000, + sort_type=SortType.POOL, + shuffle=True, + shuffle_batch=True, + start_mark='<s>', + end_mark='<e>', + unk_mark='<unk>', + clip_last_batch=False).batch_generator + ``` + + :param src_vocab_fpath: The path of vocabulary file of source language. + :type src_vocab_fpath: basestring + :param trg_vocab_fpath: The path of vocabulary file of target language. + :type trg_vocab_fpath: basestring + :param fpattern: The pattern to match data files. + :type fpattern: basestring + :param batch_size: The number of sequences contained in a mini-batch, + or the maximum number of tokens (include paddings) contained in a + mini-batch. + :type batch_size: int + :param pool_size: The size of pool buffer. + :type pool_size: int + :param sort_type: The grain to sort by length: 'global' for all + instances; 'pool' for instances in pool; 'none' for no sort. + :type sort_type: basestring + :param clip_last_batch: Whether to clip the last uncompleted batch. + :type clip_last_batch: bool + :param tar_fname: The data file in tar if fpattern matches a tar file. + :type tar_fname: basestring + :param min_length: The minimum length used to filter sequences. + :type min_length: int + :param max_length: The maximum length used to filter sequences. + :type max_length: int + :param shuffle: Whether to shuffle all instances. + :type shuffle: bool + :param shuffle_batch: Whether to shuffle the generated batches. + :type shuffle_batch: bool + :param use_token_batch: Whether to produce batch data according to + token number. + :type use_token_batch: bool + :param field_delimiter: The delimiter used to split source and target in + each line of data file. + :type field_delimiter: basestring + :param token_delimiter: The delimiter used to split tokens in source or + target sentences. + :type token_delimiter: basestring + :param start_mark: The token representing the beginning of + sentences in the dictionary. + :type start_mark: basestring + :param end_mark: The token representing the end of sentences + in the dictionary. + :type end_mark: basestring + :param unk_mark: The token representing an unknown word in the dictionary. + :type unk_mark: basestring + :param seed: The seed for random. + :type seed: int + """ + + def __init__(self, + src_vocab_fpath, + trg_vocab_fpath, + fpattern, + batch_size, + pool_size, + sort_type=SortType.GLOBAL, + clip_last_batch=True, + tar_fname=None, + min_length=0, + max_length=100, + shuffle=True, + shuffle_batch=False, + use_token_batch=False, + field_delimiter="\t", + token_delimiter=" ", + start_mark="<s>", + end_mark="<e>", + unk_mark="<unk>", + seed=0): + self._src_vocab = self.load_dict(src_vocab_fpath) + self._only_src = True + if trg_vocab_fpath is not None: + self._trg_vocab = self.load_dict(trg_vocab_fpath) + self._only_src = False + self._pool_size = pool_size + self._batch_size = batch_size + self._use_token_batch = use_token_batch + self._sort_type = sort_type + self._clip_last_batch = clip_last_batch + self._shuffle = shuffle + self._shuffle_batch = shuffle_batch + self._min_length = min_length + self._max_length = max_length + self._field_delimiter = field_delimiter + self._token_delimiter = token_delimiter + self.load_src_trg_ids(end_mark, fpattern, start_mark, tar_fname, + unk_mark) + self._random = random.Random(x=seed) + + def load_src_trg_ids(self, end_mark, fpattern, start_mark, tar_fname, + unk_mark): + converters = [ + Converter( + vocab=self._src_vocab, + beg=self._src_vocab[start_mark], + end=self._src_vocab[end_mark], + unk=self._src_vocab[unk_mark], + delimiter=self._token_delimiter) + ] + if not self._only_src: + converters.append( + Converter( + vocab=self._trg_vocab, + beg=self._trg_vocab[start_mark], + end=self._trg_vocab[end_mark], + unk=self._trg_vocab[unk_mark], + delimiter=self._token_delimiter)) + + converters = ComposedConverter(converters) + + self._src_seq_ids = [] + self._trg_seq_ids = None if self._only_src else [] + self._sample_infos = [] + + for i, line in enumerate(self._load_lines(fpattern, tar_fname)): + src_trg_ids = converters(line) + self._src_seq_ids.append(src_trg_ids[0]) + lens = [len(src_trg_ids[0])] + if not self._only_src: + self._trg_seq_ids.append(src_trg_ids[1]) + lens.append(len(src_trg_ids[1])) + self._sample_infos.append(SampleInfo(i, max(lens), min(lens))) + + def _load_lines(self, fpattern, tar_fname): + fpaths = glob.glob(fpattern) + + if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]): + if tar_fname is None: + raise Exception("If tar file provided, please set tar_fname.") +
+ f = tarfile.open(fpaths[0], "r") + for line in f.extractfile(tar_fname): + fields = line.strip("\n").split(self._field_delimiter) + if (not self._only_src and len(fields) == 2) or ( + self._only_src and len(fields) == 1): + yield fields + else: + for fpath in fpaths: + if not os.path.isfile(fpath): + raise IOError("Invalid file: %s" % fpath) + + with open(fpath, "r") as f: + for line in f: + fields = line.strip("\n").split(self._field_delimiter) + if (not self._only_src and len(fields) == 2) or ( + self._only_src and len(fields) == 1): + yield fields + + @staticmethod + def load_dict(dict_path, reverse=False): + word_dict = {} + with open(dict_path, "r") as fdict: + for idx, line in enumerate(fdict): + if reverse: + word_dict[idx] = line.strip("\n") + else: + word_dict[line.strip("\n")] = idx + return word_dict + + def batch_generator(self): + # global sort or global shuffle + if self._sort_type == SortType.GLOBAL: + infos = sorted( + self._sample_infos, key=lambda x: x.max_len, reverse=True) + else: + if self._shuffle: + infos = self._sample_infos + self._random.shuffle(infos) + else: + infos = self._sample_infos + + if self._sort_type == SortType.POOL: + for i in range(0, len(infos), self._pool_size): + infos[i:i + self._pool_size] = sorted( + infos[i:i + self._pool_size], key=lambda x: x.max_len) + + # concat batch + batches = [] + batch_creator = TokenBatchCreator( + self._batch_size + ) if self._use_token_batch else SentenceBatchCreator(self._batch_size) + batch_creator = MinMaxFilter(self._max_length, self._min_length, + batch_creator) + + for info in infos: + batch = batch_creator.append(info) + if batch is not None: + batches.append(batch) + + if not self._clip_last_batch and len(batch_creator.batch) != 0: + batches.append(batch_creator.batch) + + if self._shuffle_batch: + self._random.shuffle(batches) + + for batch in batches: + batch_ids = [info.i for info in batch] + + if self._only_src: + yield [[self._src_seq_ids[idx]] for idx in batch_ids] + else: + yield [(self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1], + self._trg_seq_ids[idx][1:]) for idx in batch_ids] + + +#from transformer_model import transformer +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. + """ + position_enc = np.array([[ + pos / np.power(10000, 2 * (j // 2) / d_pos_vec) + for j in range(d_pos_vec) + ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + return position_enc.astype("float32") + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing the softmax activation to mask certain selected positions so that + they will not be considered in attention weights. + """ + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs: queries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values.
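+ Each projection is a position-wise fc that maps the last dimension to n_head * d_key (for q and k) or n_head * d_value (for v).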
+        """
+        q = layers.fc(input=queries,
+                      size=d_key * n_head,
+                      num_flatten_dims=2,
+                      param_attr=const_para_attr,
+                      bias_attr=const_bias_attr)
+        k = layers.fc(input=keys,
+                      size=d_key * n_head,
+                      num_flatten_dims=2,
+                      param_attr=const_para_attr,
+                      bias_attr=const_bias_attr)
+        v = layers.fc(input=values,
+                      size=d_value * n_head,
+                      num_flatten_dims=2,
+                      param_attr=const_para_attr,
+                      bias_attr=const_bias_attr)
+        return q, k, v
+
+    def __split_heads(x, n_head):
+        """
+        Reshape the last dimension of input tensor x so that it becomes two
+        dimensions and then transpose. Specifically, input a tensor with shape
+        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
+        with shape [bs, n_head, max_sequence_length, hidden_dim].
+        """
+        if n_head == 1:
+            return x
+
+        hidden_size = x.shape[-1]
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
+        reshaped = layers.reshape(
+            x=x, shape=[0, 0, n_head, hidden_size // n_head])
+
+        # permute the dimensions into:
+        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
+        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+
+    def __combine_heads(x):
+        """
+        Transpose and then reshape the last two dimensions of input tensor x
+        so that it becomes one dimension, which is reverse to __split_heads.
+        """
+        if len(x.shape) == 3: return x
+        if len(x.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+
+        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
+        return layers.reshape(
+            x=trans_x,
+            shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))
+
+    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
+        """
+        Scaled Dot-Product Attention
+        """
+        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
+        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+        if attn_bias:
+            product += attn_bias
+        weights = layers.softmax(product)
+        if dropout_rate:
+            weights = layers.dropout(
+                weights,
+                dropout_prob=dropout_rate,
+                seed=ModelHyperParams.dropout_seed,
+                is_test=False)
+        out = layers.matmul(weights, v)
+        return out
+
+    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
+
+    if cache is not None:  # use cache and concat time steps
+        k = cache["k"] = layers.concat([cache["k"], k], axis=1)
+        v = cache["v"] = layers.concat([cache["v"], v], axis=1)
+
+    q = __split_heads(q, n_head)
+    k = __split_heads(k, n_head)
+    v = __split_heads(v, n_head)
+
+    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
+                                                  dropout_rate)
+
+    out = __combine_heads(ctx_multiheads)
+
+    # Project back to the model size.
+    proj_out = layers.fc(input=out,
+                         size=d_model,
+                         num_flatten_dims=2,
+                         param_attr=const_para_attr,
+                         bias_attr=const_bias_attr)
+    return proj_out
+
+
+def positionwise_feed_forward(x, d_inner_hid, d_hid):
+    """
+    Position-wise Feed-Forward Networks.
+    This module consists of two linear transformations with a ReLU activation
+    in between, which is applied to each position separately and identically.
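+    Using fc with num_flatten_dims=2 applies the same weights at every
+    position of the [batch_size, seq_len, hidden] input.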
+    """
+    hidden = layers.fc(input=x,
+                       size=d_inner_hid,
+                       num_flatten_dims=2,
+                       act="relu",
+                       param_attr=const_para_attr,
+                       bias_attr=const_bias_attr)
+    out = layers.fc(input=hidden,
+                    size=d_hid,
+                    num_flatten_dims=2,
+                    param_attr=const_para_attr,
+                    bias_attr=const_bias_attr)
+    return out
+
+
+def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
+    """
+    Add residual connection, layer normalization and dropout to the out tensor
+    optionally according to the value of process_cmd.
+    This will be used before or after multi-head attention and position-wise
+    feed-forward networks.
+    """
+    for cmd in process_cmd:
+        if cmd == "a":  # add residual connection
+            out = out + prev_out if prev_out else out
+        elif cmd == "n":  # add layer normalization
+            out = layers.layer_norm(
+                out,
+                begin_norm_axis=len(out.shape) - 1,
+                param_attr=fluid.initializer.Constant(1.),
+                bias_attr=fluid.initializer.Constant(0.))
+        elif cmd == "d":  # add dropout
+            if dropout_rate:
+                out = layers.dropout(
+                    out,
+                    dropout_prob=dropout_rate,
+                    seed=ModelHyperParams.dropout_seed,
+                    is_test=False)
+    return out
+
+
+pre_process_layer = partial(pre_post_process_layer, None)
+post_process_layer = pre_post_process_layer
+
+
+def prepare_encoder(src_word,
+                    src_pos,
+                    src_vocab_size,
+                    src_emb_dim,
+                    src_max_len,
+                    dropout_rate=0.,
+                    word_emb_param_name=None,
+                    pos_enc_param_name=None):
+    """Add word embeddings and position encodings.
+    The output tensor has a shape of:
+    [batch_size, max_src_length_in_batch, d_model].
+    This module is used at the bottom of the encoder stacks.
+    """
+    if TrainTaskConfig.check_acc:
+        src_word_emb = layers.embedding(
+            src_word,
+            size=[src_vocab_size, src_emb_dim],
+            param_attr=fluid.ParamAttr(
+                name=word_emb_param_name,
+                initializer=fluid.initializer.ConstantInitializer(0.001)))
+    else:
+        src_word_emb = layers.embedding(
+            src_word,
+            size=[src_vocab_size, src_emb_dim],
+            param_attr=fluid.ParamAttr(
+                name=word_emb_param_name,
+                initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
+
+    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
+    src_pos_enc = layers.embedding(
+        src_pos,
+        size=[src_max_len, src_emb_dim],
+        param_attr=fluid.ParamAttr(
+            name=pos_enc_param_name,
+            trainable=False,
+            initializer=fluid.initializer.ConstantInitializer(0.001)))
+    enc_input = src_word_emb + src_pos_enc
+    return layers.dropout(
+        enc_input,
+        dropout_prob=dropout_rate,
+        seed=ModelHyperParams.dropout_seed,
+        is_test=False) if dropout_rate else enc_input
+
+
+prepare_encoder = partial(
+    prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
+prepare_decoder = partial(
+    prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
+
+
+def encoder_layer(enc_input,
+                  attn_bias,
+                  n_head,
+                  d_key,
+                  d_value,
+                  d_model,
+                  d_inner_hid,
+                  dropout_rate=0.):
+    """The encoder layers that can be stacked to form a deep encoder.
+    This module consists of a multi-head (self) attention followed by
+    position-wise feed-forward networks, both accompanied by
+    post_process_layer to add residual connection, layer normalization
+    and dropout.
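+    With the "dan" command each sublayer computes
+    LayerNorm(x + Dropout(Sublayer(x))).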
+ """ + attn_output = multi_head_attention(enc_input, enc_input, enc_input, + attn_bias, d_key, d_value, d_model, + n_head, dropout_rate) + attn_output = post_process_layer(enc_input, attn_output, "dan", + dropout_rate) + ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model) + return post_process_layer(attn_output, ffd_output, "dan", dropout_rate) + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0.): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. + """ + for i in range(n_layer): + enc_output = encoder_layer(enc_input, attn_bias, n_head, d_key, d_value, + d_model, d_inner_hid, dropout_rate) + enc_input = enc_output + return enc_output + + +def decoder_layer(dec_input, + enc_output, + slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + cache=None): + """ The layer to be stacked in decoder part. + The structure of this module is similar to that in the encoder part except + a multi-head attention is added to implement encoder-decoder attention. + """ + slf_attn_output = multi_head_attention( + dec_input, + dec_input, + dec_input, + slf_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, + cache, ) + slf_attn_output = post_process_layer( + dec_input, + slf_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + enc_attn_output = multi_head_attention( + slf_attn_output, + enc_output, + enc_output, + dec_enc_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, ) + enc_attn_output = post_process_layer( + slf_attn_output, + enc_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + ffd_output = positionwise_feed_forward( + enc_attn_output, + d_inner_hid, + d_model, ) + dec_output = post_process_layer( + enc_attn_output, + ffd_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + return dec_output -def get_model(): - avg_cost = transformer(use_feed=False) - optimizer = fluid.optimizer.Adam() - optimizer.minimize(avg_cost) - fluid.memory_optimize(fluid.default_main_program()) - return avg_cost +def decoder(dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + caches=None): + """ + The decoder is composed of a stack of identical decoder_layer layers. + """ + for i in range(n_layer): + cache = None + if caches is not None: + cache = caches[i] + + dec_output = decoder_layer( + dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + cache=cache) + dec_input = dec_output + return dec_output + + +def make_all_inputs(input_fields): + """ + Define the input data layers for the transformer model. 
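+    The shape, dtype and lod level of each layer are looked up from
+    input_descs by field name.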
+    """
+    inputs = []
+    for input_field in input_fields:
+        input_var = layers.data(
+            name=input_field,
+            shape=input_descs[input_field][0],
+            dtype=input_descs[input_field][1],
+            lod_level=input_descs[input_field][2]
+            if len(input_descs[input_field]) == 3 else 0,
+            append_batch_size=False)
+        inputs.append(input_var)
+    return inputs
+
+
+def transformer(
+        src_vocab_size,
+        trg_vocab_size,
+        max_length,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate,
+        weight_sharing,
+        label_smooth_eps, ):
+    if weight_sharing:
+        assert src_vocab_size == trg_vocab_size, (
+            "Vocabularies in source and target should be same for weight sharing."
+        )
+    enc_inputs = make_all_inputs(encoder_data_input_fields)
+
+    enc_output = wrap_encoder(
+        src_vocab_size,
+        max_length,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate,
+        weight_sharing,
+        enc_inputs, )
+
+    dec_inputs = make_all_inputs(decoder_data_input_fields[:-1])
+
+    predict = wrap_decoder(
+        trg_vocab_size,
+        max_length,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate,
+        weight_sharing,
+        dec_inputs,
+        enc_output, )
+
+    # The padding index does not contribute to the total loss. The weights
+    # are used to mask out the padding positions when calculating the loss.
+    label, weights = make_all_inputs(label_data_input_fields)
+    if label_smooth_eps:
+        label = layers.label_smooth(
+            label=layers.one_hot(
+                input=label, depth=trg_vocab_size),
+            epsilon=label_smooth_eps)
+
+    cost = layers.softmax_with_cross_entropy(
+        logits=layers.reshape(
+            predict, shape=[-1, trg_vocab_size]),
+        label=label,
+        soft_label=True if label_smooth_eps else False)
+    weighted_cost = cost * weights
+    sum_cost = layers.reduce_sum(weighted_cost)
+    token_num = layers.reduce_sum(weights)
+    avg_cost = sum_cost / token_num
+    avg_cost.stop_gradient = True
+    return sum_cost, avg_cost, predict, token_num
+
+
+def wrap_encoder(src_vocab_size,
+                 max_length,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 dropout_rate,
+                 weight_sharing,
+                 enc_inputs=None):
+    """
+    The wrapper assembles together all needed layers for the encoder.
+    """
+    if enc_inputs is None:
+        # This is used to implement an independent encoder program in inference.
+        src_word, src_pos, src_slf_attn_bias = \
+            make_all_inputs(encoder_data_input_fields)
+    else:
+        src_word, src_pos, src_slf_attn_bias = \
+            enc_inputs
+    enc_input = prepare_encoder(
+        src_word,
+        src_pos,
+        src_vocab_size,
+        d_model,
+        max_length,
+        dropout_rate,
+        word_emb_param_name=word_emb_param_names[0])
+    enc_output = encoder(enc_input, src_slf_attn_bias, n_layer, n_head, d_key,
+                         d_value, d_model, d_inner_hid, dropout_rate)
+    return enc_output
+
+
+def wrap_decoder(trg_vocab_size,
+                 max_length,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 dropout_rate,
+                 weight_sharing,
+                 dec_inputs=None,
+                 enc_output=None,
+                 caches=None):
+    """
+    The wrapper assembles together all needed layers for the decoder.
+    """
+    if dec_inputs is None:
+        # This is used to implement an independent decoder program in inference.
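+        # All decoder inputs, including the encoder output, are created as
+        # data layers here so the decoder can run as a standalone program.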
+        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
+            enc_output = make_all_inputs(
+            decoder_data_input_fields + decoder_util_input_fields)
+    else:
+        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
+
+    dec_input = prepare_decoder(
+        trg_word,
+        trg_pos,
+        trg_vocab_size,
+        d_model,
+        max_length,
+        dropout_rate,
+        word_emb_param_name=word_emb_param_names[0]
+        if weight_sharing else word_emb_param_names[1])
+    dec_output = decoder(
+        dec_input,
+        enc_output,
+        trg_slf_attn_bias,
+        trg_src_attn_bias,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate,
+        caches=caches)
+    # Return logits for training and probs for inference.
+    if weight_sharing:
+        predict = layers.matmul(
+            x=dec_output,
+            y=fluid.get_var(word_emb_param_names[0]),
+            transpose_y=True)
+    else:
+        predict = layers.fc(input=dec_output,
+                            size=trg_vocab_size,
+                            num_flatten_dims=2,
+                            param_attr=const_para_attr,
+                            bias_attr=const_bias_attr)
+    if dec_inputs is None:
+        predict = layers.softmax(predict)
+    return predict
+
+
+def fast_decode(
+        src_vocab_size,
+        trg_vocab_size,
+        max_in_len,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate,
+        weight_sharing,
+        beam_size,
+        max_out_len,
+        eos_idx, ):
+    """
+    Use beam search to decode. Caches store the states of history steps to
+    make the decoding faster.
+    """
+    enc_output = wrap_encoder(src_vocab_size, max_in_len, n_layer, n_head,
+                              d_key, d_value, d_model, d_inner_hid,
+                              dropout_rate, weight_sharing)
+    start_tokens, init_scores, trg_src_attn_bias = \
+        make_all_inputs(fast_decoder_data_input_fields)
+
+    def beam_search():
+        max_len = layers.fill_constant(
+            shape=[1], dtype=start_tokens.dtype, value=max_out_len)
+        step_idx = layers.fill_constant(
+            shape=[1], dtype=start_tokens.dtype, value=0)
+        cond = layers.less_than(x=step_idx, y=max_len)
+        while_op = layers.While(cond)
+        # arrays in which the states of each step will be stored.
+        ids = layers.array_write(
+            layers.reshape(start_tokens, (-1, 1)), step_idx)
+        scores = layers.array_write(init_scores, step_idx)
+        # cell states will be overwritten at each step.
+        # caches contains states of history steps to reduce redundant
+        # computation in decoder.
+        caches = [{
+            "k": layers.fill_constant_batch_size_like(
+                input=start_tokens,
+                shape=[-1, 0, d_model],
+                dtype=enc_output.dtype,
+                value=0),
+            "v": layers.fill_constant_batch_size_like(
+                input=start_tokens,
+                shape=[-1, 0, d_model],
+                dtype=enc_output.dtype,
+                value=0)
+        } for i in range(n_layer)]
+        with while_op.block():
+            pre_ids = layers.array_read(array=ids, i=step_idx)
+            pre_ids = layers.reshape(pre_ids, (-1, 1, 1))
+            pre_scores = layers.array_read(array=scores, i=step_idx)
+            # sequence_expand can gather sequences according to lod and thus
+            # can be used in beam search to sift out the states corresponding
+            # to the selected ids.
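+            # Expand the source-side states so that each surviving beam
+            # candidate keeps its own copy of the encoder output, attention
+            # bias and per-layer k/v caches.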
+            pre_src_attn_bias = layers.sequence_expand(
+                x=trg_src_attn_bias, y=pre_scores)
+            pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
+            pre_caches = [{
+                "k": layers.sequence_expand(
+                    x=cache["k"], y=pre_scores),
+                "v": layers.sequence_expand(
+                    x=cache["v"], y=pre_scores),
+            } for cache in caches]
+            pre_pos = layers.elementwise_mul(
+                x=layers.fill_constant_batch_size_like(
+                    input=pre_enc_output,  # can't use pre_ids here since it has lod
+                    value=1,
+                    shape=[-1, 1, 1],
+                    dtype=pre_ids.dtype),
+                y=layers.increment(
+                    x=step_idx, value=1.0, in_place=False),
+                axis=0)
+            logits = wrap_decoder(
+                trg_vocab_size,
+                max_in_len,
+                n_layer,
+                n_head,
+                d_key,
+                d_value,
+                d_model,
+                d_inner_hid,
+                dropout_rate,
+                weight_sharing,
+                dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias),
+                enc_output=pre_enc_output,
+                caches=pre_caches)
+            logits = layers.reshape(logits, (-1, trg_vocab_size))
+
+            topk_scores, topk_indices = layers.topk(
+                input=layers.softmax(logits), k=beam_size)
+            accu_scores = layers.elementwise_add(
+                x=layers.log(topk_scores),
+                y=layers.reshape(
+                    pre_scores, shape=[-1]),
+                axis=0)
+            # beam_search op uses lod to distinguish branches.
+            topk_indices = layers.lod_reset(topk_indices, pre_ids)
+            selected_ids, selected_scores = layers.beam_search(
+                pre_ids=pre_ids,
+                pre_scores=pre_scores,
+                ids=topk_indices,
+                scores=accu_scores,
+                beam_size=beam_size,
+                end_id=eos_idx)
+
+            layers.increment(x=step_idx, value=1.0, in_place=True)
+            # update states
+            layers.array_write(selected_ids, i=step_idx, array=ids)
+            layers.array_write(selected_scores, i=step_idx, array=scores)
+            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
+            layers.assign(pre_enc_output, enc_output)
+            for i in range(n_layer):
+                layers.assign(pre_caches[i]["k"], caches[i]["k"])
+                layers.assign(pre_caches[i]["v"], caches[i]["v"])
+            length_cond = layers.less_than(x=step_idx, y=max_len)
+            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
+            layers.logical_and(x=length_cond, y=finish_cond, out=cond)
+
+        finished_ids, finished_scores = layers.beam_search_decode(
+            ids, scores, beam_size=beam_size, end_id=eos_idx)
+        return finished_ids, finished_scores
+
+    finished_ids, finished_scores = beam_search()
+    return finished_ids, finished_scores
+
+
+def get_model(is_dist, is_async):
+    sum_cost, avg_cost, predict, token_num = transformer(
+        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
+        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
+        ModelHyperParams.n_head, ModelHyperParams.d_key,
+        ModelHyperParams.d_value, ModelHyperParams.d_model,
+        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
+        ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
+
+    local_lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
+                                               TrainTaskConfig.warmup_steps,
+                                               TrainTaskConfig.learning_rate)
+
+    if not is_dist:
+        optimizer = fluid.optimizer.Adam(
+            learning_rate=local_lr_scheduler.learning_rate,
+            beta1=TrainTaskConfig.beta1,
+            beta2=TrainTaskConfig.beta2,
+            epsilon=TrainTaskConfig.eps)
+        optimizer.minimize(sum_cost)
+    elif is_async:
+        optimizer = fluid.optimizer.SGD(0.003)
+        optimizer.minimize(sum_cost)
+    else:
+        lr_decay = fluid.layers\
+            .learning_rate_scheduler\
+            .noam_decay(ModelHyperParams.d_model,
+                        TrainTaskConfig.warmup_steps)
+
+        optimizer = fluid.optimizer.Adam(
+            learning_rate=lr_decay,
+            beta1=TrainTaskConfig.beta1,
+            beta2=TrainTaskConfig.beta2,
+            epsilon=TrainTaskConfig.eps)
+        optimizer.minimize(sum_cost)
+
+    return sum_cost, avg_cost, predict,
token_num, local_lr_scheduler def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): @@ -176,10 +1677,23 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): return t -class DistTransformer2x2(object): +def update_args(): + src_dict = DataReader.load_dict(TrainTaskConfig.src_vocab_fpath) + trg_dict = DataReader.load_dict(TrainTaskConfig.trg_vocab_fpath) + dict_args = [ + "src_vocab_size", str(len(src_dict)), "trg_vocab_size", + str(len(trg_dict)), "bos_idx", + str(src_dict[TrainTaskConfig.special_token[0]]), "eos_idx", + str(src_dict[TrainTaskConfig.special_token[1]]), "unk_idx", + str(src_dict[TrainTaskConfig.special_token[2]]) + ] + merge_cfg_from_list(dict_args, [TrainTaskConfig, ModelHyperParams]) + + +class DistTransformer2x2(TestDistRunnerBase): def run_pserver(self, pserver_endpoints, trainers, current_endpoint, - trainer_id): - get_model() + trainer_id, sync_mode): + get_model(True, not sync_mode) t = get_transpiler(trainer_id, fluid.default_main_program(), pserver_endpoints, trainers) @@ -196,7 +1710,6 @@ class DistTransformer2x2(object): while True: assert retry_times >= 0, "wait ps ready failed" time.sleep(3) - print("waiting ps ready: ", pid) try: # the listen_and_serv_op would touch a file which contains the listen port # on the /tmp directory until it was ready to process all the RPC call. @@ -205,63 +1718,35 @@ class DistTransformer2x2(object): except os.error: retry_times -= 1 - def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): - avg_cost = get_model() + def run_trainer(self, + place, + endpoints, + trainer_id, + trainers, + is_dist=True, + sync_mode=True): + + sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( + is_dist, not sync_mode) + if is_dist: t = get_transpiler(trainer_id, fluid.default_main_program(), endpoints, trainers) trainer_prog = t.get_trainer_program() + TrainTaskConfig.batch_size = 10 + TrainTaskConfig.train_file_pattern = TrainTaskConfig.data_path + "train.tok.clean.bpe.32000.en-de.train_{}".format( + trainer_id) else: + TrainTaskConfig.batch_size = 20 trainer_prog = fluid.default_main_program() startup_exe = fluid.Executor(place) - startup_exe.run(fluid.default_startup_program()) - - strategy = fluid.ExecutionStrategy() - strategy.num_threads = 1 - strategy.allow_op_delay = False - exe = fluid.ParallelExecutor( - True, loss_name=avg_cost.name, exec_strategy=strategy) - - first_loss, = exe.run(fetch_list=[avg_cost.name]) - print(first_loss) - for i in six.moves.xrange(5): - _ = exe.run(fetch_list=[avg_cost.name]) - last_loss, = exe.run(fetch_list=[avg_cost.name]) - print(last_loss) - - -def main(role="pserver", - endpoints="127.0.0.1:9123", - trainer_id=0, - current_endpoint="127.0.0.1:9123", - trainers=1, - is_dist=True): - - reader = paddle.batch( - wmt16.train(ModelHyperParams.src_vocab_size, - ModelHyperParams.trg_vocab_size), - batch_size=transformer_model.batch_size) - - with fluid.recordio_writer.create_recordio_writer( - WMT16_RECORDIO_FILE) as writer: - for batch in reader(): - for tensor in prepare_batch_input( - batch, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head): - t = fluid.LoDTensor() - t.set(tensor, fluid.CPUPlace()) - writer.append_tensor(t) - writer.complete_append_tensor() - - model = DistTransformer2x2() - if role == "pserver": - model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) - else: - p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - 
model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + + TrainTaskConfig.local = not is_dist + + train_loop(startup_exe, trainer_prog, 1, sum_cost, avg_cost, + local_lr_scheduler, token_num, predict) if __name__ == "__main__": @@ -269,18 +1754,6 @@ if __name__ == "__main__": print( "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]" ) - role = sys.argv[1] - endpoints = sys.argv[2] - trainer_id = int(sys.argv[3]) - current_endpoint = sys.argv[4] - trainers = int(sys.argv[5]) - is_dist = True if sys.argv[6] == "TRUE" else False - # FIXME(typhoonzero): refine this test. - is_async = True if sys.argv[7] == "TRUE" else False - main( - role=role, - endpoints=endpoints, - trainer_id=trainer_id, - current_endpoint=current_endpoint, - trainers=trainers, - is_dist=is_dist) + + update_args() + runtime_main(DistTransformer2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index 0ad994a258c04cabc807823b7d2a8ae8bb62ab2c..f3e740fc7027a4a562b836c3113b87d55062c185 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase): dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( - name='shared_w', initializer=fluid.initializer.Constant())) + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) embed_second = fluid.layers.embedding( input=words[1], size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( - name='shared_w', initializer=fluid.initializer.Constant())) + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) embed_third = fluid.layers.embedding( input=words[2], size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( - name='shared_w', initializer=fluid.initializer.Constant())) + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) embed_forth = fluid.layers.embedding( input=words[3], size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( - name='shared_w', initializer=fluid.initializer.Constant())) + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) concat_embed = fluid.layers.concat( input=[embed_first, embed_second, embed_third, embed_forth], @@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase): size=HIDDEN_SIZE, act='sigmoid', param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant())) + initializer=fluid.initializer.Constant(value=0.1))) predict_word = fluid.layers.fc( input=hidden1, size=dict_size, act='softmax', param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant())) + initializer=fluid.initializer.Constant(value=0.1))) cost = fluid.layers.cross_entropy( input=predict_word, label=words[4]) avg_cost = fluid.layers.mean(cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 9581abdf394d738470d32ae609838832077ee519..083525ccf54d389b60c4aaa9f8c6223f07c773cd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase): main.global_block().append_op( type="fetch_barrier", inputs={}, - outputs={}, + outputs={"Out": []}, attrs={ "endpoints": 
["127.0.0.1:{0}".format(port)], RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py index 62fcf5953f93637a20beed649de21476a8673419..a8e6ce4cfe18384e405f1602429628914d2c2e00 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -15,17 +15,55 @@ from __future__ import print_function import unittest +import paddle from test_dist_base import TestDistBase -class TestDistTransformer2x2(TestDistBase): +def download_files(): + url_prefix = 'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/' + vocab_url = url_prefix + 'vocab.bpe.32000' + vocab_md5 = 'a86d345ca6e27f6591d0dccb1b9be853' + paddle.dataset.common.download(vocab_url, 'test_dist_transformer', + vocab_md5) + + local_train_url = url_prefix + 'train.tok.clean.bpe.32000.en-de' + local_train_md5 = '033eb02b9449e6dd823f050782ac8914' + paddle.dataset.common.download(local_train_url, 'test_dist_transformer', + local_train_md5) + + train0_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_0' + train0_md5 = 'ddce7f602f352a0405267285379a38b1' + paddle.dataset.common.download(train0_url, 'test_dist_transformer', + train0_md5) + + train1_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_1' + train1_md5 = '8757798200180285b1a619cd7f408747' + paddle.dataset.common.download(train1_url, 'test_dist_transformer', + train1_md5) + + test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de' + test_md5 = '9dd74a266dbdb25314183899f269b4a2' + paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5) + + +class TestDistTransformer2x2Sync(TestDistBase): def _setup_config(self): self._sync_mode = True def test_transformer(self): - # TODO(paddle-dev): check if the delta is OK. 
- # Usually start around ~8000 and converge to ~5000 - self.check_with_place("dist_transformer.py", delta=400) + download_files() + #Note: loss on test dataset of the first 5 batch are: + # 10.518872, 10.518871, 10.518868, 10.518862, 10.518855 + self.check_with_place("dist_transformer.py", delta=1e-7) + + +class TestDistTransformer2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_transformer(self): + download_files() + self.check_with_place("dist_transformer.py", delta=1.0) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 38af149ad336fcb818c3cbc9c686bcbdf00238be..9a3e92e8d775a37e0c24ee1bcc5435628d61bb91 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -22,7 +22,7 @@ class TestDistSeResneXt2x2(TestDistBase): self._sync_mode = True def test_se_resnext(self): - self.check_with_place("dist_word2vec.py", delta=1e-7) + self.check_with_place("dist_word2vec.py", delta=1e-4) class TestDistSeResneXt2x2Async(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index d84ebed3fac67db323392494c701cf2a51b28305..1bb4662e8d83ac0c34b209e4e7a605869fdb59d5 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -20,41 +20,50 @@ import math from op_test import OpTest -def quantize_max_abs(x, num_bits): - range = math.pow(2, num_bits) - 1 +def quantize_max_abs(x, max_range): scale = np.max(np.abs(x).flatten()) - y = np.round(x / scale * range) + y = np.round(x / scale * max_range) return y, scale -def dequantize_max_abs(x, num_bits, scale): - range = math.pow(2, num_bits) - 1 - y = (scale / range) * x +def dequantize_max_abs(x, scale, max_range): + y = (scale / max_range) * x return y class TestFakeDequantizeMaxAbsOp(OpTest): def set_args(self): self.num_bits = 8 + self.max_range = math.pow(2, self.num_bits - 1) - 1 + self.data_type = "float32" def setUp(self): self.set_args() self.op_type = "fake_dequantize_max_abs" - x = np.random.randn(31, 65).astype("float32") - yq, scale = quantize_max_abs(x, self.num_bits) - ydq = dequantize_max_abs(yq, self.num_bits, scale) + x = np.random.randn(31, 65).astype(self.data_type) + yq, scale = quantize_max_abs(x, self.max_range) + ydq = dequantize_max_abs(yq, scale, self.max_range) - self.inputs = {'X': yq} - self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)} + self.inputs = {'X': yq, 'Scale': np.array(scale).astype(self.data_type)} + self.attrs = {'max_range': self.max_range} self.outputs = {'Out': ydq} def test_check_output(self): self.check_output() -class TestFakeDequantizeMaxAbsOp5Bits(OpTest): +class TestFakeDequantizeMaxAbsOpDouble(TestFakeDequantizeMaxAbsOp): + def set_args(self): + self.num_bits = 8 + self.max_range = math.pow(2, self.num_bits - 1) - 1 + self.data_type = "float64" + + +class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp): def set_args(self): self.num_bits = 5 + self.max_range = math.pow(2, self.num_bits - 1) - 1 + self.data_type = "float32" if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py new file mode 100644 index 
0000000000000000000000000000000000000000..764f83b534c8a183dbf21511f0b05741c13c9528 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py @@ -0,0 +1,133 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import math +from op_test import OpTest +from test_gru_op import gru +from test_fusion_lstm_op import fc, ACTIVATION + + +def fusion_gru( + x, # T x M + lod, # 1 x N + h0, # N x D + wx, # M x 3D + wh, # D x 3D + bias, # 1 x 3D + is_reverse, + act_state, + act_gate): + return gru(fc(x, wx, bias), + lod, + h0, + wh, + np.zeros( + (1, wh.shape[1]), dtype='float64'), + is_reverse, + act_state, + act_gate) + + +class TestFusionGRUOp(OpTest): + def set_confs(self): + pass + + def setUp(self): + self.op_type = "fusion_gru" + self.lod = [[2, 4, 3]] + self.M = 3 + self.D = 5 + self.is_reverse = False + self.with_h0 = True + self.with_bias = True + self.act_state = 'tanh' + self.act_gate = 'sigmoid' + self.set_confs() + + T = sum(self.lod[0]) + N = len(self.lod[0]) + + x = np.random.rand(T, self.M).astype('float64') + wx = np.random.rand(self.M, 3 * self.D).astype('float64') + wh = np.random.rand(self.D, 3 * self.D).astype('float64') + bias = np.random.rand( + 1, 3 * self.D).astype('float64') if self.with_bias else np.zeros( + (1, 3 * self.D), dtype='float64') + h0 = np.random.rand( + N, self.D).astype('float64') if self.with_h0 else np.zeros( + (N, self.D), dtype='float64') + + _, _, _, hidden = fusion_gru( + x, self.lod, h0, wx, wh, bias, self.is_reverse, + ACTIVATION[self.act_state], ACTIVATION[self.act_gate]) + + self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh} + + if self.with_bias: + self.inputs['Bias'] = bias + + if self.with_h0: + self.inputs['H0'] = h0 + + self.outputs = {'Hidden': (hidden, self.lod)} + + self.attrs = { + 'activation': self.act_state, + 'gate_activation': self.act_gate, + 'is_reverse': self.is_reverse + } + + def test_check_output(self): + self.check_output(atol=1e-8) + + +class TestFusionGRUOpNoInitial(TestFusionGRUOp): + def set_confs(self): + self.with_h0 = False + + +class TestFusionGRUOpNoBias(TestFusionGRUOp): + def set_confs(self): + self.with_bias = False + + +class TestFusionGRUOpReverse(TestFusionGRUOp): + def set_confs(self): + self.is_reverse = True + + +class TestFusionGRUOpMD1(TestFusionGRUOp): + def set_confs(self): + self.M = 36 + self.D = 8 + + +class TestFusionGRUOpMD2(TestFusionGRUOp): + def set_confs(self): + self.M = 8 + self.D = 8 + + +class TestFusionGRUOpBS1(TestFusionGRUOp): + def set_confs(self): + self.lod = [[3]] + self.D = 16 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py index 9d8bef677fd16fb6bdc20b929137b4d885f4efd1..5805bdf461998e90611dec05b079cd55feda520d 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py @@ -43,13 +43,13 @@ def fusion_lstm( act_cell, act_cand) -class TestLstmOp(OpTest): - def set_argument(self): - self.lod = [[2, 3, 2]] +class TestFusionLSTMOp(OpTest): + def set_conf(self): + pass def setUp(self): self.op_type = 'fusion_lstm' - self.lod = [[2, 3, 2]] + self.lod = [[2, 3, 5, 4]] self.M = 8 self.D = 16 self.has_initial_state = False @@ -58,33 +58,33 @@ class TestLstmOp(OpTest): self.act_cell = 'tanh' self.act_cand = 'tanh' self.use_peepholes = False - self.set_argument() + self.set_conf() T = sum(self.lod[0]) bs = len(self.lod[0]) - x = np.random.normal(size=(T, self.M)).astype('float64') + x = np.random.normal(size=(T, self.M)).astype('float32') if self.has_initial_state: - h0 = np.random.normal(size=(bs, self.D)).astype('float64') - c0 = np.random.normal(size=(bs, self.D)).astype('float64') + h0 = np.random.normal(size=(bs, self.D)).astype('float32') + c0 = np.random.normal(size=(bs, self.D)).astype('float32') else: - h0 = np.zeros((bs, self.D)).astype('float64') - c0 = np.zeros((bs, self.D)).astype('float64') + h0 = np.zeros((bs, self.D)).astype('float32') + c0 = np.zeros((bs, self.D)).astype('float32') - wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float64') + wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32') if self.use_peepholes: - b = np.random.normal(size=(1, 7 * self.D)).astype('float64') + b = np.random.normal(size=(1, 7 * self.D)).astype('float32') else: - b = np.random.normal(size=(1, 4 * self.D)).astype('float64') + b = np.random.normal(size=(1, 4 * self.D)).astype('float32') w_b = np.copy(b[:, 0:4 * self.D]) w_c = b[:, 4 * self.D:] if self.use_peepholes else None # this is the weight of fc - wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64') + wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32') # this is the bias of fc # and it should be manually added into the bias of this fusion LSTM - bx = np.random.normal(size=(1, 4 * self.D)).astype('float64') + bx = np.random.normal(size=(1, 4 * self.D)).astype('float32') b[0, 0:4 * self.D] += bx[0, :] h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c, self.is_reverse, ACTIVATION[self.act_gate], @@ -114,35 +114,45 @@ class TestLstmOp(OpTest): } def test_check_output(self): - self.check_output(atol=1e-8) + self.check_output() -class TestLstmOpInitReverse(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpInit(TestFusionLSTMOp): + def set_conf(self): + self.has_initial_state = True + + +class TestFusionLSTMOpReverse(TestFusionLSTMOp): + def set_conf(self): + self.is_reverse = True + + +class TestFusionLSTMOpInitReverse(TestFusionLSTMOp): + def set_conf(self): self.has_initial_state = True self.is_reverse = True -class TestLstmOpMD1(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpMD1(TestFusionLSTMOp): + def set_conf(self): self.M = 36 self.D = 8 -class TestLstmOpMD2(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpMD2(TestFusionLSTMOp): + def set_conf(self): self.M = 8 self.D = 8 -class TestLstmOpMD3(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpMD3(TestFusionLSTMOp): + def set_conf(self): self.M = 15 self.D = 3 -class TestLstmOpBS1(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpBS1(TestFusionLSTMOp): + def set_conf(self): self.lod = [[3]] self.D = 16 diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py new file mode 
100644 index 0000000000000000000000000000000000000000..aeee3a9999a94b4979fc3793150101352e50be85 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py @@ -0,0 +1,139 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_fusion_lstm_op import fc, ACTIVATION + + +def fusion_seqexpand_concat_fc(xs, lod, w, b, fc_act): + + T = sum(lod[0]) + N = len(lod[0]) + num_inputs = len(xs) + D = w.shape[1] + + expanded_inputs = [xs[0]] + for i in range(num_inputs - 1): + x = xs[i + 1] + assert x.shape[0] == N + expanded = np.repeat(x, lod[0], axis=0) + assert expanded.shape[0] == T + assert expanded.shape[1] == x.shape[1] + expanded_inputs.append(expanded) + + fc_input = np.concatenate(expanded_inputs, axis=1) + assert fc_input.shape[0] == T + assert fc_input.shape[1] == w.shape[0] + fc_out = fc(fc_input, w, b) + fc_out = fc_act(fc_out) + assert fc_out.shape[0] == T + assert fc_out.shape[1] == D + return fc_out + + +class TestFusionSeqExpandConcatFCOp(OpTest): + def set_conf(self): + pass + + def setUp(self): + self.op_type = 'fusion_seqexpand_concat_fc' + self.lod = [[3, 5, 8, 2]] + self.inputs_M = [15, 10, 10] + self.D = 20 + self.with_bias = True + self.fc_act = 'relu' + self.set_conf() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + num_inputs = len(self.inputs_M) + + x0 = np.random.normal(size=(T, self.inputs_M[0])).astype('float32') + xs = [x0] + for i in range(num_inputs - 1): + xi = np.random.normal(size=(bs, + self.inputs_M[i + 1])).astype('float32') + xs.append(xi) + + # fc weight and bias + w = np.random.normal(size=(sum(self.inputs_M), + self.D)).astype('float32') + b = np.random.normal(size=( + 1, self.D)).astype('float32') if self.with_bias else np.zeros( + (1, self.D)).astype('float32') + + out = fusion_seqexpand_concat_fc(xs, self.lod, w, b, + ACTIVATION[self.fc_act]) + + self.inputs = {'X': [('x0', (x0, self.lod))], 'FCWeight': w} + normal_lod = [[1] * bs] + for i in range(num_inputs - 1): + self.inputs['X'].append(('x%d' % (i + 1), (xs[i + 1], normal_lod))) + + if self.with_bias: + self.inputs['FCBias'] = b + + self.outputs = {'Out': (out, self.lod)} + self.attrs = {'fc_activation': self.fc_act} + + def test_check_output(self): + self.check_output() + + +class TestFusionSECFCOpNonBias(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.with_bias = False + + +class TestFusionSECFCOpNonAct(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.fc_act = 'identity' + + +class TestFusionSECFCOpMD1(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.inputs_M = [3, 4, 2, 1, 5] + self.D = 8 + + +class TestFusionSECFCOpMD2(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[5, 6]] + self.inputs_M = [1, 1] + + +class TestFusionSECFCOpBS1_1(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[1]] + self.inputs_M = [3, 4, 2] + 
+
+class TestFusionSECFCOpBS1_2(TestFusionSeqExpandConcatFCOp):
+    def set_conf(self):
+        self.lod = [[1]]
+        self.inputs_M = [3, 4]
+
+
+class TestFusionSECFCOpBS1_3(TestFusionSeqExpandConcatFCOp):
+    def set_conf(self):
+        self.lod = [[5]]
+        self.inputs_M = [6, 3]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals.py b/python/paddle/fluid/tests/unittests/test_generate_proposals.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fbd2ce95a4f22b91cd4955f914e12f422b0ee83
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals.py
@@ -0,0 +1,320 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import math
+import paddle.fluid as fluid
+from op_test import OpTest
+from test_multiclass_nms_op import nms
+from test_anchor_generator_op import anchor_generator_in_python
+import copy
+
+
+def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors,
+                                 variances, pre_nms_topN, post_nms_topN,
+                                 nms_thresh, min_size, eta):
+    all_anchors = anchors.reshape(-1, 4)
+    rois = np.empty((0, 5), dtype=np.float32)
+    roi_probs = np.empty((0, 1), dtype=np.float32)
+
+    rpn_rois = []
+    rpn_roi_probs = []
+    lod = []
+    num_images = scores.shape[0]
+    for img_idx in range(num_images):
+        img_i_boxes, img_i_probs = proposal_for_one_image(
+            im_info[img_idx, :], all_anchors, variances,
+            bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
+            pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta)
+        lod.append(img_i_probs.shape[0])
+        rpn_rois.append(img_i_boxes)
+        rpn_roi_probs.append(img_i_probs)
+
+    return rpn_rois, rpn_roi_probs, lod
+
+
+def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores,
+                           pre_nms_topN, post_nms_topN, nms_thresh, min_size,
+                           eta):
+    # Transpose and reshape predicted bbox transformations to get them
+    # into the same order as the anchors:
+    # - bbox deltas will be (4 * A, H, W) format from conv output
+    # - transpose to (H, W, 4 * A)
+    # - reshape to (H * W * A, 4) where rows are ordered by (H, W, A)
+    #   in slowest to fastest order to match the enumerated anchors
+    bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape(-1, 4)
+    all_anchors = all_anchors.reshape(-1, 4)
+    variances = variances.reshape(-1, 4)
+    # Same story for the scores:
+    # - scores are (A, H, W) format from conv output
+    # - transpose to (H, W, A)
+    # - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
+    #   to match the order of anchors and bbox_deltas
+    scores = scores.transpose((1, 2, 0)).reshape(-1, 1)
+
+    # sort all (proposal, score) pairs by score from highest to lowest
+    # take top pre_nms_topN (e.g. 6000)
+    if pre_nms_topN <= 0 or pre_nms_topN >= len(scores):
+        order = np.argsort(-scores.squeeze())
+    else:
+        # Avoid sorting possibly large arrays; first partition to get the
+        # top K unsorted and then sort just those.
+        inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN]
+        order = np.argsort(-scores[inds].squeeze())
+        order = inds[order]
+    scores = scores[order, :]
+    bbox_deltas = bbox_deltas[order, :]
+    all_anchors = all_anchors[order, :]
+    proposals = box_coder(all_anchors, bbox_deltas, variances)
+    # clip proposals to image (may result in proposals with zero area
+    # that will be removed in the next step)
+    proposals = clip_tiled_boxes(proposals, im_info[:2])
+    # remove predicted boxes with height or width < min_size
+    keep = filter_boxes(proposals, min_size, im_info)
+    proposals = proposals[keep, :]
+    scores = scores[keep, :]
+
+    # apply loose nms (e.g. threshold = 0.7)
+    # take post_nms_topN (e.g. 1000)
+    # return the top proposals
+    if nms_thresh > 0:
+        keep = nms(boxes=proposals,
+                   scores=scores,
+                   nms_threshold=nms_thresh,
+                   eta=eta)
+        if post_nms_topN > 0 and post_nms_topN < len(keep):
+            keep = keep[:post_nms_topN]
+        proposals = proposals[keep, :]
+        scores = scores[keep, :]
+
+    return proposals, scores
+
+
+def box_coder(all_anchors, bbox_deltas, variances):
+    """
+    Decode proposals by anchors and bbox_deltas from RPN
+    """
+    # proposals: xmin, ymin, xmax, ymax
+    proposals = np.zeros_like(bbox_deltas, dtype=np.float32)
+
+    # anchor_loc: width, height, center_x, center_y
+    anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)
+
+    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0]
+    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1]
+    anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2
+    anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2
+
+    # predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height
+    pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32)
+    if variances is not None:
+        for i in range(bbox_deltas.shape[0]):
+            pred_bbox[i, 0] = variances[i, 0] * bbox_deltas[i, 0] * anchor_loc[
+                i, 0] + anchor_loc[i, 2]
+            pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[
+                i, 1] + anchor_loc[i, 3]
+            pred_bbox[i, 2] = math.exp(variances[i, 2] *
+                                       bbox_deltas[i, 2]) * anchor_loc[i, 0]
+            pred_bbox[i, 3] = math.exp(variances[i, 3] *
+                                       bbox_deltas[i, 3]) * anchor_loc[i, 1]
+    else:
+        for i in range(bbox_deltas.shape[0]):
+            pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[
+                i, 2]
+            pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[
+                i, 3]
+            pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0]
+            pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1]
+
+    proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
+    proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
+    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2
+    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2
+
+    return proposals
+
+
+def clip_tiled_boxes(boxes, im_shape):
+    """Clip boxes to image boundaries.
im_shape is [height, width] and boxes + has shape (N, 4 * num_tiled_boxes).""" + assert boxes.shape[1] % 4 == 0, \ + 'boxes.shape[1] is {:d}, but must be divisible by 4.'.format( + boxes.shape[1] + ) + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + + +def filter_boxes(boxes, min_size, im_info): + """Only keep boxes with both sides >= min_size and center within the image. + """ + # Scale min_size to match image scale + min_size *= im_info[2] + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + x_ctr = boxes[:, 0] + ws / 2. + y_ctr = boxes[:, 1] + hs / 2. + keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) & + (y_ctr < im_info[0]))[0] + return keep + + +def iou(box_a, box_b): + """ + Apply intersection-over-union overlap between box_a and box_b + """ + xmin_a = min(box_a[0], box_a[2]) + ymin_a = min(box_a[1], box_a[3]) + xmax_a = max(box_a[0], box_a[2]) + ymax_a = max(box_a[1], box_a[3]) + + xmin_b = min(box_b[0], box_b[2]) + ymin_b = min(box_b[1], box_b[3]) + xmax_b = max(box_b[0], box_b[2]) + ymax_b = max(box_b[1], box_b[3]) + + area_a = (ymax_a - ymin_a + 1) * (xmax_a - xmin_a + 1) + area_b = (ymax_b - ymin_b + 1) * (xmax_b - xmin_b + 1) + if area_a <= 0 and area_b <= 0: + return 0.0 + + xa = max(xmin_a, xmin_b) + ya = max(ymin_a, ymin_b) + xb = min(xmax_a, xmax_b) + yb = min(ymax_a, ymax_b) + + inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) + + iou_ratio = inter_area / (area_a + area_b - inter_area) + + return iou_ratio + + +def nms(boxes, scores, nms_threshold, eta=1.0): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + nms_threshold: (float) The overlap thresh for suppressing unnecessary + boxes. + eta: (float) The parameter for adaptive NMS. + Return: + The indices of the kept boxes with respect to num_priors. 
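+    Note:
+        When eta < 1, the overlap threshold is multiplied by eta after each
+        kept box while it remains above 0.5 (adaptive NMS).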
+ """ + all_scores = copy.deepcopy(scores) + all_scores = all_scores.flatten() + + sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort') + sorted_scores = all_scores[sorted_indices] + selected_indices = [] + adaptive_threshold = nms_threshold + for i in range(sorted_scores.shape[0]): + idx = sorted_indices[i] + keep = True + for k in range(len(selected_indices)): + if keep: + kept_idx = selected_indices[k] + overlap = iou(boxes[idx], boxes[kept_idx]) + keep = True if overlap <= adaptive_threshold else False + else: + break + if keep: + selected_indices.append(idx) + if keep and eta < 1 and adaptive_threshold > 0.5: + adaptive_threshold *= eta + return selected_indices + + +class TestGenerateProposalsOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = { + 'Scores': self.scores, + 'BboxDeltas': self.bbox_deltas, + 'ImInfo': self.im_info.astype(np.float32), + 'Anchors': self.anchors, + 'Variances': self.variances + } + + self.attrs = { + 'pre_nms_topN': self.pre_nms_topN, + 'post_nms_topN': self.post_nms_topN, + 'nms_thresh': self.nms_thresh, + 'min_size': self.min_size, + 'eta': self.eta + } + + print("lod = ", self.lod) + self.outputs = { + 'RpnRois': (self.rpn_rois[0], [self.lod]), + 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]) + } + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "generate_proposals" + self.set_data() + + def init_test_params(self): + self.pre_nms_topN = 12000 # train 12000, test 2000 + self.post_nms_topN = 5000 # train 6000, test 1000 + self.nms_thresh = 0.7 + self.min_size = 3.0 + self.eta = 0.8 + + def init_test_input(self): + batch_size = 1 + input_channels = 20 + layer_h = 16 + layer_w = 16 + input_feat = np.random.random( + (batch_size, input_channels, layer_h, layer_w)).astype('float32') + self.anchors, self.variances = anchor_generator_in_python( + input_feat=input_feat, + anchor_sizes=[16., 32.], + aspect_ratios=[0.5, 1.0], + variances=[1.0, 1.0, 1.0, 1.0], + stride=[16.0, 16.0], + offset=0.5) + self.im_info = np.array([[64., 64., 8.]]) #im_height, im_width, scale + num_anchors = self.anchors.shape[2] + self.scores = np.random.random( + (batch_size, num_anchors, layer_h, layer_w)).astype('float32') + self.bbox_deltas = np.random.random( + (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32') + + def init_test_output(self): + self.rpn_rois, self.rpn_roi_probs, self.lod = generate_proposals_in_python( + self.scores, self.bbox_deltas, self.im_info, self.anchors, + self.variances, self.pre_nms_topN, self.post_nms_topN, + self.nms_thresh, self.min_size, self.eta) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 001fd7efb159e60bdf3cd0698d85dea90ad71616..9f6f03f9cfe3c505a7b1227e2b20db3c3c84c745 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -19,22 +19,19 @@ import numpy as np import math import functools from op_test import OpTest -from test_lstm_op import identity, sigmoid, tanh, relu - - -class TestGRUOp(OpTest): - lod = [[2, 4, 3]] - batch_size = sum(lod[0]) - frame_size = 5 - activate = { - 'identity': identity, - 'sigmoid': sigmoid, - 'tanh': tanh, - 'relu': relu - } - - @staticmethod - def seq_to_batch(lod, is_reverse): +from test_lstm_op import ACTIVATION + + +def gru( + input, # T x 3D + lod, # 1 x N + h0, # N x D + 
weight,  # D x 3D
+        bias,  # 1 x 3D
+        is_reverse,
+        act_state,
+        act_gate):
+    def _seq_to_batch(lod, is_reverse):
         idx_in_seq_list = []
         seq_lens = lod[0]
         seq_starts = [0]
@@ -56,121 +53,125 @@ class TestGRUOp(OpTest):
             idx_in_seq_list.append(idx_in_seq)
         return idx_in_seq_list, sorted_seqs
 
-    def gru_step(self, x, h_p, w, b):
-        batch_size = x.shape[0]
-        frame_size = w.shape[0]
-        g = x + np.tile(b, (batch_size, 1))
-        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
-            (frame_size, frame_size * 2))
-        u_r = self.activate[self.attrs['gate_activation']](np.dot(
-            h_p, w_u_r) + g[:, :frame_size * 2])
-        u = u_r[:, :frame_size]
-        r = u_r[:, frame_size:frame_size * 2]
+    def _step(x, h_p, w, b, act_state, act_gate):
+        T = x.shape[0]
+        D = w.shape[0]
+        g = x + np.tile(b, (T, 1))
+        w_u_r = w.flatten()[:D * D * 2].reshape((D, D * 2))
+        u_r = act_gate(np.dot(h_p, w_u_r) + g[:, :D * 2])
+        u = u_r[:, :D]
+        r = u_r[:, D:D * 2]
         r_h_p = r * h_p
-        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
-            (frame_size, frame_size))
-        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
-                                                    g[:, frame_size * 2:])
+        w_c = w.flatten()[D * D * 2:].reshape((D, D))
+        c = act_state(np.dot(r_h_p, w_c) + g[:, D * 2:])
         g = np.hstack((u_r, c))
         h = u * c + (1 - u) * h_p
         return g, r_h_p, h
 
-    def gru(self):
-        input, lod = self.inputs['Input']
-        w = self.inputs['Weight']
-        b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros(
-            (1, self.frame_size * 3))
-        batch_gate = self.outputs['BatchGate']
-        batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
-        batch_hidden = self.outputs['BatchHidden']
-        hidden = self.outputs['Hidden']
-        idx_in_seq_list = self.idx_in_seq_list
-        h_p = self.inputs['H0'][
-            self.sorted_seqs] if 'H0' in self.inputs else np.zeros(
-                (len(idx_in_seq_list[0]), self.frame_size))
-        num_batch = len(idx_in_seq_list)
-        end_idx = 0
-        for batch_idx in range(num_batch):
-            x = input[idx_in_seq_list[batch_idx]]
-            g, r_h_p, h = self.gru_step(x, h_p, w, b)
-            if batch_idx < (num_batch - 1):
-                h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
-            start_idx = end_idx
-            end_idx = start_idx + len(idx_in_seq_list[batch_idx])
-            batch_gate[start_idx:end_idx] = g
-            batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
-            batch_hidden[start_idx:end_idx] = h
-            hidden[idx_in_seq_list[batch_idx]] = h
-        return batch_gate, batch_reset_hidden_prev, hidden
-
-    def set_data(self):
-        lod = self.lod
-        self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch(
-            lod, self.is_reverse)
-        batch_size = self.batch_size
-        frame_size = self.frame_size
-        input = np.random.rand(batch_size, frame_size * 3).astype('float64')
-        h0 = np.random.rand(len(self.idx_in_seq_list[0]),
-                            frame_size).astype('float64')
-        weight = np.random.rand(frame_size, frame_size * 3).astype('float64')
-        bias = np.random.rand(1, frame_size * 3).astype('float64')
-
-        self.inputs = {
-            'Input': (input, lod),
-            'H0': h0,
-            'Weight': weight,
-            'Bias': bias
-        }
+    T = sum(lod[0])
+    N = len(lod[0])
+    D = weight.shape[0]
+    batch_gate = np.zeros((T, 3 * D), dtype='float64')
+    batch_reset_hidden_prev = np.zeros((T, D), dtype='float64')
+    batch_hidden = np.zeros((T, D), dtype='float64')
+    hidden = np.zeros((T, D), dtype='float64')
+
+    idx_in_seq_list, sorted_seqs = _seq_to_batch(lod, is_reverse)
+    h_p = h0[sorted_seqs]
+    max_seq_len = len(idx_in_seq_list)
+    assert len(idx_in_seq_list[0]) == N
+    end_idx = 0
+    for batch_idx in range(max_seq_len):
+        x = input[idx_in_seq_list[batch_idx]]
+        g, r_h_p, h = _step(x, h_p, weight, bias, act_state, act_gate)
+        if batch_idx < (max_seq_len - 1):
+            h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
+        start_idx = end_idx
+        end_idx = start_idx + len(idx_in_seq_list[batch_idx])
+        batch_gate[start_idx:end_idx] = g
+        batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
+        batch_hidden[start_idx:end_idx] = h
+        hidden[idx_in_seq_list[batch_idx]] = h
+    return batch_gate, batch_reset_hidden_prev, batch_hidden, hidden
 
-        self.outputs = {
-            'BatchGate': np.zeros(
-                (batch_size, frame_size * 3), dtype='float64'),
-            'BatchResetHiddenPrev': np.zeros(
-                (batch_size, frame_size), dtype='float64'),
-            'BatchHidden': np.zeros(
-                (batch_size, frame_size), dtype='float64'),
-            'Hidden': np.zeros(
-                (batch_size, frame_size), dtype='float64')
-        }
 
+class TestGRUOp(OpTest):
     def set_confs(self):
-        self.is_reverse = False
-        self.attrs = {
-            'activation': 'tanh',
-            'gate_activation': 'sigmoid',
-            'is_reverse': self.is_reverse
-        }
+        pass
 
     def setUp(self):
         self.op_type = "gru"
+        self.lod = [[2, 4, 3]]
+        self.D = 5
+        self.is_reverse = False
+        self.with_h0 = True
+        self.with_bias = True
+        self.act_state = 'tanh'
+        self.act_gate = 'sigmoid'
         self.set_confs()
-        self.set_data()
-        self.gru()
+
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
+
+        input = np.random.rand(T, 3 * self.D).astype('float64')
+        weight = np.random.rand(self.D, 3 * self.D).astype('float64')
+        bias = np.random.rand(
+            1, 3 * self.D).astype('float64') if self.with_bias else np.zeros(
+                (1, 3 * self.D), dtype='float64')
+        h0 = np.random.rand(
+            N, self.D).astype('float64') if self.with_h0 else np.zeros(
+                (N, self.D), dtype='float64')
+
+        batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru(
+            input, self.lod, h0, weight, bias, self.is_reverse,
+            ACTIVATION[self.act_state], ACTIVATION[self.act_gate])
+        self.inputs = {'Input': (input, self.lod), 'Weight': weight}
+
+        if self.with_bias:
+            self.inputs['Bias'] = bias
+
+        if self.with_h0:
+            self.inputs['H0'] = h0
+
+        self.outputs = {
+            'Hidden': (hidden, self.lod),
+            'BatchGate': batch_gate,
+            'BatchResetHiddenPrev': batch_reset_hidden_prev,
+            'BatchHidden': batch_hidden,
+        }
+
+        self.attrs = {
+            'activation': self.act_state,
+            'gate_activation': self.act_gate,
+            'is_reverse': self.is_reverse
+        }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(atol=1e-8)
 
     def test_check_grad(self):
         self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
 
 
 class TestGRUOpNoInitial(TestGRUOp):
-    def set_data(self):
-        super(TestGRUOpNoInitial, self).set_data()
-        self.inputs.pop('H0')
+    def set_confs(self):
+        self.with_h0 = False
 
     def test_check_grad(self):
         self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])
 
 
+class TestGRUOpNoBias(TestGRUOp):
+    def set_confs(self):
+        self.with_bias = False
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'H0', 'Weight'], ['Hidden'])
+
+
 class TestGRUOpReverse(TestGRUOp):
     def set_confs(self):
         self.is_reverse = True
-        self.attrs = {
-            'activation': 'tanh',
-            'gate_activation': 'sigmoid',
-            'is_reverse': self.is_reverse
-        }
 
 
 if __name__ == "__main__":
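# [Editor's note] A minimal numpy sketch of the single GRU step that the
# module-level reference `gru()` above applies batch by batch: with D hidden
# units, the update/reset gate weights occupy the first two D x D blocks of
# `w` and the candidate weights the last block. Illustrative only; the helper
# names below are not part of the patch.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step_ref(x, h_prev, w, b):
    # x: (batch, 3D) projected input, h_prev: (batch, D), w: (D, 3D), b: (1, 3D)
    D = w.shape[0]
    g = x + b  # broadcast the bias over the batch
    w_u_r = w.flatten()[:D * D * 2].reshape((D, D * 2))
    u_r = sigmoid(np.dot(h_prev, w_u_r) + g[:, :D * 2])
    u, r = u_r[:, :D], u_r[:, D:]  # update and reset gates
    w_c = w.flatten()[D * D * 2:].reshape((D, D))
    c = np.tanh(np.dot(r * h_prev, w_c) + g[:, D * 2:])  # candidate state
    return u * c + (1 - u) * h_prev  # new hidden state

# e.g. gru_step_ref(np.zeros((4, 15)), np.zeros((4, 5)),
#                   np.zeros((5, 15)), np.zeros((1, 15))).shape == (4, 5)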
diff --git a/python/paddle/fluid/tests/unittests/test_pad_constant_like.py b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b733fd8fa023f07013909502dbbd5371297216e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestPadConstantLikeOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "pad_constant_like"
+        self.inputs = {
+            'X': np.random.random(self.x_shape).astype("float32"),
+            'Y': np.random.random(self.y_shape).astype("float32")
+        }
+        self.attrs = {}
+        self.attrs['pad_value'] = self.pad_value
+        self.outputs = {
+            'Out': np.pad(self.inputs['Y'],
+                          self.paddings,
+                          mode='constant',
+                          constant_values=self.pad_value)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['Y'], 'Out', max_relative_error=0.006)
+
+    def initTestCase(self):
+        self.x_shape = (16, 16)
+        self.y_shape = (3, 16)
+        self.pad_value = 0.1
+        self.paddings = [(0, 13), (0, 0)]
+
+
+class TestCase1(TestPadConstantLikeOp):
+    def initTestCase(self):
+        self.x_shape = (4, 3, 4, 4)
+        self.y_shape = (2, 3, 4, 4)
+        self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)]
+        self.pad_value = 0.5
+
+
+class TestCase2(TestPadConstantLikeOp):
+    def initTestCase(self):
+        self.x_shape = (4, 3, 4, 4)
+        self.y_shape = (2, 3, 2, 4)
+        self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)]
+        self.pad_value = 0.5
+
+
+if __name__ == '__main__':
+    unittest.main()
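# [Editor's note] The test_pad_constant_like.py cases above fix `paddings` by
# hand; the relation they encode is simply "pad Y at the tail of every
# dimension up to X's shape". A minimal numpy sketch of that semantics
# (illustrative only, not part of the patch):
import numpy as np

def pad_constant_like_ref(x, y, pad_value=0.0):
    # Out keeps X's shape and holds Y's data, tail-padded with pad_value.
    paddings = [(0, xd - yd) for xd, yd in zip(x.shape, y.shape)]
    return np.pad(y, paddings, mode='constant', constant_values=pad_value)

# e.g. with x of shape (16, 16) and y of shape (3, 16), the output has shape
# (16, 16): rows 0..2 are y, rows 3..15 are pad_value, matching
# paddings = [(0, 13), (0, 0)] in TestPadConstantLikeOp above.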
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index ac682d6181cfcc5a064a51a736b03d493c37b780..8097b5f734343ca97c131474338ed1cd60eefc85 100644
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -35,9 +35,8 @@ class TestPrintOpCPU(unittest.TestCase):
     def build_network(self, only_forward, **kargs):
         x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
         x.stop_gradient = False
-        printed = layers.Print(input=x, **kargs)
-        if only_forward: return printed
-        loss = layers.mean(printed)
+        layers.Print(input=x, **kargs)
+        loss = layers.mean(x)
         append_backward(loss=loss)
         return loss
diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py
index 0a8a43253d79ba21c7333dd19af05d8adf410289..032af6ed5ce9e1007d6775306ef4c0aefb9dcc41 100644
--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scale_op.py
@@ -17,6 +17,8 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
 
 
 class TestScaleOp(OpTest):
@@ -33,5 +35,57 @@ class TestScaleOp(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestScaleOpSelectedRows(unittest.TestCase):
+    def check_with_place(self, place, in_name, out_name):
+        scope = core.Scope()
+
+        # create and initialize the input SelectedRows variable
+        in_height = 10
+        in_rows = [0, 4, 7]
+        in_row_numel = 12
+        scale = 2.0
+
+        in_selected_rows = scope.var(in_name).get_selected_rows()
+        in_selected_rows.set_height(in_height)
+        in_selected_rows.set_rows(in_rows)
+        in_array = np.random.random(
+            (len(in_rows), in_row_numel)).astype("float32")
+
+        in_tensor = in_selected_rows.get_tensor()
+        in_tensor.set(in_array, place)
+
+        # create and initialize the output SelectedRows variable
+        out_selected_rows = scope.var(out_name).get_selected_rows()
+        out_tensor = out_selected_rows.get_tensor()
+        out_tensor._set_dims(in_tensor._get_dims())
+
+        # create and run the scale operator
+        scale_op = Operator("scale", X=in_name, Out=out_name, scale=scale)
+        scale_op.run(scope, place)
+
+        # get and compare the result
+        out_height = out_selected_rows.height()
+        out_rows = out_selected_rows.rows()
+        result_array = np.array(out_tensor)
+
+        assert (in_array * scale == result_array).all()
+        assert in_height == out_height
+        assert in_rows == out_rows
+
+    def test_scale_selected_rows(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place, 'in', 'out')
+
+    def test_scale_selected_rows_inplace(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place, 'in', 'in')
+
+
 if __name__ == "__main__":
     unittest.main()
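# [Editor's note] The new test_sequence_pad_op.py below checks `sequence_pad`,
# which right-pads each sequence of a packed LoD tensor to a common length.
# A minimal numpy sketch of that semantics (illustrative only, not part of the
# patch); a pad value of shape (1,) is broadcast to the shape of one time step:
import numpy as np

def sequence_pad_ref(x, seq_lens, pad_value, padded_length=-1):
    pad_value = np.asarray(pad_value, dtype=x.dtype)
    if padded_length == -1:
        padded_length = max(seq_lens)  # default: the longest sequence
    if pad_value.shape == (1, ):
        pad_value = np.broadcast_to(pad_value, x.shape[1:])
    out, start = [], 0
    for n in seq_lens:
        pad = np.repeat(pad_value[np.newaxis, ...], padded_length - n, axis=0)
        out.append(np.concatenate([x[start:start + n], pad], axis=0))
        start += n
    return np.stack(out)

# e.g. sequence_pad_ref(np.random.rand(12, 4), [2, 3, 4, 3], [0.0]).shape
# evaluates to (4, 4, 4): four sequences, each padded to length 4.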
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..471515c817541976a06eb024fa3d4f77b78f920d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSequencePadOp(OpTest):
+    def set_attr(self):
+        self.x_shape = [12, 4]
+        self.x_len_lod = [[2, 3, 4, 3]]
+        self.pad_value = [1.0]
+        self.padded_length = -1
+        self.dtype = 'float32'
+
+    def set_data(self):
+        x_data = np.random.uniform(0.1, 0.5, self.x_shape).astype(self.dtype)
+        pad_value_data = np.array(self.pad_value).astype(self.dtype)
+        self.inputs = {
+            'X': (x_data, self.x_len_lod),
+            'PadValue': pad_value_data
+        }
+        self.attrs = {'padded_length': self.padded_length}
+
+    def compute(self):
+        # get padded length
+        padded_length = self.padded_length
+        x_len_lod_0 = self.x_len_lod[0]
+        if padded_length == -1:
+            max_seq_len = 0
+            for l in x_len_lod_0:
+                max_seq_len = max(max_seq_len, l)
+            padded_length = max_seq_len
+
+        # do padding
+        x_data = self.inputs['X'][0]
+        pad_value_data = self.inputs['PadValue']
+        if pad_value_data.shape == (1, ):
+            pad_value_data = np.broadcast_to(
+                pad_value_data, shape=x_data.shape[1:])
+        padded_sequences = []
+        start_idx = 0
+        for l in x_len_lod_0:
+            end_idx = start_idx + l
+            seq = x_data[start_idx:end_idx]
+            to_pad_len = padded_length - l
+            for _ in range(to_pad_len):
+                seq = np.append(seq, pad_value_data[np.newaxis, :], axis=0)
+            padded_sequences.append(seq)
+            start_idx = end_idx
+
+        out_data = np.array(padded_sequences)
+        self.outputs = {'Out': out_data}
+
+    def setUp(self):
+        self.op_type = 'sequence_pad'
+        self.set_attr()
+        self.set_data()
+        self.compute()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestSequencePadOp2(TestSequencePadOp):
+    def set_attr(self):
+        self.x_shape = [12, 4]
+        self.x_len_lod = [[2, 3, 4, 3]]
+        self.pad_value = [1.0, 2.0, 3.0, 4.0]
+        self.padded_length = -1
+        self.dtype = 'float32'
+
+
+class TestSequencePadOp3(TestSequencePadOp):
+    def set_attr(self):
+        self.x_shape = [12, 4]
+        self.x_len_lod = [[2, 3, 4, 3]]
+        self.pad_value = [1.0]
+        self.padded_length = 7
+        self.dtype = 'float32'
+
+
+class TestSequencePadOp4(TestSequencePadOp):
+    def set_attr(self):
+        self.x_shape = [12, 4]
+        self.x_len_lod = [[2, 3, 4, 3]]
+        self.pad_value = [1.0, 2.0, 3.0, 4.0]
+        self.padded_length = 7
+        self.dtype = 'float32'
+
+
+class TestSequencePadOp5(TestSequencePadOp):
+    def set_attr(self):
+        self.x_shape = [12, 2, 2]
+        self.x_len_lod = [[2, 3, 4, 3]]
+        self.pad_value = [1.0]
+        self.padded_length = -1
+        self.dtype = 'float32'
+
+
+class TestSequencePadOp6(TestSequencePadOp):
+    def set_attr(self):
+        self.x_shape = [12, 2, 2]
+        self.x_len_lod = [[2, 3, 4, 3]]
+        self.pad_value = [[1.0, 2.0], [3.0, 4.0]]
+        self.padded_length = -1
+        self.dtype = 'float32'
+
+
+class TestSequencePadOp7(TestSequencePadOp):
+    def set_attr(self):
+        self.x_shape = [12, 2, 2]
+        self.x_len_lod = [[2, 3, 4, 3]]
+        self.pad_value = [1.0]
+        self.padded_length = 7
+        self.dtype = 'float32'
diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py
index e9d0f8a0193c77da33a8cf128dbf8a1c5087782b..1822957c23d0bb1e4821373515d4faef2b76950e 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
@@ -59,6 +59,27 @@ class TestTensor(unittest.TestCase):
         self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
         self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
 
+    def test_int8_tensor(self):
+        scope = core.Scope()
+        var = scope.var("int8_tensor")
+        cpu_tensor = var.get_tensor()
+        tensor_array = numpy.random.randint(
+            -127, high=128, size=[100, 200], dtype=numpy.int8)
+        place = core.CPUPlace()
+        cpu_tensor.set(tensor_array, place)
+        cpu_tensor_array_2 = numpy.array(cpu_tensor)
+        self.assertTrue(numpy.array_equal(cpu_tensor_array_2, tensor_array))
+
+        if core.is_compiled_with_cuda():
+            cuda_tensor = var.get_tensor()
+            tensor_array = numpy.random.randint(
+                -127, high=128, size=[100, 200], dtype=numpy.int8)
+            place = core.CUDAPlace(0)
+            cuda_tensor.set(tensor_array, place)
+            cuda_tensor_array_2 = numpy.array(cuda_tensor)
+            self.assertTrue(
+                numpy.array_equal(cuda_tensor_array_2, tensor_array))
+
     def test_int_lod_tensor(self):
         place = core.CPUPlace()
         scope = core.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_unstack_op.py b/python/paddle/fluid/tests/unittests/test_unstack_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cbac8928ec40dc3e1c0e91e7779ec9ec978d884
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_unstack_op.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from op_test import OpTest
+import numpy as np
+import unittest
+
+
+class TestUnStackOpBase(OpTest):
+    def initDefaultParameters(self):
+        self.input_dim = (5, 6, 7)
+        self.axis = 0
+        self.dtype = 'float32'
+
+    def initParameters(self):
+        pass
+
+    def get_y_names(self):
+        y_names = []
+        for i in range(self.input_dim[self.axis]):
+            y_names.append('y{}'.format(i))
+        return y_names
+
+    def setUp(self):
+        self.initDefaultParameters()
+        self.initParameters()
+        self.op_type = 'unstack'
+        self.x = np.random.random(size=self.input_dim).astype(self.dtype)
+
+        outs = np.split(self.x, self.input_dim[self.axis], self.axis)
+        new_shape = list(self.input_dim)
+        del new_shape[self.axis]
+        y_names = self.get_y_names()
+        tmp = []
+        for i in range(self.input_dim[self.axis]):
+            tmp.append((y_names[i], np.reshape(outs[i], new_shape)))
+
+        self.inputs = {'X': self.x}
+        self.outputs = {'Y': tmp}
+        self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], self.get_y_names())
+
+
+class TestUnStackOp3(TestUnStackOpBase):
+    def initParameters(self):
+        self.axis = -1
+
+
+class TestUnStackOp4(TestUnStackOpBase):
+    def initParameters(self):
+        self.axis = -3
+
+
+class TestUnStackOp5(TestUnStackOpBase):
+    def initParameters(self):
+        self.axis = 1
+
+
+class TestUnStackOp6(TestUnStackOpBase):
+    def initParameters(self):
+        self.axis = 2
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py
index b0830e130dd9a9037f8dd900a256eea3d05f64b8..4f3c26ca7bdf4d807952b413c8b0dc8b211c06f6 100644
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py
@@ -31,7 +31,8 @@ class TestVariable(unittest.TestCase):
         self.assertEqual(DT.INT16, convert("int16"))
         self.assertEqual(DT.INT64, convert("int64"))
         self.assertEqual(DT.BOOL, convert("bool"))
-        self.assertRaises(ValueError, lambda: convert("int8"))
+        self.assertEqual(DT.INT8, convert("int8"))
+        self.assertEqual(DT.UINT8, convert("uint8"))
 
     def test_var(self):
         b = default_main_program().current_block()
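# [Editor's note] test_unstack_op.py above builds its expected outputs with
# np.split plus a reshape; equivalently, `unstack` yields x.shape[axis] slices
# with that axis squeezed out. A minimal numpy sketch (illustrative only, not
# part of the patch):
import numpy as np

def unstack_ref(x, axis=0):
    # one array per index along `axis`, each with that axis removed
    return [np.squeeze(s, axis=axis)
            for s in np.split(x, x.shape[axis], axis=axis)]

# e.g. unstack_ref(np.zeros((5, 6, 7)), axis=1) yields 6 arrays of shape (5, 7).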
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 80d9758b3dd0d6c57adc888c492cac26da3939d0..21c51bd139ed72322aa363f42c8ead402a501bff 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -31,7 +31,6 @@ Steps to transpile pserver:
 """
 
 import math
-import random
 import numpy as np
 import collections
 import six
@@ -239,8 +238,8 @@ class DistributeTranspiler(object):
         grad_var_mapping_items = list(six.iteritems(self.grad_var_mapping))
 
         if not self.config.slice_var_up:
-            random.seed(self.origin_program.random_seed)
-            random.shuffle(grad_var_mapping_items)
+            np.random.seed(self.origin_program.random_seed)
+            np.random.shuffle(grad_var_mapping_items)
 
         grad_name_to_send_dummy_out = dict()
         for grad_varname, splited_vars in grad_var_mapping_items:
@@ -284,10 +283,13 @@ class DistributeTranspiler(object):
                 send_vars.append(var)
 
         if self.sync_mode:
+            send_barrier_out = program.global_block().create_var(
+                name=framework.generate_control_dev_var_name())
+            input_deps = grad_name_to_send_dummy_out.values()
             program.global_block().append_op(
                 type="send_barrier",
-                inputs={},
-                outputs={},
+                inputs={"X": input_deps},
+                outputs={"Out": send_barrier_out},
                 attrs={
                     "endpoints": pserver_endpoints,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -305,16 +307,22 @@ class DistributeTranspiler(object):
             self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
 
         # step4: Concat the parameter splits together after recv.
+        all_recv_outputs = []
         for param_varname, splited_var in six.iteritems(self.param_var_mapping):
             eps = []
             for var in splited_var:
                 index = [v.name for v in recv_vars].index(var.name)
                 eps.append(eplist[index])
-            grad_send_dummy_out = grad_name_to_send_dummy_out[
-                self.param_name_to_grad_name[param_varname]]
+            if self.sync_mode:
+                recv_dep_in = send_barrier_out
+            else:
+                # in async mode, depend on the matching send op's dummy output
+                recv_dep_in = grad_name_to_send_dummy_out[
+                    self.param_name_to_grad_name[param_varname]]
+            all_recv_outputs.extend(splited_var)
             program.global_block().append_op(
                 type="recv",
-                inputs={"X": [grad_send_dummy_out]},
+                inputs={"X": [recv_dep_in]},
                 outputs={"Out": splited_var},
                 attrs={
                     "epmap": eps,
@@ -327,10 +335,11 @@ class DistributeTranspiler(object):
                 })
 
         if self.sync_mode:
+            # form a WAW (write-after-write) dependency on all recv outputs
             program.global_block().append_op(
                 type="fetch_barrier",
                 inputs={},
-                outputs={},
+                outputs={"Out": all_recv_outputs},
                 attrs={
                     "endpoints": pserver_endpoints,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -414,10 +423,12 @@ class DistributeTranspiler(object):
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
 
+        fetch_barrier_out = startup_program.global_block().create_var(
+            name=framework.generate_control_dev_var_name())
         startup_program.global_block().append_op(
             type="fetch_barrier",
             inputs={},
-            outputs={},
+            outputs={"Out": fetch_barrier_out},
             attrs={
                 "endpoints": self.pserver_endpoints,
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE