diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 67994aad70a40c0e0c8a311914d4ea40b96eaf1e..6e682b69583e00ab1bbe1c0d22e21ae114a61a76 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -21,7 +21,7 @@ else fi USE_TENSORRT=OFF -if [ [-d"$TENSORRT_INCLUDE_DIR"] -a [-d"$TENSORRT_LIB_DIR"] ]; then +if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then USE_TENSORRT=ON fi diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index f9bb66a6e9f81a10368db7710108c319860e940a..677f85152f202b514d0563f885d872c84faba19a 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -42,16 +42,22 @@ class Pool2dOpConverter : public OpConverter { boost::get>(op_desc.GetAttr("strides")); std::vector paddings = boost::get>(op_desc.GetAttr("paddings")); + bool ceil_mode = boost::get(op_desc.GetAttr("ceil_mode")); + nvinfer1::Dims input_shape = input1->getDimensions(); + int nbDims = input_shape.nbDims; nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + if (global_pooling == true) { - nvinfer1::Dims input_shape = input1->getDimensions(); - int nbDims = input_shape.nbDims; nv_ksize.d[0] = input_shape.d[nbDims - 2]; nv_ksize.d[1] = input_shape.d[nbDims - 1]; + nv_strides.h() = 1; + nv_strides.w() = 1; + nv_paddings.h() = 0; + nv_paddings.w() = 0; } - const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); - const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL); @@ -64,6 +70,36 @@ class Pool2dOpConverter : public OpConverter { PADDLE_THROW("TensorRT unsupported pooling type!"); } + if (ceil_mode) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); + int input_height = input_shape.d[nbDims - 2]; + int input_width = input_shape.d[nbDims - 1]; + int floor_h_output_size = + (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + int ceil_h_output_size = + (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) / + strides[0] + + 1; + + int floor_w_output_size = + (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + int ceil_w_output_size = + (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) / + strides[1] + + 1; + if (floor_h_output_size != ceil_h_output_size) { + post_pad.h() = strides[0] - 1; + } + + if (floor_w_output_size != ceil_w_output_size) { + post_pad.w() = strides[1] - 1; + } + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, Padding, *const_cast(input1), pre_pad, + post_pad); + input1 = layer->getOutput(0); + } auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(input1), nv_pool_type, nv_ksize); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index aedd6b62df040eeee4e48f628128511cd8bf4439..ee597f8465c218c0fb6648374c128cabf7b033fb 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -20,18 +20,20 @@ namespace paddle { namespace inference { namespace tensorrt { -void test_pool2d(bool global_pooling) { +void test_pool2d(bool global_pooling, bool ceil_mode) { framework::Scope scope; std::unordered_set parameters; TRTConvertValidation validator(5, parameters, scope, 1 << 15); // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. - validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); + validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 13, 14)); if (global_pooling) validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1)); + else if (ceil_mode) + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 7)); else - validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 6)); // Prepare Op description framework::OpDesc desc; @@ -39,7 +41,7 @@ void test_pool2d(bool global_pooling) { desc.SetInput("X", {"pool2d-X"}); desc.SetOutput("Out", {"pool2d-Out"}); - std::vector ksize({2, 2}); + std::vector ksize({3, 3}); std::vector strides({2, 2}); std::vector paddings({0, 0}); std::string pooling_t = "max"; @@ -49,6 +51,7 @@ void test_pool2d(bool global_pooling) { desc.SetAttr("strides", strides); desc.SetAttr("paddings", paddings); desc.SetAttr("global_pooling", global_pooling); + desc.SetAttr("ceil_mode", ceil_mode); LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); @@ -57,9 +60,10 @@ void test_pool2d(bool global_pooling) { validator.Execute(3); } -TEST(Pool2dOpConverter, normal) { test_pool2d(false); } +TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); } +TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); } -TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true); } +TEST(Pool2dOpConverter, test_ceil_mode) { test_pool2d(false, true); } } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index dda423efd35b96f5e1d7c55389818f46ef3d8694..46fff9d338b7759496faaf6dd9960d34887755ba 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -52,6 +52,9 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->HasOutput("TargetBBox"), "Output(TargetBBox) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("BBoxInsideWeight"), + "Output(BBoxInsideWeight) of RpnTargetAssignOp should not be null"); auto anchor_dims = ctx->GetInputDim("Anchor"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); @@ -68,6 +71,7 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ScoreIndex", {-1}); ctx->SetOutputDim("TargetLabel", {-1, 1}); ctx->SetOutputDim("TargetBBox", {-1, 4}); + ctx->SetOutputDim("BBoxInsideWeight", {-1, 4}); } protected: @@ -169,6 +173,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data, const float rpn_positive_overlap, const float rpn_negative_overlap, std::vector* fg_inds, std::vector* bg_inds, std::vector* tgt_lbl, + std::vector* fg_fake, std::vector* bbox_inside_weight, std::minstd_rand engine, bool use_random) { float epsilon = 0.00001; int anchor_num = anchor_to_gt_max.dims()[0]; @@ -201,12 +206,12 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data, // Reservoir Sampling int fg_num = static_cast(rpn_fg_fraction * rpn_batch_size_per_im); ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); - fg_num = static_cast(fg_inds_fake.size()); - for (int64_t i = 0; i < fg_num; ++i) { + int fg_fake_num = static_cast(fg_inds_fake.size()); + for (int64_t i = 0; i < fg_fake_num; ++i) { target_label[fg_inds_fake[i]] = 1; } - int bg_num = rpn_batch_size_per_im - fg_num; + int bg_num = rpn_batch_size_per_im - fg_fake_num; for (int64_t i = 0; i < anchor_num; ++i) { if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { bg_inds_fake.push_back(i); @@ -214,12 +219,28 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data, } ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); bg_num = static_cast(bg_inds_fake.size()); + int fake_num = 0; for (int64_t i = 0; i < bg_num; ++i) { + // fg fake found + if (target_label[bg_inds_fake[i]] == 1) { + fake_num++; + fg_fake->emplace_back(fg_inds_fake[0]); + for (int j = 0; j < 4; ++j) { + bbox_inside_weight->emplace_back(T(0.)); + } + } target_label[bg_inds_fake[i]] = 0; } + for (int64_t i = 0; i < (fg_fake_num - fake_num) * 4; ++i) { + bbox_inside_weight->emplace_back(T(1.)); + } + for (int64_t i = 0; i < anchor_num; ++i) { - if (target_label[i] == 1) fg_inds->emplace_back(i); + if (target_label[i] == 1) { + fg_inds->emplace_back(i); + fg_fake->emplace_back(i); + } if (target_label[i] == 0) bg_inds->emplace_back(i); } fg_num = fg_inds->size(); @@ -248,7 +269,8 @@ std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, std::vector bg_inds; std::vector gt_inds; std::vector tgt_lbl; - + std::vector fg_fake; + std::vector bbox_inside_weight; // Calculate the max IoU between anchors and gt boxes // Map from anchor to gt box that has highest overlap auto place = ctx.GetPlace(); @@ -275,32 +297,37 @@ std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, // Follow the Faster RCNN's implementation ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap, - rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine, - use_random); + rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, &fg_fake, + &bbox_inside_weight, engine, use_random); int fg_num = fg_inds.size(); int bg_num = bg_inds.size(); - gt_inds.reserve(fg_num); - for (int i = 0; i < fg_num; ++i) { - gt_inds.emplace_back(argmax[fg_inds[i]]); + int fg_fake_num = fg_fake.size(); + gt_inds.reserve(fg_fake_num); + for (int i = 0; i < fg_fake_num; ++i) { + gt_inds.emplace_back(argmax[fg_fake[i]]); } - - Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t; - int* loc_index_data = loc_index_t.mutable_data({fg_num}, place); + Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t; + int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); int* score_index_data = score_index_t.mutable_data({fg_num + bg_num}, place); int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); - int* gt_inds_data = gt_inds_t.mutable_data({fg_num}, place); - std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data); + int* gt_inds_data = gt_inds_t.mutable_data({fg_fake_num}, place); + T* bbox_inside_weight_data = + bbox_inside_weight_t.mutable_data({fg_fake_num, 4}, place); + std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data); std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); + std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(), + bbox_inside_weight_data); std::vector loc_score_tgtlbl_gt; loc_score_tgtlbl_gt.emplace_back(loc_index_t); loc_score_tgtlbl_gt.emplace_back(score_index_t); loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); loc_score_tgtlbl_gt.emplace_back(gt_inds_t); + loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t); return loc_score_tgtlbl_gt; } @@ -318,6 +345,7 @@ class RpnTargetAssignKernel : public framework::OpKernel { auto* score_index = context.Output("ScoreIndex"); auto* tgt_bbox = context.Output("TargetBBox"); auto* tgt_lbl = context.Output("TargetLabel"); + auto* bbox_inside_weight = context.Output("BBoxInsideWeight"); PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, "RpnTargetAssignOp gt_boxes needs 1 level of LoD"); @@ -340,7 +368,7 @@ class RpnTargetAssignKernel : public framework::OpKernel { score_index->mutable_data({max_num}, place); tgt_bbox->mutable_data({max_num, 4}, place); tgt_lbl->mutable_data({max_num, 1}, place); - + bbox_inside_weight->mutable_data({max_num, 4}, place); auto& dev_ctx = context.device_context(); std::random_device rnd; @@ -394,6 +422,7 @@ class RpnTargetAssignKernel : public framework::OpKernel { Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; int loc_num = sampled_loc_index.dims()[0]; int score_num = sampled_score_index.dims()[0]; @@ -432,6 +461,8 @@ class RpnTargetAssignKernel : public framework::OpKernel { AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); + AppendRpns(bbox_inside_weight, total_loc_num * 4, + &sampled_bbox_inside_weight); total_loc_num += loc_num; total_score_num += score_num; @@ -448,10 +479,12 @@ class RpnTargetAssignKernel : public framework::OpKernel { score_index->set_lod(loc_score); tgt_bbox->set_lod(lod_loc); tgt_lbl->set_lod(loc_score); + bbox_inside_weight->set_lod(lod_loc); loc_index->Resize({total_loc_num}); score_index->Resize({total_score_num}); tgt_bbox->Resize({total_loc_num, 4}); tgt_lbl->Resize({total_score_num, 1}); + bbox_inside_weight->Resize({total_loc_num, 4}); } }; @@ -514,6 +547,9 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { "TargetLabel", "(Tensor), The target labels of each anchor with shape " "[F + B, 1], F and B are sampled foreground and backgroud number."); + AddOutput("BBoxInsideWeight", + "(Tensor), The bbox inside weight with shape " + "[F, 4], F is the sampled foreground number."); AddComment(R"DOC( This operator can be, for a given set of ground truth bboxes and the anchors, to assign classification and regression targets to each prediction. diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index a04c1c1263fba659e2d3f623b607e9f476bb40ed..120b2ab440156f6020fd6005dd64a48e9a6918ec 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -16,10 +16,9 @@ limitations under the License. */ #include // for memcpy #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/sequence2batch.h" -#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -174,58 +173,44 @@ class FusionGRUKernel : public framework::OpKernel { } } -#define INIT_VEC_FUNC \ - std::function act_gate, act_state; \ - std::function cross; \ - auto& act_gate_str = ctx.Attr("gate_activation"); \ - auto& act_state_str = ctx.Attr("activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_state = act_functor(act_state_str); \ - cross = math::vec_cross; \ - } else { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_state = act_functor(act_state_str); \ - cross = math::vec_cross; \ - } - -#define INIT_BASE_INPUT_OUTPUT \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ - auto* xx = ctx.Output("XX"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); - -#define INIT_BASE_SIZES \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 3D*/ \ - const int total_T = x_dims[0]; \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D3 = wh_dims[1]; \ - const int D2 = D * 2; +#define INIT_BASE_DEFINES \ + auto* x = ctx.Input("X"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* xx = ctx.Output("XX"); \ + auto x_lod = x->lod(); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 3D*/ \ + const int total_T = x_dims[0]; \ + const int D3 = wh_dims[1] + +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const auto& ker = math::jitkernel::KernelPool::Instance() \ + .template Get, \ + const std::string&, const std::string&>( \ + ctx.Attr("gate_activation"), \ + ctx.Attr("activation"), D); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ + T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = paddle::platform::CPUDeviceContext; - auto* x = ctx.Input("X"); - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES - INIT_VEC_FUNC - - auto x_lod = x->lod(); + INIT_BASE_DEFINES; + INIT_OTHER_DEFINES; const int N = x_lod[0].size() - 1; - const T* x_data = x->data(); const T* h0_data = h0 ? h0->data() : nullptr; - const T* wx_data = wx->data(); - const T* wh_data = wh->data(); const T* wh_state_data = wh_data + D * D2; - T* xx_data = xx->mutable_data(ctx.GetPlace()); - T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); - + T* hidden_out_data = hidden_out->mutable_data(place); auto blas = math::GetBlas(ctx); math::FCCompute(blas, total_T, D3, M, x_data, wx_data, xx_data, @@ -252,14 +237,7 @@ class FusionGRUKernel : public framework::OpKernel { if (h0_data) { prev_hidden_data = h0_data + bid * D; } else { - // W: {W_update, W_reset; W_state} - // update gate - act_gate(D, xx_data, xx_data); - // state gate - act_state(D, xx_data + D2, xx_data + D2); - // out = a*b - blas.VMUL(D, xx_data, xx_data + D2, hidden_out_data); - // save prev + ker->ComputeH1(xx_data, hidden_out_data); prev_hidden_data = hidden_out_data; tstart = 1; move_step(); @@ -269,17 +247,12 @@ class FusionGRUKernel : public framework::OpKernel { blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast(1), prev_hidden_data, D, wh_data, D2, static_cast(1), xx_data, D3); - act_gate(D2, xx_data, xx_data); - // rt = rt*ht_1 inplace result - blas.VMUL(D, prev_hidden_data, xx_data + D, hidden_out_data); - + ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data); // gemm rt * Ws blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast(1), hidden_out_data, D, wh_state_data, D, static_cast(1), xx_data + D2, D3); - act_state(D, xx_data + D2, xx_data + D2); - // out = zt*ht~ + (1-zt)*ht_1 - cross(D, xx_data, xx_data + D2, prev_hidden_data, hidden_out_data); + ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data); // save prev prev_hidden_data = hidden_out_data; move_step(); @@ -289,28 +262,19 @@ class FusionGRUKernel : public framework::OpKernel { void BatchCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = paddle::platform::CPUDeviceContext; - auto* x = ctx.Input("X"); - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES - if (x->lod()[0].size() == 2) { + INIT_BASE_DEFINES; + if (x_lod[0].size() == 2) { xx->Resize({total_T, D3}); SeqCompute(ctx); return; } - INIT_VEC_FUNC - + INIT_OTHER_DEFINES; auto* reordered_h0 = ctx.Output("ReorderedH0"); auto* batched_input = ctx.Output("BatchedInput"); auto* batched_out = ctx.Output("BatchedOut"); - - const T* x_data = x->data(); - const T* wx_data = wx->data(); - const T* wh_data = wh->data(); - T* xx_data = xx->mutable_data(ctx.GetPlace()); - T* batched_input_data = batched_input->mutable_data(ctx.GetPlace()); - T* batched_out_data = batched_out->mutable_data(ctx.GetPlace()); - hidden_out->mutable_data(ctx.GetPlace()); - + T* batched_input_data = batched_input->mutable_data(place); + T* batched_out_data = batched_out->mutable_data(place); + hidden_out->mutable_data(place); auto& dev_ctx = ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); math::LoDTensor2BatchFunctor to_batch; @@ -336,7 +300,7 @@ class FusionGRUKernel : public framework::OpKernel { T* prev_hidden_data = nullptr; if (h0) { // reorder h0 - T* reordered_h0_data = reordered_h0->mutable_data(ctx.GetPlace()); + T* reordered_h0_data = reordered_h0->mutable_data(place); const T* h0_data = h0->data(); prev_hidden_data = reordered_h0_data; size_t sz = sizeof(T) * D; @@ -350,12 +314,7 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; // W: {W_update, W_reset; W_state} for (int i = 0; i < max_bs; ++i) { - // update gate - act_gate(D, cur_in_data, cur_in_data); - // state gate - act_state(D, cur_in_data + D2, cur_in_data + D2); - // out = a*b - blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data); + ker->ComputeH1(cur_in_data, cur_out_data); // add offset cur_in_data += D3; cur_out_data += D; @@ -380,10 +339,8 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; T* cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - act_gate(D2, cur_batched_data, cur_batched_data); - // rt = rt*ht_1 inplace result - blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D, cur_out_data); - + ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data, + cur_out_data); cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; @@ -397,12 +354,8 @@ class FusionGRUKernel : public framework::OpKernel { cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - // ht~ = act_state(...) - act_state(D, cur_batched_data + D2, cur_batched_data + D2); - // out = zt*ht~ + (1-zt)*ht_1 - cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data, - cur_out_data); - + ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data, + cur_out_data); cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; @@ -416,9 +369,8 @@ class FusionGRUKernel : public framework::OpKernel { batched_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_out, hidden_out); } -#undef INIT_VEC_FUNC -#undef INIT_BASE_SIZES -#undef INIT_BASE_INPUT_OUTPUT +#undef INIT_OTHER_DEFINES +#undef INIT_BASE_DEFINES }; } // namespace operators diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 5d0c0b4228d8e2890c8b8d8bd10e0df080251350..55e2ea760158cda631ec07e2c7d318ec1cf79b77 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -68,6 +68,7 @@ cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selec cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) +cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) if(WITH_GPU) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function) @@ -75,6 +76,6 @@ endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_library(jit_kernel - SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc + SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc DEPS cpu_info cblas) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index e91e4e8e5adfdfff8163efe7fc1451bc602504e0..9088d0c7a6307c3fbd9707c719ec9e6f6c85fbdb 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -142,6 +142,15 @@ class LSTMKernel : public Kernel { const T *wp_data = nullptr) const = 0; }; +template +class GRUKernel : public Kernel { + public: + // compute h1 without h0 + virtual void ComputeH1(T *gates, T *ht) const = 0; + virtual void ComputeHtPart1(T *gates, const T *ht_1, T *ht) const = 0; + virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0; +}; + } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_lstm.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc similarity index 65% rename from paddle/fluid/operators/math/jit_kernel_lstm.cc rename to paddle/fluid/operators/math/jit_kernel_rnn.cc index 26bd26e2e171feea569fbd646a9caf03bebbaa46..c0847f0bee415393255983468349ad314716bcb3 100644 --- a/paddle/fluid/operators/math/jit_kernel_lstm.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -136,6 +136,21 @@ static std::shared_ptr> GetActKernel( return nullptr; } +template +static std::unique_ptr GetAVXAct(const std::string& type) { + if (type == "sigmoid") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "relu") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "tanh") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "identity" || type == "") { + return std::unique_ptr(new AVXActImpl()); + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + /* LSTM JitKernel */ template class LSTMKernelImpl : public LSTMKernel { @@ -192,61 +207,49 @@ class LSTMKernelImpl : public LSTMKernel { #endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - LSTMKernelImpl::LSTMKernelImpl( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d) \ - : LSTMKernel() { \ - auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr { \ - if (type == "sigmoid") { \ - return std::unique_ptr(new AVXActImpl()); \ - } else if (type == "relu") { \ - return std::unique_ptr(new AVXActImpl()); \ - } else if (type == "tanh") { \ - return std::unique_ptr(new AVXActImpl()); \ - } else if (type == "identity" || type == "") { \ - return std::unique_ptr(new AVXActImpl()); \ - } \ - PADDLE_THROW("Not support type: %s", type); \ - }; \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_cand_ = GetAVXAct(act_cand); \ - avx_act_cell_ = GetAVXAct(act_cell); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeCtHt( \ - float* gates, const float* ct_1, float* ct, float* ht, \ - const float* wp_data, float* checked) const { \ - /* gates: W_ch, W_ih, W_fh, W_oh */ \ - __m256 c, i, f, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - f = _mm256_loadu_ps(gates + 16); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ - i = _mm256_loadu_ps(ct_1); \ - f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ - f = _mm256_add_ps(c, f); \ - _mm256_storeu_ps(ct, f); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeC1H1( \ - float* gates, float* ct, float* ht, const float* wp_data) const { \ - __m256 c, i, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = igated * cgated*/ \ - c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \ - _mm256_storeu_ps(ct, c); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ +#define INTRI8_FLOAT(isa) \ + template <> \ + LSTMKernelImpl::LSTMKernelImpl( \ + const std::string& act_gate, const std::string& act_cand, \ + const std::string& act_cell, int d) \ + : LSTMKernel() { \ + avx_act_gate_ = GetAVXAct(act_gate); \ + avx_act_cand_ = GetAVXAct(act_cand); \ + avx_act_cell_ = GetAVXAct(act_cell); \ + } \ + template <> \ + void LSTMKernelImpl::ComputeCtHt( \ + float* gates, const float* ct_1, float* ct, float* ht, \ + const float* wp_data, float* checked) const { \ + /* gates: W_ch, W_ih, W_fh, W_oh */ \ + __m256 c, i, f, o; \ + c = _mm256_loadu_ps(gates); \ + i = _mm256_loadu_ps(gates + 8); \ + f = _mm256_loadu_ps(gates + 16); \ + o = _mm256_loadu_ps(gates + 24); \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ + i = _mm256_loadu_ps(ct_1); \ + f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ + f = _mm256_add_ps(c, f); \ + _mm256_storeu_ps(ct, f); \ + /* H_t = act_cell(C_t) * ogated */ \ + o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ + _mm256_storeu_ps(ht, o); \ + } \ + template <> \ + void LSTMKernelImpl::ComputeC1H1( \ + float* gates, float* ct, float* ht, const float* wp_data) const { \ + __m256 c, i, o; \ + c = _mm256_loadu_ps(gates); \ + i = _mm256_loadu_ps(gates + 8); \ + o = _mm256_loadu_ps(gates + 24); \ + /* C_t = igated * cgated*/ \ + c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \ + _mm256_storeu_ps(ct, c); \ + /* H_t = act_cell(C_t) * ogated */ \ + o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \ + _mm256_storeu_ps(ht, o); \ } // TODO(TJ): optimize keq16 @@ -354,6 +357,126 @@ REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, #undef JITKERNEL_DECLARE_LSTM #undef JITKERNEL_KEY_LSTM #undef JITKERNEL_NEW_LSTM_IMPL + +/* GRU JitKernel */ +template +class GRUKernelImpl : public GRUKernel { + public: + explicit GRUKernelImpl(const std::string& act_gate, + const std::string& act_state, int d) + : GRUKernel() { + d_ = d; + d2_ = d * 2; + act_gate_d2_ = GetActKernel(act_gate, d2_); + act_gate_d_ = GetActKernel(act_gate, d); + act_state_d_ = GetActKernel(act_state, d); + vmul_d_ = KernelPool::Instance().template Get>(d); + } + + void ComputeH1(T* gates, T* ht) const override { + act_gate_d_->Compute(gates, gates); + act_state_d_->Compute(gates + d2_, gates + d2_); + vmul_d_->Compute(gates, gates + d2_, ht); + } + + void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { + // W: {W_update, W_reset; W_state} + act_gate_d2_->Compute(gates, gates); + vmul_d_->Compute(ht_1, gates + d_, ht); + } + + void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { + T* y = gates + d2_; + act_state_d_->Compute(y, y); + // out = zt*ht~ + (1-zt)*ht_1 + for (int i = 0; i < d_; ++i) { + ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + } + } + + private: + int d_, d2_; + std::shared_ptr> act_gate_d2_, act_gate_d_, act_state_d_; + std::shared_ptr> vmul_d_; +#ifdef __AVX__ + std::unique_ptr avx_act_gate_, avx_act_state_; +#endif +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + GRUKernelImpl::GRUKernelImpl( \ + const std::string& act_gate, const std::string& act_state, int d) \ + : GRUKernel() { \ + avx_act_gate_ = GetAVXAct(act_gate); \ + avx_act_state_ = GetAVXAct(act_state); \ + } \ + template <> \ + void GRUKernelImpl::ComputeH1(float* gates, float* ht) \ + const { \ + __m256 u, s; \ + /* W: {W_update, W_reset; W_state} */ \ + u = _mm256_loadu_ps(gates); \ + s = _mm256_loadu_ps(gates + 16); \ + s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \ + _mm256_storeu_ps(ht, s); \ + } \ + template <> \ + void GRUKernelImpl::ComputeHtPart1( \ + float* gates, const float* ht_1, float* ht) const { \ + /* not exactly equal the any implementation */ \ + __m256 r, ht0; \ + r = _mm256_loadu_ps(gates + 8); \ + ht0 = _mm256_loadu_ps(ht_1); \ + r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0); \ + _mm256_storeu_ps(ht, r); \ + } \ + template <> \ + void GRUKernelImpl::ComputeHtPart2( \ + float* gates, const float* ht_1, float* ht) const { \ + /* not exactly equal the any implementation */ \ + __m256 u, s, ht0; \ + u = _mm256_loadu_ps(gates); \ + s = _mm256_loadu_ps(gates + 16); \ + ht0 = _mm256_loadu_ps(ht_1); \ + u = avx_act_gate_->Compute(u); \ + s = _mm256_mul_ps(u, avx_act_state_->Compute(s)); \ + u = _mm256_sub_ps(_mm256_set1_ps(1.f), u); \ + u = _mm256_mul_ps(u, ht0); \ + u = _mm256_add_ps(s, u); \ + _mm256_storeu_ps(ht, u); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif + +#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> KernelPool::Get< \ + GRUKernel, const std::string&, const std::string&, int>( \ + const std::string& act_gate, const std::string& act_state, int d) + +#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \ + #ker_key #dtype_key + std::to_string(d) + act_gate + act_state + +#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(act_gate, act_state, d)); + +REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DECLARE_GRU, + JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL); + +#undef INTRI8_FLOAT +#undef JITKERNEL_NEW_GRU_IMPL +#undef JITKERNEL_KEY_GRU +#undef JITKERNEL_DECLARE_GRU } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 235b5405fb7d016f4bd8c738f75b303522183116..7be8539a7b0f1890898fd386a3056601fda8a7c3 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -157,6 +157,31 @@ class FirstSeqPoolFunctor { } }; +template +class SumSeqPoolGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& out_grad, + framework::LoDTensor* in_grad) { + auto lod = in_grad->lod()[0]; + int64_t out_w = out_grad.numel() / out_grad.dims()[0]; + int64_t in_w = in_grad->numel() / in_grad->dims()[0]; + PADDLE_ENFORCE(in_w == out_w); + const T* out_g_data = out_grad.data(); + T* in_g_data = in_grad->mutable_data(context.GetPlace()); + auto blas = math::GetBlas(context); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t in_offset = lod[i] * in_w; + const T* out_pos = out_g_data + i * out_w; + T* in_pos = in_g_data + in_offset; + for (int r = 0; r != h; ++r) { + blas.VCOPY(in_w, out_pos, in_pos + r * in_w); + } + } + } +}; + template class SequencePoolFunctor { public: @@ -231,9 +256,15 @@ class SequencePoolGradFunctor { math::SetConstant functor; functor(context, in_grad, 0); } + + if (pooltype == "SUM") { + math::SumSeqPoolGradFunctor sum_pool_grad; + sum_pool_grad(context, out_grad, in_grad); + return; + } + auto lod = in_grad->lod()[0]; auto& place = *context.eigen_device(); - auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { auto in_g_t = in_grad->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); @@ -247,12 +278,6 @@ class SequencePoolGradFunctor { if (pooltype == "AVERAGE") { in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); - } else if (pooltype == "SUM") { - const T* out_g_data = out_g_t.data(); - T* in_g_data = in_g_t.mutable_data(context.GetPlace()); - for (int r = 0; r != h; ++r) { - blas.VCOPY(w, out_g_data, in_g_data + r * w); - } } else if (pooltype == "SQRT") { in_g_e.device(place) = (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2bc008dd34ffcfe93a00bd4a8cde61626d91e235 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_pooling.h" +#include +#include + +template +void TestSequencePoolingSum(const paddle::framework::LoD& lod) { + paddle::framework::LoDTensor cpu_out_grad; + paddle::framework::LoDTensor cpu_in_grad; + paddle::framework::LoDTensor out_grad; + paddle::framework::LoDTensor in_grad; + const size_t second_dim = 128u; + + // construct out_grad's tensor in cpu + const size_t out_first_dim = lod[0].size() - 1; + auto out_dims = paddle::framework::make_ddim( + {static_cast(out_first_dim), static_cast(second_dim)}); + + cpu_out_grad.mutable_data(out_dims, paddle::platform::CPUPlace()); + for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) { + cpu_out_grad.data()[i] = static_cast(i); + } + + // copy to dst out_grad + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + out_grad = cpu_out_grad; + } else { + TensorCopySync(cpu_out_grad, *place, &out_grad); + } + + // construct in_grad + in_grad.set_lod(lod); + auto in_dims = paddle::framework::make_ddim( + {static_cast(lod[0].back()), static_cast(second_dim)}); + in_grad.mutable_data(in_dims, context->GetPlace()); + + // check tensor contruction result + PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size()); + for (int64_t i = 1; i < out_grad.dims().size(); ++i) { + PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]); + } + + // call functor + paddle::operators::math::SequencePoolGradFunctor()( + *context, "SUM", out_grad, &in_grad); + + if (paddle::platform::is_cpu_place(*place)) { + cpu_in_grad = in_grad; + } else { + TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad); + cpu_in_grad.set_lod(in_grad.lod()); + } + + EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim); + EXPECT_EQ(in_grad.lod(), lod); + + if (paddle::platform::is_cpu_place(*place)) { + for (int64_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { + int64_t begin = in_grad.lod()[0][i]; + int64_t end = in_grad.lod()[0][i + 1]; + paddle::framework::Tensor tmp = in_grad.Slice(begin, end); + for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { + for (int64_t m = 0; m != second_dim; ++m) { + EXPECT_EQ(tmp.data()[m + j * second_dim], + out_grad.data()[m + i * second_dim]); + } + } + } + } else { + for (int64_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) { + int64_t begin = cpu_in_grad.lod()[0][i]; + int64_t end = cpu_in_grad.lod()[0][i + 1]; + paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end); + for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { + for (int64_t m = 0; m != second_dim; ++m) { + EXPECT_EQ(tmp.data()[m + j * second_dim], + cpu_out_grad.data()[m + i * second_dim]); + } + } + } + } + + delete place; + delete context; +} + +TEST(SequencePoolingGrad, CPU_SUM) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePoolingSum(lod1); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePoolingSum(lod2); +} + +#ifdef PADDLE_WITH_CUDA +TEST(SequencePoolingGrad, CUDA_SUM) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePoolingSum(lod1); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePoolingSum(lod2); +} +#endif diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 8e4a07556fb51dbb15ef948fcee120e2f68e089a..0cad224ca8860b0e4bc2e3f2bc1659235aadfe2d 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -262,31 +262,31 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, const T* src, int lds, int dim, int k, int grid_dim, int num) { __shared__ Pair sh_topk[BlockSize]; - __shared__ int maxid[BlockSize / 2]; const int tid = threadIdx.x; const int warp = threadIdx.x / 32; const int bid = blockIdx.x; for (int i = bid; i < num; i += grid_dim) { - output += i * output_stride; - indices += i * k; - + int top_num = k; + __shared__ int maxid[BlockSize / 2]; + T* out = output + i * output_stride; + int64_t* inds = indices + i * k; Pair topk[MaxLength]; int beam = MaxLength; Pair max; bool is_empty = false; bool firststep = true; - for (int k = 0; k < MaxLength; k++) { - topk[k].set(-INFINITY, -1); + for (int j = 0; j < MaxLength; j++) { + topk[j].set(-INFINITY, -1); } - while (k) { + while (top_num) { ThreadGetTopK( topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid); sh_topk[tid] = topk[0]; - BlockReduce(sh_topk, maxid, topk, &output, - &indices, &beam, &k, tid, warp); + BlockReduce(sh_topk, maxid, topk, &out, &inds, + &beam, &top_num, tid, warp); } } } @@ -327,13 +327,15 @@ class TopkOpCUDAKernel : public framework::OpKernel { size_t k = static_cast(ctx.Attr("k")); const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); // FIXME(typhoonzero): data is always converted to type T? int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - size_t input_height = input->dims()[0]; - size_t input_width = input->dims()[1]; + framework::DDim inputdims = input->dims(); + const size_t input_height = framework::product( + framework::slice_ddim(inputdims, 0, inputdims.size() - 1)); + const size_t input_width = inputdims[inputdims.size() - 1]; + if (k > input_width) k = input_width; // NOTE: pass lds and dim same to input width. @@ -342,14 +344,12 @@ class TopkOpCUDAKernel : public framework::OpKernel { const int kMaxHeight = 2048; int gridx = input_height < kMaxHeight ? input_height : kMaxHeight; auto& dev_ctx = ctx.cuda_device_context(); - switch (GetDesiredBlockDim(input_width)) { FIXED_BLOCK_DIM( KeMatrixTopK<<>>( - output_data, output->dims()[1], indices_data, input_data, - input_width, input_width, static_cast(k), gridx, - input_height)); + output_data, k, indices_data, input_data, input_width, + input_width, static_cast(k), gridx, input_height)); default: PADDLE_THROW("Error"); } diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 054dd481994d03f71b0ed5dc73e103085f6c91aa..76ece57b39919148da04caecaa43ea9d2b9d95df 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -34,7 +34,6 @@ class TopkKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // Get the top k elements of each row of input tensor - // FIXME: only deal with matrix(2d tensor). auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); @@ -44,8 +43,6 @@ class TopkKernel : public framework::OpKernel { T* output_data = output->mutable_data(ctx.GetPlace()); int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - auto eg_input = EigenMatrix::From(*input); - // reshape input to a flattern matrix(like flat_inner_dims) framework::DDim inputdims = input->dims(); const size_t row = framework::product( @@ -53,7 +50,7 @@ class TopkKernel : public framework::OpKernel { const size_t col = inputdims[inputdims.size() - 1]; Eigen::DSizes flat2dims(row, col); // NOTE: eigen shape doesn't affect paddle tensor. - eg_input.reshape(flat2dims); + auto eg_input = EigenMatrix::Reshape(*input, inputdims.size() - 1); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index b94b59631a3d8999f569acf1027c71d3019f5c56..ece22d0b7ed4cac6618c7be14939c770bcf1176d 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -116,8 +116,8 @@ def rpn_target_assign(bbox_pred, Returns: tuple: A tuple(predicted_scores, predicted_location, target_label, - target_bbox) is returned. The predicted_scores and - predicted_location is the predicted result of the RPN. + target_bbox, bbox_inside_weight) is returned. The predicted_scores + and predicted_location is the predicted result of the RPN. The target_label and target_bbox is the ground truth, respectively. The predicted_location is a 2D Tensor with shape [F, 4], and the shape of target_bbox is same as the shape of @@ -126,6 +126,8 @@ def rpn_target_assign(bbox_pred, [F + B, 1], and the shape of target_label is same as the shape of the predicted_scores, B is the number of the background anchors, the F and B is depends on the input of this operator. + Bbox_inside_weight represents whether the predicted loc is fake_fg + or not and the shape is [F, 4]. Examples: .. code-block:: python @@ -138,7 +140,7 @@ def rpn_target_assign(bbox_pred, append_batch_size=False, dtype='float32') gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], append_batch_size=False, dtype='float32') - loc_pred, score_pred, loc_target, score_target = + loc_pred, score_pred, loc_target, score_target, bbox_inside_weight = fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, cls_logits=cls_logits, anchor_box=anchor_box, @@ -152,6 +154,8 @@ def rpn_target_assign(bbox_pred, target_label = helper.create_variable_for_type_inference(dtype='int32') target_bbox = helper.create_variable_for_type_inference( dtype=anchor_box.dtype) + bbox_inside_weight = helper.create_variable_for_type_inference( + dtype=anchor_box.dtype) helper.append_op( type="rpn_target_assign", inputs={ @@ -164,7 +168,8 @@ def rpn_target_assign(bbox_pred, 'LocationIndex': loc_index, 'ScoreIndex': score_index, 'TargetLabel': target_label, - 'TargetBBox': target_bbox + 'TargetBBox': target_bbox, + 'BBoxInsideWeight': bbox_inside_weight }, attrs={ 'rpn_batch_size_per_im': rpn_batch_size_per_im, @@ -179,13 +184,14 @@ def rpn_target_assign(bbox_pred, score_index.stop_gradient = True target_label.stop_gradient = True target_bbox.stop_gradient = True + bbox_inside_weight.stop_gradient = True cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1)) bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) predicted_cls_logits = nn.gather(cls_logits, score_index) predicted_bbox_pred = nn.gather(bbox_pred, loc_index) - return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox + return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox, bbox_inside_weight def detection_output(loc, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 56129641ce5900d82aedf243d2fa1eadfd6b8d86..28dc7519571d8b5464e92fddf634ba46691ceaa9 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -301,7 +301,7 @@ class TestRpnTargetAssign(unittest.TestCase): dtype='float32', lod_level=1, append_batch_size=False) - pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign( + pred_scores, pred_loc, tgt_lbl, tgt_bbox, bbox_inside_weight = layers.rpn_target_assign( bbox_pred=bbox_pred, cls_logits=cls_logits, anchor_box=anchor_box, @@ -313,15 +313,18 @@ class TestRpnTargetAssign(unittest.TestCase): rpn_straddle_thresh=0.0, rpn_fg_fraction=0.5, rpn_positive_overlap=0.7, - rpn_negative_overlap=0.3) + rpn_negative_overlap=0.3, + use_random=False) self.assertIsNotNone(pred_scores) self.assertIsNotNone(pred_loc) self.assertIsNotNone(tgt_lbl) self.assertIsNotNone(tgt_bbox) + self.assertIsNotNone(bbox_inside_weight) assert pred_scores.shape[1] == 1 assert pred_loc.shape[1] == 4 assert pred_loc.shape[1] == tgt_bbox.shape[1] + print(str(program)) class TestGenerateProposals(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 7de0ebce06e9de439d3570bee9ac7dbce33ee868..68e498c6e82c6d2b389e2e08355b63db9de0bd14 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -78,9 +78,9 @@ if(WITH_DISTRIBUTE) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) - # TODO: fix this test - #py_test_modules(test_dist_transformer MODULES test_dist_transformer) - #set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) + + py_test_modules(test_dist_transformer MODULES test_dist_transformer) + set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index a2cc57425841100a2b61279d1b447b88ed4b9a54..ab44954811562b8f74e368a551e855948f90af87 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -35,7 +35,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid import core -from test_dist_base import TestDistRunnerBase, runtime_main +from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP import paddle.compat as cpt from paddle.compat import long_type @@ -562,18 +562,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, for pass_id in six.moves.xrange(TrainTaskConfig.pass_num): pass_start_time = time.time() for batch_id, data in enumerate(train_data()): - if batch_id >= 5: + if batch_id >= RUN_STEP: break feed_list = [] total_num_token = 0 - #if TrainTaskConfig.local: - # lr_rate = lr_scheduler.update_learning_rate() - #for place_id, data_buffer in enumerate( - # split_data( - # data, num_part=dev_count)): - if TrainTaskConfig.local: lr_rate = lr_scheduler.update_learning_rate() @@ -619,12 +613,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, init = True # Validate and save the model for inference. - if batch_id == 0 or batch_id == 4: - if TrainTaskConfig.val_file_pattern is not None: - val_avg_cost, val_ppl = test() - print("[%f]" % val_avg_cost) - else: - assert (False) + if TrainTaskConfig.val_file_pattern is not None: + val_avg_cost, val_ppl = test() + print("[%f]" % val_avg_cost) + else: + assert (False) #import transformer_reader as reader @@ -1701,7 +1694,7 @@ class DistTransformer2x2(TestDistRunnerBase): def run_trainer(self, args): TrainTaskConfig.use_gpu = args.use_cuda - sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( + sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model( args.is_dist, not args.sync_mode) if args.is_dist: diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index f65dd7e2a28c4ace3988c0cc1267ebe981fbd9cb..94b66a40233be4378e1a003f01d9375d00794743 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -40,7 +40,8 @@ class TestDistMnistAsync(TestDistBase): self._sync_mode = False self._use_reduce = False - def test_dist_train(self): + # FIXME(typhoonzero): fix async mode test later + def no_test_dist_train(self): self.check_with_place("dist_mnist.py", delta=200) diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c0989ca709e100d8f147a08970b0e858c81ce09b..c1e60dc9e420d11677468e0c62357437ecdf9e35 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -40,7 +40,8 @@ class TestDistSeResneXt2x2Async(TestDistBase): self._sync_mode = False self._use_reader_alloc = False - def test_dist_train(self): + #FIXME(typhoonzero): fix async mode later + def no_test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py index a0b6879f99e80a9710ee76f981769299a066b85b..e1e6ef61090dfb439a3b43c4baf5ba88f61310ba 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -42,7 +42,8 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): self._sync_mode = False self._enforce_place = "CPU" - def test_simnet_bow(self): + #FIXME(typhoonzero): fix async tests later + def no_test_simnet_bow(self): need_envs = { "IS_DISTRIBUTED": '0', "IS_SPARSE": '0', @@ -78,7 +79,8 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): self._sync_mode = False self._enforce_place = "CPU" - def test_simnet_bow(self): + #FIXME(typhoonzero): fix async tests later + def no_test_simnet_bow(self): need_envs = { "IS_DISTRIBUTED": '0', "IS_SPARSE": '1', diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py index 47e8dfaf03ceb27a74f5e48d662d2b534d2d152b..25dcccc28d710695d4c5e08c17816669d0fae5d8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -61,7 +61,8 @@ class TestDistTransformer2x2Sync(TestDistBase): def test_dist_train(self): download_files() - self.check_with_place("dist_transformer.py", delta=1e-5) + self.check_with_place( + "dist_transformer.py", delta=1e-5, check_error_log=False) class TestDistTransformer2x2Async(TestDistBase): @@ -70,7 +71,8 @@ class TestDistTransformer2x2Async(TestDistBase): def test_dist_train(self): download_files() - self.check_with_place("dist_transformer.py", delta=1.0) + self.check_with_place( + "dist_transformer.py", delta=1.0, check_error_log=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py index 36ebc8fb6ea9efdcd1807f5c8917ab1428b3381e..377454e7802e40f90c371987adfe50cce922c764 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py @@ -125,6 +125,12 @@ class TestFusionGRUOpMD2(TestFusionGRUOp): self.D = 8 +class TestFusionGRUOpMD3(TestFusionGRUOp): + def set_confs(self): + self.M = 17 + self.D = 15 + + class TestFusionGRUOpBS1(TestFusionGRUOp): def set_confs(self): self.lod = [[3]] diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py index f63dbcd3d7f6bfce3ccc1c42ae41afe42bfad003..1a2c9bb5f43d55d8e6183de0d55bfcc2b9ac3f08 100644 --- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py @@ -50,8 +50,10 @@ def rpn_target_assign(anchor_by_gt_overlap, fg_inds, size=(len(fg_inds) - num_fg), replace=False) else: disable_inds = fg_inds[num_fg:] + labels[disable_inds] = -1 fg_inds = np.where(labels == 1)[0] + bbox_inside_weight = np.zeros((len(fg_inds), 4), dtype=np.float32) num_bg = rpn_batch_size_per_im - np.sum(labels == 1) bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] @@ -59,18 +61,27 @@ def rpn_target_assign(anchor_by_gt_overlap, enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] else: enable_inds = bg_inds[:num_bg] + + fg_fake_inds = np.array([], np.int32) + fg_value = np.array([fg_inds[0]], np.int32) + fake_num = 0 + for bg_id in enable_inds: + if bg_id in fg_inds: + fake_num += 1 + fg_fake_inds = np.hstack([fg_fake_inds, fg_value]) labels[enable_inds] = 0 + + bbox_inside_weight[fake_num:, :] = 1 fg_inds = np.where(labels == 1)[0] bg_inds = np.where(labels == 0)[0] - - loc_index = fg_inds - score_index = np.hstack((fg_inds, bg_inds)) + loc_index = np.hstack([fg_fake_inds, fg_inds]) + score_index = np.hstack([fg_inds, bg_inds]) labels = labels[score_index] assert not np.any(labels == -1), "Wrong labels with -1" - gt_inds = anchor_to_gt_argmax[fg_inds] + gt_inds = anchor_to_gt_argmax[loc_index] - return loc_index, score_index, labels, gt_inds + return loc_index, score_index, labels, gt_inds, bbox_inside_weight def get_anchor(n, c, h, w): @@ -123,9 +134,12 @@ def rpn_target_assign_in_python(all_anchors, gt_boxes_slice = gt_boxes_slice[not_crowd_inds] iou = _bbox_overlaps(inside_anchors, gt_boxes_slice) - loc_inds, score_inds, labels, gt_inds = rpn_target_assign( - iou, rpn_batch_size_per_im, rpn_positive_overlap, - rpn_negative_overlap, rpn_fg_fraction, use_random) + loc_inds, score_inds, labels, gt_inds, bbox_inside_weight = \ + rpn_target_assign(iou, rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random) # unmap to all anchor loc_inds = inds_inside[loc_inds] score_inds = inds_inside[score_inds] @@ -139,6 +153,7 @@ def rpn_target_assign_in_python(all_anchors, score_indexes = score_inds tgt_labels = labels tgt_bboxes = box_deltas + bbox_inside_weights = bbox_inside_weight else: loc_indexes = np.concatenate( [loc_indexes, loc_inds + i * anchor_num]) @@ -146,8 +161,10 @@ def rpn_target_assign_in_python(all_anchors, [score_indexes, score_inds + i * anchor_num]) tgt_labels = np.concatenate([tgt_labels, labels]) tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) + bbox_inside_weights = np.vstack([bbox_inside_weights, \ + bbox_inside_weight]) - return loc_indexes, score_indexes, tgt_bboxes, tgt_labels + return loc_indexes, score_indexes, tgt_bboxes, tgt_labels, bbox_inside_weights class TestRpnTargetAssignOp(OpTest): @@ -182,10 +199,12 @@ class TestRpnTargetAssignOp(OpTest): rpn_fg_fraction = 0.5 use_random = False - loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python( - all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh, - rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap, - rpn_fg_fraction, use_random) + loc_index, score_index, tgt_bbox, labels, bbox_inside_weights = \ + rpn_target_assign_in_python(all_anchors, gt_boxes, is_crowd, + im_info, lod, rpn_straddle_thresh, + rpn_batch_size_per_im, rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, use_random) labels = labels[:, np.newaxis] self.op_type = "rpn_target_assign" @@ -207,7 +226,8 @@ class TestRpnTargetAssignOp(OpTest): 'LocationIndex': loc_index.astype('int32'), 'ScoreIndex': score_index.astype('int32'), 'TargetBBox': tgt_bbox.astype('float32'), - 'TargetLabel': labels.astype('int32') + 'TargetLabel': labels.astype('int32'), + 'BBoxInsideWeight': bbox_inside_weights.astype('float32') } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py index e54e170f7f1e03db4b63db72edb7395d18130f68..69b29db83a43d18c0825b610642009a0377b9901 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py @@ -21,22 +21,27 @@ from op_test import OpTest class TestTopkOp(OpTest): def setUp(self): + self.set_args() self.op_type = "top_k" - k = 1 - input = np.random.random((32, 84)).astype("float32") - output = np.ndarray((32, k)) - indices = np.ndarray((32, k)).astype("int64") + k = self.top_k + input = np.random.random((self.row, k)).astype("float32") + output = np.ndarray((self.row, k)) + indices = np.ndarray((self.row, k)).astype("int64") self.inputs = {'X': input} self.attrs = {'k': k} - for rowid in range(32): + for rowid in range(self.row): row = input[rowid] - output[rowid] = np.sort(row)[-k:] - indices[rowid] = row.argsort()[-k:] + output[rowid] = np.sort(row)[::-1][:k] + indices[rowid] = row.argsort()[::-1][:k] self.outputs = {'Out': output, 'Indices': indices} + def set_args(self): + self.row = 32 + self.top_k = 1 + def test_check_output(self): self.check_output() @@ -50,14 +55,39 @@ class TestTopkOp3d(OpTest): output = np.ndarray((64, k)) indices = np.ndarray((64, k)).astype("int64") - # FIXME: should use 'X': input for a 3d input - self.inputs = {'X': input_flat_2d} + self.inputs = {'X': input} self.attrs = {'k': k} for rowid in range(64): row = input_flat_2d[rowid] - output[rowid] = np.sort(row)[-k:] - indices[rowid] = row.argsort()[-k:] + output[rowid] = np.sort(row)[::-1][:k] + indices[rowid] = row.argsort()[::-1][:k] + + self.outputs = { + 'Out': output.reshape((32, 2, k)), + 'Indices': indices.reshape((32, 2, k)) + } + + def test_check_output(self): + self.check_output() + + +class TestTopkOp2(OpTest): + def setUp(self): + self.op_type = "top_k" + k = 1 + m = 2056 + input = np.random.random((m, 84)).astype("float32") + output = np.ndarray((m, k)) + indices = np.ndarray((m, k)).astype("int64") + + self.inputs = {'X': input} + self.attrs = {'k': k} + + for rowid in range(m): + row = input[rowid] + output[rowid] = -np.sort(-row)[:k] + indices[rowid] = (-row).argsort()[:k] self.outputs = {'Out': output, 'Indices': indices} @@ -65,5 +95,17 @@ class TestTopkOp3d(OpTest): self.check_output() +class TestTopkOp3(TestTopkOp): + def set_args(self): + self.row = 2056 + self.top_k = 3 + + +class TestTopkOp4(TestTopkOp): + def set_args(self): + self.row = 40000 + self.top_k = 1 + + if __name__ == "__main__": unittest.main()