Commit fe177b62 authored by sweetsky0901

test detection_output CPU and GPU OK, but the doc still needs to be modified

Parent 9e72cc5c
@@ -65,17 +65,18 @@ class Detection_output_Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
+    PADDLE_ENFORCE(ctx->HasInput("Loc"),
+                   "Input(X) of Detection_output_Op"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Conf"),
+                   "Input(X) of Detection_output_Op"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
                    "Input(X) of Detection_output_Op"
                    "should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of Detection_output_Op should not be null.");
-    auto in_x_dims = ctx->GetInputDim("X");
-    int pyramid_height = ctx->Attrs().Get<int>("pyramid_height");
-    PADDLE_ENFORCE(in_x_dims.size() == 4,
-                   "Detection_output_ing intput must be of 4-dimensional.");
-    int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1];
-    std::vector<int64_t> output_shape({in_x_dims[0], outlen});
+    std::vector<int64_t> output_shape({1, 7});
     ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
   }
 };
...
@@ -40,6 +40,9 @@ class Detection_output_Kernel : public framework::OpKernel<T> {
     int input_num = in_loc->dims()[0];
     int batch_size = in_loc->dims()[1];
+    int channels = in_loc->dims()[2];
+    int height = in_loc->dims()[3];
+    int weight = in_loc->dims()[4];
     int loc_sum_size = in_loc->numel();
     int conf_sum_size = in_conf->numel();
     std::vector<int64_t> loc_shape_vec({1, loc_sum_size});
@@ -49,17 +52,62 @@ class Detection_output_Kernel : public framework::OpKernel<T> {
     framework::DDim conf_shape(framework::make_ddim(conf_shape_vec));
     framework::Tensor loc_tensor;
     framework::Tensor conf_tensor;
+    loc_tensor.Resize(loc_shape);
+    conf_tensor.Resize(conf_shape);
     loc_tensor.mutable_data<T>(loc_shape, context.GetPlace());
     conf_tensor.mutable_data<T>(conf_shape, context.GetPlace());
-    // KNCHW ==> NHWC
+    framework::Tensor loc_cpu;
+    framework::Tensor conf_cpu;
+    framework::Tensor priorbox_cpu;
+    const T* in_loc_data = in_loc->data<T>();
+    const T* in_conf_data = in_conf->data<T>();
+    T* loc_data;
+    T* conf_data;
+    const T* priorbox_data = in_priorbox->data<T>();
+    if (platform::is_gpu_place(context.GetPlace())) {
+      loc_cpu.mutable_data<T>(in_loc->dims(), platform::CPUPlace());
+      framework::CopyFrom(*in_loc, platform::CPUPlace(),
+                          context.device_context(), &loc_cpu);
+      in_loc_data = loc_cpu.data<T>();
+      conf_cpu.mutable_data<T>(in_conf->dims(), platform::CPUPlace());
+      framework::CopyFrom(*in_conf, platform::CPUPlace(),
+                          context.device_context(), &conf_cpu);
+      in_conf_data = conf_cpu.data<T>();
+      priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
+      framework::CopyFrom(*in_priorbox, platform::CPUPlace(),
+                          context.device_context(), &priorbox_cpu);
+      priorbox_data = priorbox_cpu.data<T>();
+      loc_tensor.mutable_data<T>(loc_shape, platform::CPUPlace());
+      conf_tensor.mutable_data<T>(conf_shape, platform::CPUPlace());
+    }
+    T* loc_tensor_data = loc_tensor.data<T>();
+    T* conf_tensor_data = conf_tensor.data<T>();
     for (int i = 0; i < input_num; ++i) {
-      math::appendWithPermute<T>(*in_loc, &loc_tensor);
-      math::appendWithPermute<T>(*in_conf, &conf_tensor);
+      math::appendWithPermute<T>(in_loc_data, input_num, batch_size, channels,
+                                 height, weight, loc_tensor_data);
+      math::appendWithPermute<T>(in_conf_data, input_num, batch_size, channels,
+                                 height, weight, conf_tensor_data);
     }
-    // softmax
-    math::SoftmaxFunctor<Place, T>()(context.device_context(), &conf_tensor,
-                                     &conf_tensor);
+    loc_data = loc_tensor.data<T>();
+    if (platform::is_gpu_place(context.GetPlace())) {
+      framework::Tensor conf_gpu;
+      conf_gpu.Resize(conf_shape);
+      conf_gpu.mutable_data<T>(conf_shape, context.GetPlace());
+      framework::CopyFrom(conf_tensor, platform::GPUPlace(),
+                          context.device_context(), &conf_gpu);
+      // softmax
+      math::SoftmaxFunctor<Place, T>()(context.device_context(), &conf_gpu,
+                                       &conf_gpu);
+      conf_tensor.mutable_data<T>(conf_gpu.dims(), platform::CPUPlace());
+      framework::CopyFrom(conf_gpu, platform::CPUPlace(),
+                          context.device_context(), &conf_tensor);
+    } else {
+      // softmax
+      math::SoftmaxFunctor<Place, T>()(context.device_context(), &conf_tensor,
+                                       &conf_tensor);
+    }
+    conf_data = conf_tensor.data<T>();
     // get decode bboxes
     size_t num_priors = in_priorbox->numel() / 8;
     std::vector<std::vector<operators::math::BBox<T>>> all_decoded_bboxes;
@@ -69,29 +117,26 @@ class Detection_output_Kernel : public framework::OpKernel<T> {
         size_t prior_offset = i * 8;
         size_t loc_pred_offset = n * num_priors * 4 + i * 4;
         std::vector<math::BBox<T>> prior_bbox_vec;
-        math::getBBoxFromPriorData<T>(in_priorbox->data<T>() + prior_offset, 1,
+        math::getBBoxFromPriorData<T>(priorbox_data + prior_offset, 1,
                                       prior_bbox_vec);
         std::vector<std::vector<T>> prior_bbox_var;
-        math::getBBoxVarFromPriorData<T>(in_priorbox->data<T>() + prior_offset,
-                                         1, prior_bbox_var);
+        math::getBBoxVarFromPriorData<T>(priorbox_data + prior_offset, 1,
+                                         prior_bbox_var);
         std::vector<T> loc_pred_data;
         for (size_t j = 0; j < 4; ++j)
-          loc_pred_data.push_back(
-              *(loc_tensor.data<T>() + loc_pred_offset + j));
+          loc_pred_data.push_back(*(loc_data + loc_pred_offset + j));
         math::BBox<T> bbox = math::decodeBBoxWithVar<T>(
             prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data);
         decoded_bboxes.push_back(bbox);
       }
       all_decoded_bboxes.push_back(decoded_bboxes);
     }
     std::vector<std::map<size_t, std::vector<size_t>>> all_indices;
     int num_kept = math::getDetectionIndices<T>(
-        conf_tensor.data<T>(), num_priors, num_classes, background_label_id,
-        batch_size, confidence_threshold, nms_top_k, nms_threshold, top_k,
+        conf_data, num_priors, num_classes, background_label_id, batch_size,
+        confidence_threshold, nms_top_k, nms_threshold, top_k,
         all_decoded_bboxes, &all_indices);
-    framework::Tensor out_tmp;
     if (num_kept <= 0) {
       std::vector<int64_t> out_shape_vec({0, 0});
       framework::DDim out_shape(framework::make_ddim(out_shape_vec));
@@ -100,14 +145,20 @@ class Detection_output_Kernel : public framework::OpKernel<T> {
     }
     std::vector<int64_t> out_shape_vec({num_kept, 7});
     framework::DDim out_shape(framework::make_ddim(out_shape_vec));
-    out_tmp.mutable_data<T>(out_shape, context.GetPlace());
-    T* out_data = out_tmp.data<T>();
-    math::getDetectionOutput<T>(conf_tensor.data<T>(), num_kept, num_priors,
-                                num_classes, batch_size, all_indices,
-                                all_decoded_bboxes, out_data);
     out->mutable_data<T>(out_shape, context.GetPlace());
-    out->ShareDataWith(out_tmp);
+    framework::Tensor out_cpu;
+    T* out_data = out->data<T>();
+    if (platform::is_gpu_place(context.GetPlace())) {
+      out_cpu.mutable_data<T>(out->dims(), platform::CPUPlace());
+      out_data = out_cpu.data<T>();
+    }
+    math::getDetectionOutput<T>(conf_data, num_kept, num_priors, num_classes,
+                                batch_size, all_indices, all_decoded_bboxes,
+                                out_data);
+    if (platform::is_gpu_place(context.GetPlace())) {
+      framework::CopyFrom(out_cpu, platform::GPUPlace(),
+                          context.device_context(), out);
+    }
   }
 };
 }  // namespace operators
...
@@ -50,27 +50,23 @@ struct BBox {
 };
 // KNCHW ==> NHWC
 template <typename T>
-int appendWithPermute(const framework::Tensor& input,
-                      framework::Tensor* output) {
-  const int input_nums = input.dims()[0];
-  const int batch_size = input.dims()[1];
-  const int channels = input.dims()[2];
-  const int height = input.dims()[3];
-  const int weight = input.dims()[4];
+int appendWithPermute(const T* input_data, int input_nums, int batch_size,
+                      int channels, int height, int weight, T* output_data) {
   int image_size = height * weight;
+  int numel = input_nums * batch_size * channels * height * weight;
   int offset = 0;
   for (int p = 0; p < input_nums; ++p) {
     int in_p_offset = p * batch_size * channels * image_size;
     for (int n = 0; n < batch_size; ++n) {
       int in_n_offset = n * channels * image_size;
-      int out_n_offset = n * input.numel() / batch_size + offset;
+      int out_n_offset = n * numel / batch_size + offset;
       int in_stride = image_size;
       int out_stride = channels;
-      const T* in_data = input.data<T>() + in_p_offset + in_n_offset;
-      T* out_data = output->data<T>() + out_n_offset;
-      for (int i = 0; i < channels; ++i) {
-        for (int c = 0; c < image_size; ++c) {
-          out_data[out_stride * c + i] = in_data[i * in_stride + c];
+      const T* in_data = input_data + in_p_offset + in_n_offset;
+      T* out_data = output_data + out_n_offset;
+      for (int c = 0; c < channels; ++c) {
+        for (int i = 0; i < image_size; ++i) {
+          out_data[out_stride * i + c] = in_data[c * in_stride + i];
         }
       }
     }
...
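The refactored appendWithPermute takes raw pointers plus explicit dimensions instead of Tensors, but the permutation itself is unchanged: each of the input_nums KNCHW chunks is transposed from CHW to HWC and concatenated per sample. The sketch below is not part of the patch; it is a minimal NumPy check with made-up shapes, and it assumes the elided tail of the loop advances offset by channels * image_size per chunk.

import numpy as np

# Hypothetical shapes: K stacked inputs, N samples, C channels, H x W feature map.
K, N, C, H, W = 2, 3, 4, 5, 6
x = np.random.rand(K, N, C, H, W).astype('float32')

# Reference: per sample n, concatenate the K chunks, each flattened in HWC order.
ref = np.transpose(x, (1, 0, 3, 4, 2)).reshape(N, -1)

# Element-wise re-implementation of the C++ loop for comparison.
out = np.zeros((N, K * C * H * W), dtype='float32')
image_size = H * W
for p in range(K):
    offset = p * C * image_size  # assumed per-chunk offset advance
    for n in range(N):
        in_data = x[p, n].reshape(C, image_size)
        for c in range(C):
            for i in range(image_size):
                # mirrors out_data[out_stride * i + c] = in_data[c * in_stride + i]
                out[n, offset + i * C + c] = in_data[c, i]

assert np.allclose(out, ref)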
import unittest
import numpy as np
from op_test import OpTest


class TestDetectionOutputOp(OpTest):
    def setUp(self):
        self.op_type = "detection_output"
        self.init_test_case()
        # loc = np.zeros((1, 4, 4, 1, 1))
        # conf = np.zeros((1, 4, 2, 1, 1))
        loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
                        [[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
                        [[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
                        [[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]])
        conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]]],
                         [[[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]])
        priorbox = np.array([0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,
                             0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,
                             0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,
                             0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2])
        output = np.array([0, 1, 0.68997443, 0.099959746, 0.099959746,
                           0.50804031, 0.50804031])
        self.inputs = {
            'Loc': loc.astype('float32'),
            'Conf': conf.astype('float32'),
            'PriorBox': priorbox.astype('float32')
        }
        self.attrs = {
            'num_classes': self.num_classes,
            'top_k': self.top_k,
            'nms_top_k': self.nms_top_k,
            'background_label_id': self.background_label_id,
            'nms_threshold': self.nms_threshold,
            'confidence_threshold': self.confidence_threshold,
        }
        self.outputs = {'Out': output.astype('float32')}

    def test_check_output(self):
        self.check_output()

    def init_test_case(self):
        self.num_classes = 2
        self.top_k = 10
        self.nms_top_k = 20
        self.background_label_id = 0
        self.nms_threshold = 0.01
        self.confidence_threshold = 0.01


if __name__ == '__main__':
    unittest.main()
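For reference, the expected output row follows the usual SSD detection layout of [image_id, label, score, xmin, ymin, xmax, ymax], and its numbers can be reproduced by hand: the score is the softmax of the [0.1, 0.9] confidence pair, and the box is the first prior (0.1, 0.1, 0.5, 0.5) decoded with its variances (0.1, 0.1, 0.2, 0.2) and the predicted offsets (0.1, 0.1, 0.1, 0.1). A quick NumPy check, assuming the standard decode-with-variance formula (not part of the test):

import numpy as np

# Softmax over the two class scores of the kept prior: [0.1, 0.9] -> class-1 score.
conf = np.array([0.1, 0.9])
score = np.exp(conf)[1] / np.exp(conf).sum()   # ~0.68997443

# Decode the first prior box with its variances and the predicted offsets.
prior = np.array([0.1, 0.1, 0.5, 0.5])         # xmin, ymin, xmax, ymax
var = np.array([0.1, 0.1, 0.2, 0.2])
loc = np.array([0.1, 0.1, 0.1, 0.1])

prior_w, prior_h = prior[2] - prior[0], prior[3] - prior[1]
center_x = var[0] * loc[0] * prior_w + (prior[0] + prior[2]) / 2
center_y = var[1] * loc[1] * prior_h + (prior[1] + prior[3]) / 2
w = np.exp(var[2] * loc[2]) * prior_w
h = np.exp(var[3] * loc[3]) * prior_h

box = [center_x - w / 2, center_y - h / 2, center_x + w / 2, center_y + h / 2]
print(score, box)  # ~0.68997443, [0.09995975, 0.09995975, 0.50804031, 0.50804031]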