From 838ef366dc6349ff25f88b6f69d2815c21ea8261 Mon Sep 17 00:00:00 2001
From: xutianbing <xutianbing@baidu.com>
Date: Sat, 24 Dec 2016 19:21:09 -0800
Subject: [PATCH] add first paddle function example for
 ContextProjectionForward operator, by going through Daoyuan's excellent
 paddle function design.

---
 paddle/function/CMakeLists.txt                |   4 +
 paddle/function/Function.cpp                  |  28 ++++
 paddle/function/Function.h                    |   2 +
 paddle/function/FunctionTest.h                |  44 ++++--
 paddle/function/context_projection_op.cpp     | 136 +++++++++++++++++
 paddle/function/context_projection_op.h       |  43 ++++++
 paddle/function/context_projection_op_gpu.cu  | 137 ++++++++++++++++++
 .../function/context_projection_op_test.cpp   | 101 +++++++++++++
 8 files changed, 479 insertions(+), 16 deletions(-)
 create mode 100644 paddle/function/context_projection_op.cpp
 create mode 100644 paddle/function/context_projection_op.h
 create mode 100644 paddle/function/context_projection_op_gpu.cu
 create mode 100644 paddle/function/context_projection_op_test.cpp
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index b0e6f92f1e..f70ae9959b 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -17,6 +17,10 @@ if(WITH_TESTING)
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
     add_simple_unittest(CrossMapNormalOpTest)
+    add_unittest(ContextProjectionOpTest
+        ContextProjectionOpTest.cpp
+        ContextProjectionOpGpu.cu
+        ../gserver/tests/TestUtil.cpp)
 endif()
 endif()
 
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index eb005e6744..215b3dbd83 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -30,6 +30,20 @@ real FuncConfig::get<real>(const std::string& key) const {
   return it->second.r;
 }
 
+template <>
+int FuncConfig::get<int>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.i;
+}
+
+template <>
+bool FuncConfig::get<bool>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.b;
+}
+
 template <>
 FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
   CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
@@ -44,6 +58,20 @@ FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
   return *this;
 }
 
+template <>
+FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
+  CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
+  valueMap_[key].i = v;
+  return *this;
+}
+
+template <>
+FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
+  CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
+  valueMap_[key].b = v;
+  return *this;
+}
+
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
 
 }  // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 095584c0b1..210eba1301 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -59,6 +59,8 @@ public:
   union value {
     size_t s;
     real r;
+    int i;
+    bool b;
   };
 
   template <typename T>
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index a602bde57e..32131037f6 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -33,25 +33,33 @@ public:
     // init cpu and gpu arguments
     auto initArgs = [=](
         Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (auto arg : inArgs) {
+      for (const auto arg : inArgs) {
         size_t size = sizeof(real);
-        for (auto dim : arg.dims_) {
+        for (const auto dim : arg.dims_) {
           size *= dim;
         }
-        cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-        gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-        cpuArgs.emplace_back(
-            Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-        gpuArgs.emplace_back(
-            Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-
-        // will use an api to refactor this code.
-        CpuVector cpuVector(size / sizeof(real),
-                            (real*)cpuArgs.back().getData());
-        GpuVector gpuVector(size / sizeof(real),
-                            (real*)gpuArgs.back().getData());
-        cpuVector.uniform(0.001, 1);
-        gpuVector.copyFrom(cpuVector);
+        if (arg.getData()) {
+          // todo(tianbing), waste unnecessary mem here
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          // already init outside
+        } else {
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          cpuArgs.emplace_back(
+              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
+          gpuArgs.emplace_back(
+              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
+          // will use an api to refactor this code.
+          CpuVector cpuVector(size / sizeof(real),
+                              (real*)cpuArgs.back().getData());
+          GpuVector gpuVector(size / sizeof(real),
+                              (real*)gpuArgs.back().getData());
+          cpuVector.uniform(0.001, 1);
+          gpuVector.copyFrom(cpuVector);
+        }
       }
     };
     initArgs(cpuInputs, gpuInputs, inputs);
@@ -81,6 +89,10 @@ public:
     checkArgs(cpuInouts, gpuInouts);
   }
 
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+
 protected:
   std::shared_ptr<FunctionBase> cpu;
   std::shared_ptr<FunctionBase> gpu;
diff --git a/paddle/function/context_projection_op.cpp b/paddle/function/context_projection_op.cpp
new file mode 100644
index 0000000000..75c41eed14
--- /dev/null
+++ b/paddle/function/context_projection_op.cpp
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "context_projection_op.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void ContextProjectionForward<DEVICE_TYPE_CPU>(Tensor& output,
+                                               const Tensor& input,
+                                               const Tensor& weight,
+                                               const Tensor& sequence,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad,
+                                               bool is_padding) {
+  CHECK(output.getData() && input.getData() && sequence.getData());
+  CHECK_EQ(output.dims_.size(), 2);
+  CHECK_EQ(input.dims_.size(), 2);
+  CHECK_EQ(weight.dims_.size(), 2);
+  CHECK_EQ(sequence.dims_.size(), 1);
+
+  auto out_mat = std::make_shared<CpuMatrix>(
+      output.getData(), output.dims_[0], output.dims_[1]);
+  const auto in_mat = std::make_shared<CpuMatrix>(
+      input.getData(), input.dims_[0], input.dims_[1]);
+  const auto weight_mat =
+      !weight.getData()
+          ? nullptr
+          : std::make_shared<CpuMatrix>(
+                weight.getData(), weight.dims_[0], input.dims_[1]);
+  CpuIVector seq_vec(sequence.dims_[0],
+                     reinterpret_cast<int*>(sequence.getData()));
+  CHECK_EQ(out_mat->getWidth(), in_mat->getWidth() * context_length);
+
+  const int* starts = seq_vec.getData();
+  const size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
+        if (is_padding && weight_mat) {
+          MatrixPtr sub = weight_mat->subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
+        if (is_padding && weight_mat) {
+          MatrixPtr sub = weight_mat->subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;
+      MatrixPtr src = in_mat->subMatrix(begin, end - begin);
+      MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * in_mat->getWidth());
+    }
+  }
+}
+
+/**
+ * \param inputs[0] input value.
+ * \param inputs[1] input weight.
+ * \param inputs[2] input sequence.
+ * \param outputs[0] output value.
+ */
+template <DeviceType Device>
+class ContextProjectionForwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    is_padding_ = config.get<bool>("is_padding");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(3, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());
+
+    ContextProjectionForward<Device>((Tensor&)outputs[0],
+                                     inputs[0],
+                                     inputs[1],
+                                     inputs[2],
+                                     context_length_,
+                                     context_start_,
+                                     begin_pad_,
+                                     is_padding_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  bool is_padding_;
+};
+
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    CPU,
+                    ContextProjectionForwardFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    GPU,
+                    ContextProjectionForwardFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/function/context_projection_op.h b/paddle/function/context_projection_op.h
new file mode 100644
index 0000000000..bdc5071bc6
--- /dev/null
+++ b/paddle/function/context_projection_op.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Context Projection Forward.
+ *
+ * \param[out]  outputs           output data.
+ * \param[in]   input             input data.
+ * \param[in]   weight            input weight.
+ * \param[in]   sequence          input data.
+ * \param[in]   context_length     consecutive rows for concatenation.
+ * \param[in]   begin_pad          context start position.
+ * \param[in]   is_padding         whether padding 0 or not.
+ *
+ */
+template <DeviceType Device>
+void ContextProjectionForward(Tensor& output,
+                              const Tensor& input,
+                              const Tensor& weight,
+                              const Tensor& sequence,
+                              size_t context_length,
+                              int context_start,
+                              size_t begin_pad,
+                              bool is_padding);
+
+}  // namespace paddle
diff --git a/paddle/function/context_projection_op_gpu.cu b/paddle/function/context_projection_op_gpu.cu
new file mode 100644
index 0000000000..4e7958164b
--- /dev/null
+++ b/paddle/function/context_projection_op_gpu.cu
@@ -0,0 +1,137 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "context_projection_op.h"
+
+namespace paddle {
+
+template <bool padding>
+__global__ void KeContextProjectionForward(const real* input,
+                                           const int* sequence,
+                                           const real* weight,
+                                           real* output,
+                                           int input_dim,
+                                           int context_length,
+                                           int context_start,
+                                           int begin_pad) {
+  int idx = threadIdx.x;
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+
+  int instances = seq_end - seq_start + context_length - 1;
+  output += seq_start * input_dim * context_length;
+  input += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        // i + context_start;
+        if ((i + context_start) < 0) {
+          if (padding) {
+            value = weight[i * input_dim + idx];
+          } else {
+            continue;
+          }
+        } else if ((i + context_start) >= (seq_end - seq_start)) {
+          if (padding) {
+            value =
+              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
+                         input_dim + idx];
+          } else {
+            continue;
+          }
+        } else {
+          value = input[(i + context_start) * input_dim + idx];
+        }
+
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          output + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          output_r[idx] += value;
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;
+        }
+      }
+    }
+    idx += block_size;
+  }
+}
+
+void hl_context_projection_forward(const real* input,
+                                   const int* sequence,
+                                   real* weight,
+                                   real* output,
+                                   int num_sequences,
+                                   int input_dim,
+                                   int context_length,
+                                   int context_start,
+                                   int begin_pad,
+                                   bool is_padding) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(output);
+  CHECK(!is_padding || weight);
+
+  int block_size = 128;
+  int blocks_x = num_sequences;
+  int blocks_y = 1;
+  dim3 threads(block_size, 1);
+  dim3 grid(blocks_x, blocks_y);
+
+  if (is_padding) {
+    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  } else  {
+    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  }
+  CHECK_SYNC("hl_context_projection_forward failed");
+}
+
+template <>
+void ContextProjectionForward<DEVICE_TYPE_GPU>(Tensor& output,
+                                               const Tensor& input,
+                                               const Tensor& weight,
+                                               const Tensor& sequence,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad,
+                                               bool is_padding) {
+  CHECK(output.getData() && input.getData() && sequence.getData());
+  CHECK_EQ(output.dims_.size(), 2);
+  CHECK_EQ(input.dims_.size(), 2);
+  CHECK_EQ(weight.dims_.size(), 2);
+  CHECK_EQ(sequence.dims_.size(), 1);
+  CHECK_EQ(output.dims_[1], input.dims_[1] * context_length);
+
+  hl_context_projection_forward(input.getData(),
+                                reinterpret_cast<int*>(sequence.getData()),
+                                weight.getData(),
+                                output.getData(),
+                                sequence.dims_[0] - 1,
+                                input.dims_[1],
+                                context_length,
+                                context_start,
+                                begin_pad,
+                                is_padding);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/context_projection_op_test.cpp b/paddle/function/context_projection_op_test.cpp
new file mode 100644
index 0000000000..98784471ae
--- /dev/null
+++ b/paddle/function/context_projection_op_test.cpp
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/math/Matrix.h"
+
+using namespace paddle;  // NOLINT
+
+void testMatrixProjectionForward(int context_start,
+                                 size_t context_length,
+                                 bool is_padding,
+                                 size_t batch_size,
+                                 size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  FunctionCompare compare("ContextProjectionForward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", std::max(0, -context_start))
+                              .set("is_padding", is_padding));
+
+  CpuMatrix cpu_in(batch_size, input_dim);
+  cpu_in.randomizeUniform();
+  GpuMatrix gpu_in(batch_size, input_dim);
+  gpu_in.copyFrom(cpu_in);
+  auto cpu_weight =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_weight =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_weight->randomizeUniform();
+    gpu_weight->copyFrom(*cpu_weight);
+  }
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+
+  CpuMatrix cpu_out(batch_size, input_dim * context_length);
+  GpuMatrix gpu_out(batch_size, input_dim * context_length);
+  cpu_out.randomizeUniform();
+  gpu_out.copyFrom(cpu_out);
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+
+  autotest::TensorCheckEqual(cpu_out, gpu_out);
+}
+
+TEST(ContextProjectionForward, projection) {
+  for (auto context_start : {-5, -3, -1, 0, 3}) {
+    for (auto context_length : {1, 2, 5, 7}) {
+      for (auto trainable_padding : {false, true}) {
+        for (auto batch_size : {1, 2, 5, 20, 100}) {
+          for (auto input_dim : {15, 32, 63, 128, 200}) {
+            VLOG(3) << " context_start=" << context_start
+                    << " context_length=" << context_length
+                    << " trainable_padding=" << trainable_padding
+                    << " batch_size=" << batch_size
+                    << " input_dim=" << input_dim;
+            testMatrixProjectionForward(context_start,
+                                        context_length,
+                                        trainable_padding,
+                                        batch_size,
+                                        input_dim);
+          }
+        }
+      }
+    }
+  }
+}
-- 
GitLab