Commit 152bd2f9 · Author: hedaoyuan

Add the GPU version implementation of ImageExpand function.

Parent 34362d93
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#include "TensorShape.h"
#include "TensorType.h"
namespace paddle {
/* The storage format of the column data (colData) used by Im2ColFunctor and Col2ImFunctor. */
......
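For reference, the kOCF format stores the column data as [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]. A small helper showing the index math (the helper itself is hypothetical; the formula matches the colOffset computation in the new GPU kernel below):

// Index of element (outH, outW, channel, fH, fW) in an OCF column buffer;
// mirrors the colOffset computation in the im2colOCF kernel.
inline int ocfIndex(int outH, int outW, int channel, int fH, int fW,
                    int outputWidth, int inputChannels,
                    int filterHeight, int filterWidth) {
  return fW + fH * filterWidth
      + channel * filterHeight * filterWidth
      + (outH * outputWidth + outW)
          * (inputChannels * filterHeight * filterWidth);
}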
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Im2Col.h"
namespace paddle {
template <class T>
__global__ void im2colOCF(const T* imData, T* colData,
                          int inputChannels,
                          int inputHeight, int inputWidth,
                          int filterHeight, int filterWidth,
                          int strideHeight, int strideWidth,
                          int paddingHeight, int paddingWidth,
                          int outputHeight, int outputWidth) {
  // threadIdx.x/y index a position inside the filter window;
  // blockIdx.x/y index the sliding window along the output width/height.
  int idx = threadIdx.x;
  int idy = threadIdx.y;
  int swId = blockIdx.x;
  int shId = blockIdx.y;
  // threadIdx.z strides over the input channels.
  for (int channelId = threadIdx.z;
       channelId < inputChannels;
       channelId += blockDim.z) {
    // Coordinates of this filter element in the (padded) input image.
    int widthOffset = idx + swId * strideWidth - paddingWidth;
    int heightOffset = idy + shId * strideHeight - paddingHeight;
    int imOffset = widthOffset + heightOffset * inputWidth
        + channelId * inputHeight * inputWidth;
    // OCF layout:
    // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth].
    int colOffset = idx + idy * filterWidth
        + channelId * filterHeight * filterWidth
        + (shId * outputWidth + swId)
            * (inputChannels * filterHeight * filterWidth);
    if (idx < filterWidth && idy < filterHeight) {
      // Positions that fall into the padding area are zero-filled.
      if (heightOffset >= inputHeight || heightOffset < 0 ||
          widthOffset >= inputWidth || widthOffset < 0) {
        colData[colOffset] = T(0);
      } else {
        colData[colOffset] = imData[imOffset];
      }
    }
  }
}
/*
* imShape = [inputChannels, inputHeight, inputWidth]
* colShape =
* [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
*/
template <class T>
class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
public:
void operator()(const T* imData,
const TensorShape& imShape,
T* colData,
const TensorShape& colShape,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth) {
int inputChannels = imShape[0];
int inputHeight = imShape[1];
int inputWidth = imShape[2];
int filterHeight = colShape[3];
int filterWidth = colShape[4];
int outputHeight = colShape[0];
int outputWidth = colShape[1];
    // Pick a thread tile just large enough to cover the filter window.
    int blockDimX = 0;
    int blockDimY = 0;
    if (filterHeight <= 4 && filterWidth <= 4) {
      blockDimX = 4;
      blockDimY = 4;
    } else if (filterHeight <= 8 && filterWidth <= 8) {
      blockDimX = 8;
      blockDimY = 8;
    } else if (filterHeight <= 16 && filterWidth <= 16) {
      blockDimX = 16;
      blockDimY = 16;
    } else {
      blockDimX = 32;
      blockDimY = 32;
    }
    // Spend the rest of the 1024-threads-per-block budget on channels;
    // e.g. an 8x8 tile leaves blockDimZ = 1024 / 64 = 16.
    int blockDimZ = 1024 / blockDimX / blockDimY;
    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
dim3 grid(outputWidth, outputHeight);
im2colOCF<T><<< grid, threads, 0, STREAM_DEFAULT >>>
(imData, colData, inputChannels, inputHeight, inputWidth,
filterHeight, filterWidth, strideHeight, strideWidth,
paddingHeight, paddingWidth, outputHeight, outputWidth);
CHECK_SYNC("Im2ColFunctor GPU failed");
}
};
/*
* imShape = [inputChannels, inputHeight, inputWidth]
* colShape =
* [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
*/
template <class T>
class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
public:
  void operator()(T* imData,
                  const TensorShape& imShape,
                  const T* colData,
                  const TensorShape& colShape,
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
                  int paddingWidth) {
    // Not implemented yet; this commit only adds the forward (im2col) path.
  }
};
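// The GPU backward path above is left as an empty stub in this commit.
// A hypothetical sketch of the matching col2im kernel, mirroring
// im2colOCF's indexing, might look like the following. It assumes imData
// is zero-initialized before launch and that atomicAdd is available for T
// (double requires compute capability 6.0+). Not part of this commit.
template <class T>
__global__ void col2imOCF(T* imData, const T* colData,
                          int inputChannels,
                          int inputHeight, int inputWidth,
                          int filterHeight, int filterWidth,
                          int strideHeight, int strideWidth,
                          int paddingHeight, int paddingWidth,
                          int outputHeight, int outputWidth) {
  int idx = threadIdx.x;
  int idy = threadIdx.y;
  int swId = blockIdx.x;
  int shId = blockIdx.y;
  for (int channelId = threadIdx.z;
       channelId < inputChannels;
       channelId += blockDim.z) {
    int widthOffset = idx + swId * strideWidth - paddingWidth;
    int heightOffset = idy + shId * strideHeight - paddingHeight;
    int imOffset = widthOffset + heightOffset * inputWidth
        + channelId * inputHeight * inputWidth;
    int colOffset = idx + idy * filterWidth
        + channelId * filterHeight * filterWidth
        + (shId * outputWidth + swId)
            * (inputChannels * filterHeight * filterWidth);
    if (idx < filterWidth && idy < filterHeight &&
        heightOffset >= 0 && heightOffset < inputHeight &&
        widthOffset >= 0 && widthOffset < inputWidth) {
      // Overlapping windows write to the same image element from
      // different blocks, so the accumulation must be atomic.
      atomicAdd(&imData[imOffset], colData[colOffset]);
    }
  }
}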
template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
} // namespace paddle
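For reference, a minimal sketch of how the new GPU functor can be exercised (the sizes and cudaMalloc-based setup are illustrative; error checking is omitted, and only the functor and TensorShape usage follow the code above):

// Expand a 3-channel 5x5 image with a 2x2 filter, stride 1, no padding.
// outputHeight = outputWidth = (5 - 2) / 1 + 1 = 4.
using namespace paddle;

float* imData = nullptr;   // 3 * 5 * 5 input elements on the device
float* colData = nullptr;  // 4 * 4 * 3 * 2 * 2 column elements on the device
cudaMalloc(&imData, 3 * 5 * 5 * sizeof(float));
cudaMalloc(&colData, 4 * 4 * 3 * 2 * 2 * sizeof(float));

TensorShape imShape({3, 5, 5});         // [inputChannels, height, width]
TensorShape colShape({4, 4, 3, 2, 2});  // [outH, outW, channels, fH, fW]

Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float> im2col;
im2col(imData, imShape, colData, colShape,
       /*strideHeight=*/1, /*strideWidth=*/1,
       /*paddingHeight=*/0, /*paddingWidth=*/0);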
......@@ -291,5 +291,8 @@ public:
REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward);
REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(ImageExpand, GPU, ImageExpandForward);
#endif
} // namespace paddle
......@@ -37,7 +37,6 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
imgSizeH_ = blockConf.img_size_y();
imgSizeW_ = blockConf.img_size_x();
if (!useGpu_) {
std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
......@@ -47,6 +46,7 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
.set("strides", strides)
.set("paddings", paddings)
.set("blocks", blocks));
if (!useGpu_) {
createFunction(backward_,
"ImageExpandGrad",
FuncConfig()
......@@ -84,62 +84,29 @@ void BlockExpandLayer::forward(PassType passType) {
size_t blockNum = getBlockNum();
size_t blockSize = blockH_ * blockW_ * channels_;
resetOutput(blockNum * batchSize, blockSize);
// TODO(hedaoyuan): After completing the GPU version of ImageExpand,
// refactor the following code.
Argument& out = getOutput();
MatrixPtr outV = getOutputValue();
MatrixPtr input = getPrev(0)->getOutputValue();
Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_);
// calculate output_.value
inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
outputShape_ = TensorShape({batchSize, blockNum, blockSize});
BufferArgs inputs;
BufferArgs outputs;
inputs.addArg(*getInputValue(0), inputShape_);
outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
forward_[0]->calc(inputs, outputs);
// calculate output_.sequenceStartPositions and output_.cpuSequenceDims
Argument& out = getOutput();
ICpuGpuVector::resizeOrCreate(
out.sequenceStartPositions, batchSize + 1, false);
IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
int* start = out.sequenceStartPositions->getMutableData(false);
int* dims = out.cpuSequenceDims->getData();
for (size_t i = 0; i < batchSize; i++) {
if (useGpu_) {
outVTrans_->zeroMem();
/* expand each block as one row */
MatrixPtr inputTmp =
Matrix::create(input->getData() + i * input->getWidth(),
1,
input->getWidth(),
false,
useGpu_);
outVTrans_->convExpand(*inputTmp,
imgSizeH_,
imgSizeW_,
channels_,
blockH_,
blockW_,
strideH_,
strideW_,
paddingH_,
paddingW_,
outputH_,
outputW_);
MatrixPtr outVTmp =
Matrix::create(outV->getData() + i * blockNum * blockSize,
blockNum,
blockSize,
false,
useGpu_);
outVTrans_->transpose(outVTmp, false);
}
start[i] = i * blockNum;
dims[2 * i] = outputH_;
dims[2 * i + 1] = outputW_;
}
start[batchSize] = batchSize * blockNum;
if (!useGpu_) {
inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
outputShape_ = TensorShape({batchSize, blockNum, blockSize});
BufferArgs inputs;
BufferArgs outputs;
inputs.addArg(*getInputValue(0), inputShape_);
outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
forward_[0]->calc(inputs, outputs);
}
}
void BlockExpandLayer::backward(const UpdateCallback& callback) {
......
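The sequence metadata filled in by the loop above is the same on both devices. A small worked example of what the loop produces, with illustrative numbers:

#include <cstdio>
#include <vector>

int main() {
  int batchSize = 2, blockNum = 6, outputH = 2, outputW = 3;
  std::vector<int> start(batchSize + 1), dims(2 * batchSize);
  for (int i = 0; i < batchSize; i++) {
    start[i] = i * blockNum;    // each sample contributes blockNum rows
    dims[2 * i] = outputH;      // per-sample output height
    dims[2 * i + 1] = outputW;  // per-sample output width
  }
  start[batchSize] = batchSize * blockNum;
  // start == {0, 6, 12}; dims == {2, 3, 2, 3}
  printf("total rows: %d\n", start[batchSize]);
  return 0;
}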
......@@ -50,9 +50,6 @@ protected:
size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
/// auxiliary variable, which saves the transposed output value.
MatrixPtr outVTrans_;
TensorShape inputShape_;
TensorShape outputShape_;
......