From 152bd2f9c867e8e165c3d22810281023880b3d16 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 20:30:02 +0800 Subject: [PATCH] Add the GPU version implementation of ImageExpand function. --- paddle/function/Im2Col.h | 3 + paddle/function/Im2ColOpGpu.cu | 130 +++++++++++++++++++++ paddle/function/ImageExpandOp.cpp | 3 + paddle/gserver/layers/BlockExpandLayer.cpp | 73 ++++-------- paddle/gserver/layers/BlockExpandLayer.h | 3 - 5 files changed, 156 insertions(+), 56 deletions(-) create mode 100644 paddle/function/Im2ColOpGpu.cu diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 6d76e229bfc..48e2e32f925 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -14,6 +14,9 @@ limitations under the License. */ #pragma once +#include "TensorShape.h" +#include "TensorType.h" + namespace paddle { /* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu new file mode 100644 index 00000000000..1dac2585db7 --- /dev/null +++ b/paddle/function/Im2ColOpGpu.cu @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Im2Col.h" + +namespace paddle { + +template +__global__ +void im2colOCF(const T* imData, T* colData, + int inputChannels, + int inputHeight, int inputWidth, + int filterHeight, int filterWidth, + int strideHeight, int strideWidth, + int paddingHeight, int paddingWidth, + int outputHeight, int outputWidth) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int swId = blockIdx.x; + int shId = blockIdx.y; + + for (int channelId = threadIdx.z; + channelId < inputChannels; + channelId += blockDim.z) { + int widthOffset = idx + swId * strideWidth - paddingWidth; + int heightOffset = idy + shId * strideHeight - paddingHeight; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; + + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) + * (inputChannels * filterHeight * filterWidth); + + if (idx < filterWidth && idy < filterHeight) { + if (heightOffset >= inputHeight || heightOffset < 0 || + widthOffset >= inputWidth || widthOffset < 0) { + colData[colOffset] = T(0); + } else { + colData[colOffset] = imData[imOffset]; + } + } + } +} + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + + int blockDimX = 0; + int blockDimY = 0; + if (filterHeight <= 4 && filterWidth <= 4) { + blockDimX = 4; + blockDimY = 4; + } else if (filterHeight <= 8 && filterWidth <= 8) { + blockDimX = 8; + blockDimY = 8; + } else if (filterHeight <= 16 && filterWidth <= 16) { + blockDimX = 16; + blockDimY = 16; + } else { + blockDimX = 32; + blockDimY = 32; + } + + int blockDimZ = 1024 / blockDimX / blockDimY; + dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); + dim3 grid(outputWidth, outputHeight); + im2colOCF<<< grid, threads, 0, STREAM_DEFAULT >>> + (imData, colData, inputChannels, inputHeight, inputWidth, + filterHeight, filterWidth, strideHeight, strideWidth, + paddingHeight, paddingWidth, outputHeight, outputWidth); + CHECK_SYNC("Im2ColFunctor GPU failed"); + } +}; + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; + +} // namespace paddle diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index ad34967bd65..fe4c8fefcf5 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -291,5 +291,8 @@ public: REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward); REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(ImageExpand, GPU, ImageExpandForward); +#endif } // namespace paddle diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index c8d0b21c875..1889b347c2d 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -37,16 +37,16 @@ bool BlockExpandLayer::init(const LayerMap& layerMap, imgSizeH_ = blockConf.img_size_y(); imgSizeW_ = blockConf.img_size_x(); + std::vector strides = {(size_t)strideH_, (size_t)strideW_}; + std::vector paddings = {(size_t)paddingH_, (size_t)paddingW_}; + std::vector blocks = {(size_t)blockH_, (size_t)blockW_}; + createFunction(forward_, + "ImageExpand", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); if (!useGpu_) { - std::vector strides = {(size_t)strideH_, (size_t)strideW_}; - std::vector paddings = {(size_t)paddingH_, (size_t)paddingW_}; - std::vector blocks = {(size_t)blockH_, (size_t)blockW_}; - createFunction(forward_, - "ImageExpand", - FuncConfig() - .set("strides", strides) - .set("paddings", paddings) - .set("blocks", blocks)); createFunction(backward_, "ImageExpandGrad", FuncConfig() @@ -84,62 +84,29 @@ void BlockExpandLayer::forward(PassType passType) { size_t blockNum = getBlockNum(); size_t blockSize = blockH_ * blockW_ * channels_; resetOutput(blockNum * batchSize, blockSize); - // TODO(hedaoyuan): After completing the GPU version of ImageExpand, - // refactor the following code. - Argument& out = getOutput(); - MatrixPtr outV = getOutputValue(); - MatrixPtr input = getPrev(0)->getOutputValue(); - Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_); + // calculate output_.value + inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); + outputShape_ = TensorShape({batchSize, blockNum, blockSize}); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inputShape_); + outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO); + forward_[0]->calc(inputs, outputs); + + // calculate output_.sequenceStartPositions and output_.cpuSequenceDims + Argument& out = getOutput(); ICpuGpuVector::resizeOrCreate( out.sequenceStartPositions, batchSize + 1, false); IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false); int* start = out.sequenceStartPositions->getMutableData(false); int* dims = out.cpuSequenceDims->getData(); for (size_t i = 0; i < batchSize; i++) { - if (useGpu_) { - outVTrans_->zeroMem(); - /* expand each block as one row */ - MatrixPtr inputTmp = - Matrix::create(input->getData() + i * input->getWidth(), - 1, - input->getWidth(), - false, - useGpu_); - outVTrans_->convExpand(*inputTmp, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_); - MatrixPtr outVTmp = - Matrix::create(outV->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - outVTrans_->transpose(outVTmp, false); - } start[i] = i * blockNum; dims[2 * i] = outputH_; dims[2 * i + 1] = outputW_; } start[batchSize] = batchSize * blockNum; - if (!useGpu_) { - inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); - outputShape_ = TensorShape({batchSize, blockNum, blockSize}); - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(0), inputShape_); - outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO); - forward_[0]->calc(inputs, outputs); - } } void BlockExpandLayer::backward(const UpdateCallback& callback) { diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h index edda0e0b630..15ce73ab8b2 100644 --- a/paddle/gserver/layers/BlockExpandLayer.h +++ b/paddle/gserver/layers/BlockExpandLayer.h @@ -50,9 +50,6 @@ protected: size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_; size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_; - /// auxiliary variable, which saves the transposed output value. - MatrixPtr outVTrans_; - TensorShape inputShape_; TensorShape outputShape_; -- GitLab