diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index cbdbf5335d32d55a0221728758025c9d2cb3e7d1..a9876cec2aabf7d116443b685391ee9d20bc1370 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -178,19 +178,22 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); + real* colData = NULL; bool needIm2col = isNeedIm2col(filter); TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape; - real* colData = NULL; - size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth; - size_t colWidth = outputHeight * outputWidth; - // Max col matrix height 256, Max col matrix width 1024 - size_t stepColHeight = std::min(colHeight, static_cast(256)); - size_t stepColWidth = std::min(colWidth, static_cast(2048)); + // Max col matrix width 4096, Max col matrix size 4M. + size_t outputHeightSteps = + std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight); + size_t maxColWidth = outputHeightSteps * outputWidth; + size_t channelSteps = + std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth, + (size_t)1), + inputChannels / groups_); + size_t maxColHeight = channelSteps * filterHeight * filterWidth; if (needIm2col) { colShape = TensorShape({inputChannels / groups_, @@ -199,7 +202,7 @@ public: outputHeight, outputWidth}); - resizeBuffer(stepColHeight * stepColWidth * sizeof(real)); + resizeBuffer(maxColHeight * maxColWidth * sizeof(real)); colData = reinterpret_cast(memory_->getBuf()); } @@ -209,20 +212,24 @@ public: (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; - int nStride = colWidth; - int kStride = colHeight; + int nStride = outputHeight * outputWidth; + int kStride = inputChannels / groups_ * filterHeight * filterWidth; for (size_t i = 0; i < batchSize; i++) { + filterData = inputs[1].data(); for (size_t g = 0; g < groups_; g++) { if (needIm2col) { real beta_ = beta; - for (size_t colHeightStart = 0; colHeightStart < colHeight; - colHeightStart += stepColHeight) { - for (size_t colWidthStart = 0; colWidthStart < colWidth; - colWidthStart += stepColWidth) { - int N = std::min(colWidth - colWidthStart, stepColWidth); - int K = std::min(colHeight - colHeightStart, stepColHeight); + for (size_t ic = 0; ic < inputChannels / groups_; + ic += channelSteps) { + int channels = std::min(inputChannels / groups_ - ic, channelSteps); + for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) { + int height = std::min(outputHeight - oh, outputHeightSteps); + + int M = outputChannels / groups_; + int N = height * outputWidth; + int K = channels * filterHeight * filterWidth; // im2col - im2col(inputData + g * inputOffset, + im2col(inputData, imShape, colData, colShape, @@ -232,13 +239,12 @@ public: paddingW(), dilationH(), dilationW(), - colHeightStart, - K, - colWidthStart, + channels, + oh, + height, N); // gemm - int M = outputChannels / groups_; BlasGemm::compute( false, false, @@ -246,12 +252,12 @@ public: N, K, 1.0f, - filterData + g * filterOffset + colHeightStart, + filterData + ic * filterHeight * filterWidth, kStride, colData, N, beta_, - outputData + g * outputOffset + colWidthStart, + outputData + oh * outputWidth, nStride); } beta_ = 1.0; @@ -266,17 +272,18 @@ public: N, K, 1.0f, - filterData + g * filterOffset, + filterData, K, - inputData + g * inputOffset, + inputData, N, beta, - outputData + g * outputOffset, + outputData, N); } + inputData += inputOffset; + outputData += outputOffset; + filterData += filterOffset; } - inputData += inputChannels * inputHeight * inputWidth; - outputData += outputChannels * outputHeight * outputWidth; } memory_.reset(); diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 36a9bcf84e4b14965c83627821b71d1c7c0da1b2..361ba4c18a26d35b45dc4545372d4b881f7697a6 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -98,6 +98,7 @@ public: int dilationWidth = 1); }; +#if 0 template class Im2ColMobileFunctor { public: @@ -147,5 +148,57 @@ public: } } }; +#endif + +template +class Im2ColMobileFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int dilationHeight, + int dilationWidth, + int inputChannels, + int colOffset, + int colOutputHeight, + int colWidth) { + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputWidth = colShape[4]; + + for (int ic = 0; ic < inputChannels; ic++) { + for (int oh = 0; oh < colOutputHeight; oh++) { + T* dstData = colData + oh * outputWidth; + for (int fh = 0; fh < filterHeight; fh++) { + for (int fw = 0; fw < filterWidth; fw++) { + int imRowIdx = (oh + colOffset) * strideHeight + fh - paddingHeight; + if (imRowIdx < 0 || imRowIdx >= inputHeight) { + memset(dstData, 0, outputWidth * sizeof(T)); + } else { + for (int ow = 0; ow < outputWidth; ow++) { + int imColIdx = ow * strideWidth + fw - paddingWidth; + if (imColIdx < 0 || imColIdx >= inputWidth) { + dstData[ow] = T(0); + } else { + dstData[ow] = imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + dstData += colWidth; + } + } + } + colData += filterHeight * filterWidth * colWidth; + imData += inputHeight * inputWidth; + } + } +}; } // namespace paddle