diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index cbdbf5335d32d55a0221728758025c9d2cb3e7d1..a9876cec2aabf7d116443b685391ee9d20bc1370 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -178,19 +178,22 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); + real* colData = NULL; bool needIm2col = isNeedIm2col(filter); TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape; - real* colData = NULL; - size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth; - size_t colWidth = outputHeight * outputWidth; - // Max col matrix height 256, Max col matrix width 1024 - size_t stepColHeight = std::min(colHeight, static_cast(256)); - size_t stepColWidth = std::min(colWidth, static_cast(2048)); + // Max col matrix width 4096, Max col matrix size 4M. + size_t outputHeightSteps = + std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight); + size_t maxColWidth = outputHeightSteps * outputWidth; + size_t channelSteps = + std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth, + (size_t)1), + inputChannels / groups_); + size_t maxColHeight = channelSteps * filterHeight * filterWidth; if (needIm2col) { colShape = TensorShape({inputChannels / groups_, @@ -199,7 +202,7 @@ public: outputHeight, outputWidth}); - resizeBuffer(stepColHeight * stepColWidth * sizeof(real)); + resizeBuffer(maxColHeight * maxColWidth * sizeof(real)); colData = reinterpret_cast(memory_->getBuf()); } @@ -209,20 +212,24 @@ public: (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; - int nStride = colWidth; - int kStride = colHeight; + int nStride = outputHeight * outputWidth; + int kStride = inputChannels / groups_ * filterHeight * filterWidth; for (size_t i = 0; i < batchSize; i++) { + filterData = inputs[1].data(); for (size_t g = 0; g < groups_; g++) { if (needIm2col) { real beta_ = beta; - for (size_t colHeightStart = 0; colHeightStart < colHeight; - colHeightStart += stepColHeight) { - for (size_t colWidthStart = 0; colWidthStart < colWidth; - colWidthStart += stepColWidth) { - int N = std::min(colWidth - colWidthStart, stepColWidth); - int K = std::min(colHeight - colHeightStart, stepColHeight); + for (size_t ic = 0; ic < inputChannels / groups_; + ic += channelSteps) { + int channels = std::min(inputChannels / groups_ - ic, channelSteps); + for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) { + int height = std::min(outputHeight - oh, outputHeightSteps); + + int M = outputChannels / groups_; + int N = height * outputWidth; + int K = channels * filterHeight * filterWidth; // im2col - im2col(inputData + g * inputOffset, + im2col(inputData, imShape, colData, colShape, @@ -232,13 +239,12 @@ public: paddingW(), dilationH(), dilationW(), - colHeightStart, - K, - colWidthStart, + channels, + oh, + height, N); // gemm - int M = outputChannels / groups_; BlasGemm::compute( false, false, @@ -246,12 +252,12 @@ public: N, K, 1.0f, - filterData + g * filterOffset + colHeightStart, + filterData + ic * filterHeight * filterWidth, kStride, colData, N, beta_, - outputData + g * outputOffset + colWidthStart, + outputData + oh * outputWidth, nStride); } beta_ = 1.0; @@ -266,17 +272,18 @@ public: N, K, 1.0f, - filterData + g * filterOffset, + filterData, K, - inputData + g * inputOffset, + inputData, N, beta, - outputData + g * outputOffset, + outputData, N); } + inputData += inputOffset; + outputData += outputOffset; + filterData += filterOffset; } - inputData += inputChannels * inputHeight * inputWidth; - outputData += outputChannels * outputHeight * outputWidth; } memory_.reset(); diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 36a9bcf84e4b14965c83627821b71d1c7c0da1b2..915119e291caaa223249cf8e37078723621517b0 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -111,39 +111,42 @@ public: int paddingWidth, int dilationHeight, int dilationWidth, - int colHeightStart, - int colHeightSize, - int colWidthStart, - int colWidthSize) { + int inputChannels, + int colOffset, + int colOutputHeight, + int colWidth) { int inputHeight = imShape[1]; int inputWidth = imShape[2]; int filterHeight = colShape[1]; int filterWidth = colShape[2]; int outputWidth = colShape[4]; - for (int colh = 0; colh < colHeightSize; colh++) { - int wOffset = (colHeightStart + colh) % filterWidth; - int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight; - int c_im = (colHeightStart + colh) / filterWidth / filterHeight; - - for (int colw = 0; colw < colWidthSize; colw++) { - int h = (colWidthStart + colw) / outputWidth; - int w = (colWidthStart + colw) % outputWidth; - - int imRowIdx = h * strideHeight + hOffset * dilationHeight; - int imColIdx = w * strideWidth + wOffset * dilationWidth; - if ((imRowIdx - paddingHeight) < 0 || - (imRowIdx - paddingHeight) >= inputHeight || - (imColIdx - paddingWidth) < 0 || - (imColIdx - paddingWidth) >= inputWidth) { - colData[colh * colWidthSize + colw] = static_cast(0); - } else { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - colData[colh * colWidthSize + colw] = - imData[imRowIdx * inputWidth + imColIdx]; + for (int ic = 0; ic < inputChannels; ic++) { + for (int oh = 0; oh < colOutputHeight; oh++) { + T* dstData = colData + oh * outputWidth; + for (int fh = 0; fh < filterHeight; fh++) { + for (int fw = 0; fw < filterWidth; fw++) { + int imRowIdx = (oh + colOffset) * strideHeight + + fh * dilationHeight - paddingHeight; + if (imRowIdx < 0 || imRowIdx >= inputHeight) { + memset(dstData, 0, outputWidth * sizeof(T)); + } else { + for (int ow = 0; ow < outputWidth; ow++) { + int imColIdx = + ow * strideWidth + fw * dilationWidth - paddingWidth; + if (imColIdx < 0 || imColIdx >= inputWidth) { + dstData[ow] = T(0); + } else { + dstData[ow] = imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + dstData += colWidth; + } } } + colData += filterHeight * filterWidth * colWidth; + imData += inputHeight * inputWidth; } } }; diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp index 3ba866dcdd845403d52f7a85adfef08cbb11c305..fe44a8bf79005efb87c56f6a79f46421129bab22 100644 --- a/paddle/function/Im2ColTest.cpp +++ b/paddle/function/Im2ColTest.cpp @@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() { padding, dilation, dilation, + channels, 0, - height, - 0, - width); + outputHeight, + outputHeight * outputWidth); autotest::TensorCheckEqual(*output1, *output2); } diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 1ec4336cabbc7d3073b7638b7484bf61e83a2dc5..cc86b12be08ba987f9682ebf3fda56c2f07fb576 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, CHECK_EQ(channels * outLength, maskMatP->getWidth()); } - /* initialize the data_ */ - for (size_t i = 0; i < height_; i++) { - for (size_t j = 0; j < width_; j++) { - outData[i * outStride + j] = -(real)FLT_MAX; - } - } - /* pool max one by one */ for (size_t n = 0; n < num; ++n) { // frame by frame if (!isContiguous()) { @@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t ph = 0; ph < outputH; ++ph) { int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); + int hend = hstart + sizeY; + hstart = hstart < 0 ? 0 : hstart; + hend = hend < (int)imgSizeH ? hend : (int)imgSizeH; for (size_t pw = 0; pw < outputW; ++pw) { int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); + int wend = wstart + sizeX; + wstart = wstart < 0 ? 0 : wstart; + wend = wend < (int)imgSizeW ? wend : (int)imgSizeW; if (maskData == NULL) { + real tmp = -(real)FLT_MAX; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - outData[ph * outputW + pw] = std::max( - outData[ph * outputW + pw], inputData[h * imgSizeW + w]); + tmp = tmp < inputData[h * imgSizeW + w] + ? inputData[h * imgSizeW + w] + : tmp; } } + outData[ph * outputW + pw] = tmp; } else { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) {