diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index 06ee3b3654b576ec57dc437582e37ed0cea328ee..c5787630abbe105af64888692b1106bd21f4c1e8 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -240,62 +240,6 @@ extern void hl_avgpool_backward(const int frameCnt, real* backGrad, const int outStride); -/** - * @brief Cross-map-respose normalize forward. - * - * @param[in] frameCnt batch size of input image. - * @param[in] in input data. - * @param[in] scale buffer. - * @param[out] out output data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] sizeX size. - * @param[in] alpha scale. - * @param[in] beta scale. - * - */ -extern void hl_CMRNorm_forward(size_t frameCnt, - const real* in, - real* scale, - real* out, - size_t channels, - size_t height, - size_t width, - size_t sizeX, - real alpha, - real beta); - -/** - * @brief Cross-map-respose normalize backward. - * - * @param[in] frameCnt batch size of input image. - * @param[in] inV input data. - * @param[in] scale buffer. - * @param[out] outV output value. - * @param[out] outDiff output grad. - * @param[out] inDiff input grad. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] sizeX size. - * @param[in] alpha scale. - * @param[in] beta scale. - * - */ -extern void hl_CMRNorm_backward(size_t frameCnt, - const real* inV, - const real* scale, - const real* outV, - const real* outDiff, - real* inDiff, - size_t channels, - size_t height, - size_t width, - size_t sizeX, - real alpha, - real beta); - /** * @brief Bilinear interpolation forward. * diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 52c978735279ed804c44f0e93472355637e8b98d..039551c6cc69525e71c8c311f78fb6dec07d7fed 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -117,30 +117,6 @@ inline void hl_avgpool_backward(const int frameCnt, real* backGrad, const int outStride) {} -inline void hl_CMRNorm_forward(size_t frameCnt, - const real* in, - real* scale, - real* out, - size_t channels, - size_t height, - size_t width, - size_t sizeX, - real alpha, - real beta) {} - -inline void hl_CMRNorm_backward(size_t frameCnt, - const real* inV, - const real* scale, - const real* outV, - const real* outDiff, - real* inDiff, - size_t channels, - size_t height, - size_t width, - size_t sizeX, - real alpha, - real beta) {} - inline void hl_bilinear_forward(const real* inData, const size_t inImgH, const size_t inImgW, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index 1516accaae17fbeff4f4e48584940ec3e9873897..b94f4d8fe4a251750c527d4b686fcc8f452d4606 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -381,126 +381,6 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad, CHECK_SYNC("hl_avgpool_backward failed"); } -__global__ void KeCMRNormFillScale(size_t imageSize, const real* in, - real* scale, size_t channels, - size_t height, size_t width, size_t size, - real alpha) { - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < imageSize) { - const int w = idx % width; - const int h = (idx / width) % height; - const int n = idx / width / height; - const int offset = (n * channels * height + h) * width + w; - - in += offset; - scale += offset; - const int step = height * width; - const int pre_pad = (size - 1) / 2; - const int post_pad = size - pre_pad - 1; - - real accum = 0; - int index = 0; - while (index < channels + post_pad) { - if (index < channels) { - accum += in[index * step] * in[index * step]; - } - if (index >= size) { - accum -= in[(index - size) * step] * in[(index - size) * step]; - } - if (index >= post_pad) { - scale[(index - post_pad) * step] = 1. + accum * alpha; - } - ++index; - } - } -} - -__global__ void KeCMRNormOutput(size_t inputSize, const real* in, - const real* scale, real negative_beta, - real* out) { - const int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < inputSize) { - out[index] = in[index] * pow(scale[index], negative_beta); - } -} - -void hl_CMRNorm_forward(size_t frameCnt, const real* in, real* scale, - real* out, size_t channels, - size_t height, size_t width, size_t sizeX, - real alpha, real beta) { - size_t imageSize = frameCnt * height * width; - int blockSize = 1024; - int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormFillScale<<>> - (imageSize, in, scale, channels, height, width, sizeX, alpha); - - size_t inputSize = frameCnt * height * width *channels; - blockSize = 1024; - gridSize = (inputSize + 1024 - 1) / 1024; - KeCMRNormOutput<<>> - (inputSize, in, scale, beta, out); - CHECK_SYNC("hl_CMRNorm_forward"); -} - -__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, - const real* top_data, const real* scale, - const real* top_diff, size_t channels, - size_t height, size_t width, size_t size, - real negative_beta, real cache_ratio, - real* bottom_diff ) { - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < imageSize) { - const int w = idx % width; - const int h = (idx / width) % height; - const int n = idx / width / height; - const int offset = (n * channels * height + h) * width + w; - bottom_data += offset; - top_data += offset; - scale += offset; - top_diff += offset; - bottom_diff += offset; - - const int step = height * width; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; - - int index = 0; - real accum = 0; - while (index < channels + post_pad) { - if (index < channels) { - accum += top_diff[index * step] * top_data[index * step] / - scale[index * step]; - } - if (index >= size) { - accum -= top_diff[(index - size) * step] * - top_data[(index - size) * step] / scale[(index - size) * step]; - } - if (index >= post_pad) { - bottom_diff[(index - post_pad) * step] += - top_diff[(index - post_pad) * step] * - pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(index - post_pad) * step] * accum; - } - ++index; - } - } -} - -void hl_CMRNorm_backward(size_t frameCnt, const real* inV, - const real* scale, - const real* outV, const real* outDiff, - real *inDiff, size_t channels, - size_t height, size_t width, size_t sizeX, - real alpha, real beta) { - size_t imageSize = frameCnt * height * width; - int blockSize = 1024; - int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormDiff <<>> - (imageSize, inV, outV, scale, outDiff, channels, - height, width, sizeX, alpha, beta, inDiff); - CHECK_SYNC("hl_CMRNorm_backward"); -} - __global__ void KeBilinearInterpFw(const real* in, const size_t inImgH, const size_t inImgW, diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp index e69c40699305466261b5febf0588c94597e04d25..4ff3b805fbb061cbb2630330c1b9b4f6c2b354d6 100644 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ b/paddle/gserver/layers/NormProjectionLayer.cpp @@ -110,34 +110,5 @@ void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { Tensor(denoms_->getData(), dims_)}, {Tensor(preOutGrad->getData(), dims_)}, {}); -#if 0 - if (useGpu_) { - CrossMapNormalGrad crossGrad; - crossGrad(dynamic_cast(*preOutGrad), - dynamic_cast(*preOutV), - dynamic_cast(*localGrad), - dynamic_cast(*localOutV), - dynamic_cast(*denoms_), - channels_, - imgSizeH_, - imgSizeW_, - size_, - scale_, - pow_); - } else { - CrossMapNormalGrad crossGrad; - crossGrad(dynamic_cast(*preOutGrad), - dynamic_cast(*preOutV), - dynamic_cast(*localGrad), - dynamic_cast(*localOutV), - dynamic_cast(*denoms_), - channels_, - imgSizeH_, - imgSizeW_, - size_, - scale_, - pow_); - } -#endif } } // namespace paddle diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 2cde11dd479dc0d150c5d7ce5c0c5c1cbf40e449..a36c31d32b7ca9d737a2b9df1934555cb490ee00 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1265,69 +1265,6 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, outGrad.getStride()); } -void GpuMatrix::crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow) { - size_t num = input.getHeight(); - size_t height = imgSizeH; - size_t width = imgSizeW; - - CHECK(height * width * channels == input.getWidth()); - CHECK(denoms.getHeight() == input.getHeight() && - denoms.getWidth() == input.getWidth() && input.getHeight() == height_ && - input.getWidth() == width_); - hl_CMRNorm_forward(num, - input.getData(), - denoms.getData(), - data_, - channels, - height, - width, - sizeX, - scale, - -pow); -} - -void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - float scale, - float pow) { - size_t num = preOutV.getHeight(); - size_t height = imgSizeH; - size_t width = imgSizeW; - - CHECK(width * height * channels == preOutV.getWidth()); - CHECK(denoms.getHeight() == preOutV.getHeight() && - denoms.getWidth() == preOutV.getWidth() && - preOutV.getHeight() == height_ && preOutV.getWidth() == width_); - CHECK(denoms.getHeight() == localGrad.getHeight() && - denoms.getWidth() == localGrad.getWidth()); - - hl_CMRNorm_backward(num, - preOutV.getData(), - denoms.getData(), - localOutV.getData(), - localGrad.getData(), - data_, - channels, - height, - width, - sizeX, - -pow, - 2.0f * pow * scale); -} - void GpuMatrix::maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index) { @@ -2219,119 +2156,6 @@ void CpuMatrix::avgPoolBackward(Matrix& input, } } -void CpuMatrix::crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow) { - CHECK(isContiguous()); - CHECK(input.isContiguous()); - CHECK(denoms.isContiguous()); - CHECK_EQ(getHeight(), input.getHeight()); - CHECK_EQ(getWidth(), input.getWidth()); - CHECK_EQ(getHeight(), denoms.getHeight()); - CHECK_EQ(getWidth(), denoms.getWidth()); - - size_t numSample = input.getHeight(); - size_t numCols = input.getWidth(); - size_t height = imgSizeH; - size_t width = imgSizeW; - CHECK(height * width * channels == numCols); - - // TODO(hedaoyuan) After commit TensorExpress code, - // Reconstruction this code to remove the temporary memory. - CpuMatrix tmp(channels, height * width); - CpuMatrix tmp2(tmp.getData(), 1, channels * height * width); - denoms.zero(); - const int start = -((int)sizeX - 1) / 2; - const int end = (int)sizeX + start; - for (size_t i = 0; i < numSample; i++) { - input.subMatrix(i, 1)->square2(tmp2); - CpuMatrix subDen( - denoms.subMatrix(i, 1)->getData(), channels, height * width); - for (int c = 0; c < (int)channels; c++) { - for (int s = start; s < end; s++) { - if (c + s >= 0 && c + s < (int)channels) { - subDen.subMatrix(c, 1)->add(*tmp.subMatrix(c + s, 1)); - } - } - } - } - - denoms.add(scale, (real)1); - this->pow2(denoms, -pow); - this->dotMul(input); -} - -void CpuMatrix::crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - float scale, - float pow) { - CHECK(isContiguous()); - CHECK(localGrad.isContiguous()); - CHECK(denoms.isContiguous()); - CHECK(preOutV.isContiguous()); - CHECK(localOutV.isContiguous()); - CHECK_EQ(getHeight(), localGrad.getHeight()); - CHECK_EQ(getWidth(), localGrad.getWidth()); - CHECK_EQ(getHeight(), denoms.getHeight()); - CHECK_EQ(getWidth(), denoms.getWidth()); - CHECK_EQ(getHeight(), preOutV.getHeight()); - CHECK_EQ(getWidth(), preOutV.getWidth()); - CHECK_EQ(getHeight(), localOutV.getHeight()); - CHECK_EQ(getWidth(), localOutV.getWidth()); - - size_t numSample = getHeight(); - size_t numCols = getWidth(); - size_t height = imgSizeH; - size_t width = imgSizeW; - CHECK(height * width * channels == numCols); - - // TODO(hedaoyuan) After commit TensorExpress code, - // Reconstruction this code to remove the temporary memory. - CpuMatrix tmp(1, height * width); - - const int start = -((int)sizeX) / 2; - const int end = (int)sizeX + start; - const real ratio = -(real)2 * scale * pow; - for (size_t i = 0; i < numSample; i++) { - CpuMatrix inputDiff( - this->subMatrix(i, 1)->getData(), channels, height * width); - CpuMatrix outDiff( - localGrad.subMatrix(i, 1)->getData(), channels, height * width); - CpuMatrix input( - preOutV.subMatrix(i, 1)->getData(), channels, height * width); - CpuMatrix output( - localOutV.subMatrix(i, 1)->getData(), channels, height * width); - CpuMatrix subDen( - denoms.subMatrix(i, 1)->getData(), channels, height * width); - - for (int c = 0; c < (int)channels; c++) { - tmp.pow2(*subDen.subMatrix(c, 1), -pow); - inputDiff.subMatrix(c, 1) - ->addDotMul(tmp, *outDiff.subMatrix(c, 1), (real)1, (real)1); - for (int s = start; s < end; s++) { - if (c + s >= 0 && c + s < (int)channels) { - tmp.dotMul(*outDiff.subMatrix(c + s, 1), *output.subMatrix(c + s, 1)); - tmp.mulScalar(ratio); - tmp.dotDiv(tmp, *subDen.subMatrix(c + s, 1)); - tmp.dotMul(*input.subMatrix(c, 1)); - inputDiff.subMatrix(c, 1)->add(tmp); - } - } - } - } -} - /** * Input: one or more sequences. Each sequence contains some instances. * Output: output size is the number of input sequences (NOT input instances). diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 5685cb7bcbbb6b90687790953d676e3792f36f36..62bc1b16fc7b6eedd2c6ac163b1d1d1c8de02ab5 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -952,31 +952,6 @@ public: LOG(FATAL) << "Not implemeted"; } - /// normalize-operation. - virtual void crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t size, - float scale, - float pow) { - LOG(FATAL) << "Not implemeted"; - } - /** * Input: one or more sequences. Each sequence contains some instances. * @@ -1459,26 +1434,6 @@ public: size_t paddingH, size_t paddingW); - void crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow); - - void crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - float scale, - float pow); - void maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index); @@ -1685,26 +1640,6 @@ public: size_t paddingH, size_t paddingW); - void crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow); - - void crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - float scale, - float pow); - void maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index); diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index da7a585484e7ee1f6579732206d3cafa109bd3cb..c89b7ff490232f5876ceb5e928c9352c431203ec 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -1385,33 +1385,6 @@ void testCrossMapNormalBwd( Tensor(denomsGpu.getData(), dims)}, {Tensor(inputsGradGpu.getData(), dims)}, {}); -#if 0 - CrossMapNormalGrad cpuCross; - cpuCross(inputsGrad, - inputsValue, - outputsGrad, - outputsValue, - denoms, - channels, - imgSizeH, - imgSizeW, - sizeX, - scale, - pow); - - CrossMapNormalGrad gpuCross; - gpuCross(inputsGradGpu, - inputsValueGpu, - outputsGradGpu, - outputsValueGpu, - denomsGpu, - channels, - imgSizeH, - imgSizeW, - sizeX, - scale, - pow); -#endif TensorCheckErr(inputsGrad, inputsGradGpu); }