diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
index c1473a19ede5c438de479cbf4109c0379cb32393..0ed5991ff1382ea958e89bf9d75d3096f6d2ebfc 100644
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
@@ -86,8 +86,130 @@ private:
   real scale_;
 };
 
+template <>
+void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix* out_grad,
+                                     const CpuMatrix* out_val,
+                                     const CpuMatrix* in1_val,
+                                     const CpuMatrix* in2_val,
+                                     CpuMatrix* in1_grad,
+                                     CpuMatrix* in2_grad,
+                                     real scale) {
+  CHECK(out_grad && out_val && in1_val && in2_val && in1_grad && in2_grad);
+  CHECK_EQ(out_val->useGpu_, false) << "Matrix type should be CPU, not GPU";
+
+  const real* grad = out_grad->getData();
+  const real* out = out_val->getData();
+  const real* prev_out_x = in1_val->getData();
+  const real* prev_out_y = in2_val->getData();
+  real* prev_grad_x = in1_grad->getData();
+  real* prev_grad_y = in2_grad->getData();
+
+  size_t num_samples = out_grad->getHeight();
+  size_t dim = in1_val->getWidth();
+  CHECK_EQ(in2_val->getHeight(), in2_grad->getHeight());
+  CHECK(in2_val->getHeight() == 1LU || in2_val->getHeight() == num_samples);
+  size_t inc = (in2_val->getHeight() == 1LU) ? 0 : dim;
+  for (size_t i = 0; i < num_samples; ++i,
+                                      prev_out_x += dim,
+                                      prev_out_y += inc,
+                                      prev_grad_x += dim,
+                                      prev_grad_y += inc) {
+    real square_sum_x = 0;
+    real square_sum_y = 0;
+    real xy = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += prev_out_x[j] * prev_out_x[j];
+      square_sum_y += prev_out_y[j] * prev_out_y[j];
+      xy += prev_out_x[j] * prev_out_y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0);
+    if (xy == 0) {
+      real reciprocal =
+          1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
+        prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
+      }
+    } else {
+      real reciprocal_xy = 1.0f / xy;
+      real reciprocal_square_sum_x = 1.0f / square_sum_x;
+      real reciprocal_square_sum_y = 1.0f / square_sum_y;
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] +=
+            out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
+                                prev_out_x[j] * reciprocal_square_sum_x);
+        prev_grad_y[j] +=
+            out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
+                                prev_out_y[j] * reciprocal_square_sum_y);
+      }
+    }
+  }
+}
+
+/**
+ * \param inputs[0] forward output value, size: nSamples * 1.
+ * \param inputs[1] forward input value 1, size: nSamples * dim.
+ * \param inputs[2] forward input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param inputs[3] input grad 1, size: nSamples * dim.
+ * \param inputs[4] input grad 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param outputs[0] output grad, size: nSamples * 1.
+ */
+template <DeviceType Device>
+class CosSimBackwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(inputs.size(), 5);
+    CHECK_EQ(outputs.size(), 1);
+    CHECK_EQ(inouts.size(), 0);
+    /// dim of out_grad and out_val == 1, column vector
+    CHECK_EQ(outputs[0].dims_[1], 1UL);
+    CHECK_EQ(inputs[0].dims_[1], 1UL);
+    /// nSamples of out_grad == out_val == in1_val == in1_grad
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(inputs[1].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(inputs[3].dims_[0], outputs[0].dims_[0]);
+    /// dim of in1_val == in2_val == in1_grad == in2_grad
+    CHECK_EQ(inputs[2].dims_[1], inputs[1].dims_[1]);
+    CHECK_EQ(inputs[3].dims_[1], inputs[1].dims_[1]);
+    CHECK_EQ(inputs[4].dims_[1], inputs[1].dims_[1]);
+
+    CHECK(outputs[0].getData() && inputs[0].getData() && inputs[1].getData() &&
+          inputs[2].getData() && inputs[3].getData() && inputs[4].getData());
+    const auto out_grad = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto out_val = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    const auto in1_val = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    const auto in2_val = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[2].getData(), inputs[2].dims_[0], inputs[2].dims_[1]);
+    auto in1_grad = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[3].getData(), inputs[3].dims_[0], inputs[3].dims_[1]);
+    auto in2_grad = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[4].getData(), inputs[4].dims_[0], inputs[4].dims_[1]);
+
+    CosSimBackward<Device>(out_grad.get(),
+                           out_val.get(),
+                           in1_val.get(),
+                           in2_val.get(),
+                           in1_grad.get(),
+                           in2_grad.get(),
+                           scale_);
+  }
+
+private:
+  real scale_;
+};
+
 REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
 #ifndef PADDLE_ONLY_CPU
 REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
 #endif
 }  // namespace paddle
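For reference, the CPU loop above is the analytic gradient of the scaled cosine similarity. Writing $c = \frac{x \cdot y}{\lVert x \rVert\,\lVert y \rVert}$ for one sample, and assuming the forward output stores $o = s\,c$ (scale times the cosine, as CosSimForward in this file computes), the per-element gradients are

$\frac{\partial o}{\partial x_j} = s\left(\frac{y_j}{\lVert x \rVert\,\lVert y \rVert} - c\,\frac{x_j}{\lVert x \rVert^2}\right), \qquad \frac{\partial o}{\partial y_j} = s\left(\frac{x_j}{\lVert x \rVert\,\lVert y \rVert} - c\,\frac{y_j}{\lVert y \rVert^2}\right).$

When $x \cdot y \neq 0$ the code folds $s\,c$ into out[i] and rewrites the first term as $o\,y_j/(x \cdot y)$, which is the reciprocal_xy form in the else branch; when $x \cdot y = 0$ the cosine is zero, only the first term survives, and the if (xy == 0) branch applies scale directly.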
diff --git a/paddle/function/CosSimOp.h b/paddle/function/CosSimOp.h
index 02250d6db9c644549589c0cf8f2cc110d5f740c3..f66a4344d09ce8a44e957b56d560c55fc85afe6c 100644
--- a/paddle/function/CosSimOp.h
+++ b/paddle/function/CosSimOp.h
@@ -37,4 +37,25 @@ void CosSimForward(typename MatrixT<Device>::type* output,
                    const typename MatrixT<Device>::type* input1,
                    const typename MatrixT<Device>::type* input2,
                    real scale);
+/**
+ * \brief Cosine Similarity Derivative (backward computation).
+ *
+ * \param[in]  out_grad  gradient of the forward output, size: nSamples * 1.
+ * \param[in]  out_value forward output value, size: nSamples * 1.
+ * \param[in]  in1_value forward input value 1, size: nSamples * dim.
+ * \param[in]  in2_value forward input value 2, size: n2 * dim.
+ * \param[out] in1_grad  gradient of forward input 1, size: nSamples * dim.
+ * \param[out] in2_grad  gradient of forward input 2, size: n2 * dim.
+ * \param[in]  scale     default 1.0.
+ *
+ */
+template <DeviceType Device>
+void CosSimBackward(const typename MatrixT<Device>::type* out_grad,
+                    const typename MatrixT<Device>::type* out_value,
+                    const typename MatrixT<Device>::type* in1_value,
+                    const typename MatrixT<Device>::type* in2_value,
+                    typename MatrixT<Device>::type* in1_grad,
+                    typename MatrixT<Device>::type* in2_grad,
+                    real scale);
+
 }  // namespace paddle
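As a usage illustration only (not part of the patch), the CPU specialization declared above can be driven directly with CpuMatrix arguments of the documented shapes. The sizes below are placeholders, and the call has to live in a translation unit that sees the DEVICE_TYPE_CPU specialization (e.g. CosSimOp.cpp itself):

  // Hypothetical fragment: accumulate cosine-similarity gradients on CPU.
  const size_t num_samples = 4;
  const size_t dim = 8;
  CpuMatrix out_grad(num_samples, 1), out_val(num_samples, 1);
  CpuMatrix in1_val(num_samples, dim), in2_val(num_samples, dim);
  CpuMatrix in1_grad(num_samples, dim), in2_grad(num_samples, dim);
  out_grad.randomizeUniform();
  out_val.randomizeUniform();
  in1_val.randomizeUniform();
  in2_val.randomizeUniform();
  in1_grad.zeroMem();  // gradients are accumulated with +=
  in2_grad.zeroMem();
  CosSimBackward<DEVICE_TYPE_CPU>(&out_grad, &out_val, &in1_val, &in2_val,
                                  &in1_grad, &in2_grad, /*scale=*/1.0);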
diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu
index 34835fa5d87c398faa3f52769d317995bbc44bfa..f654f0bc209c248a992f063f0b901d5866b39ba7 100644
--- a/paddle/function/CosSimOpGpu.cu
+++ b/paddle/function/CosSimOpGpu.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "hl_base.h"
+#include "hl_device_functions.cuh"
 #include "CosSimOp.h"
 
 namespace paddle {
@@ -79,7 +80,7 @@ void hlCossim(real* output,
   KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
     (output, input1, input2, width, input1_height, input2_height, scale);
 
-  CHECK_SYNC("hl_cossim failed");
+  CHECK_SYNC("hlCossim failed");
 }
 
 template <>
@@ -91,7 +92,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix* out_mat,
   CHECK(in1_mat->useGpu_ == true && in2_mat->useGpu_ == true)
       << "Matrix type are not GPU";
 
-  size_t numSamples = out_mat->getHeight();
+  size_t num_samples = out_mat->getHeight();
   size_t dim = in1_mat->getWidth();
   real* out = out_mat->getData();
   const real* x = in1_mat->getData();
@@ -99,4 +100,141 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix* out_mat,
   hlCossim(out, x, y, dim, in1_mat->getHeight(), in2_mat->getHeight(), scale);
 }
 
+template <int block_size>
+__global__ void KeCosSimDerivative(const real* grad,
+                                   const real* output,
+                                   const real* prev_out_x,
+                                   const real* prev_out_y,
+                                   real* prev_grad_x,
+                                   real* prev_grad_y,
+                                   size_t width,
+                                   size_t input1_height,
+                                   size_t input2_height,
+                                   real scale) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  __shared__ real xx[block_size];
+  __shared__ real yy[block_size];
+  __shared__ real xy[block_size];
+
+  xx[tid] = 0.0;
+  yy[tid] = 0.0;
+  xy[tid] = 0.0;
+  __syncthreads();
+
+  prev_out_x += ty * width;
+  prev_grad_x += ty * width;
+  if (input2_height > 1) {
+    prev_out_y += ty * width;
+    prev_grad_y += ty * width;
+  }
+  for (int index = tid; index < width; index += block_size) {
+    real x = prev_out_x[index];
+    real y = prev_out_y[index];
+    xx[tid] += x * x;
+    yy[tid] += y * y;
+    xy[tid] += x * y;
+  }
+  __syncthreads();
+
+  for (int s = block_size / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      xx[tid] += xx[tid + s];
+      yy[tid] += yy[tid + s];
+      xy[tid] += xy[tid + s];
+    }
+    __syncthreads();
+  }
+  if (xy[0] == 0) {
+    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] +=
+          scale * grad[ty] * prev_out_y[index] * reciprocal;
+      if (input2_height > 1) {
+        prev_grad_y[index] +=
+            scale * grad[ty] * prev_out_x[index] * reciprocal;
+      } else {
+        paddle::paddleAtomicAdd(prev_grad_y + index,
+            scale * grad[ty] * prev_out_x[index] * reciprocal);
+      }
+    }
+  } else {
+    real reciprocalXY = 1.0 / xy[0];
+    real reciprocalSquareSumX = 1.0 / xx[0];
+    real reciprocalSquareSumY = 1.0 / yy[0];
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] += output[ty] * grad[ty] *
+          (prev_out_y[index] * reciprocalXY -
+           prev_out_x[index] * reciprocalSquareSumX);
+      if (input2_height > 1) {
+        prev_grad_y[index] += output[ty] * grad[ty] *
+            (prev_out_x[index] * reciprocalXY -
+             prev_out_y[index] * reciprocalSquareSumY);
+      } else {
+        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
+            (prev_out_x[index] * reciprocalXY -
+             prev_out_y[index] * reciprocalSquareSumY));
+      }
+    }
+  }
+}
+
+void hlCossimDerivative(const real* grad,
+                        const real* output,
+                        const real* prev_out_x,
+                        const real* prev_out_y,
+                        real* prev_grad_x,
+                        real* prev_grad_y,
+                        size_t width,
+                        size_t input1_height,
+                        size_t input2_height,
+                        real scale) {
+  CHECK_NOTNULL(grad);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(prev_out_x);
+  CHECK_NOTNULL(prev_out_y);
+  CHECK_NOTNULL(prev_grad_x);
+  CHECK_NOTNULL(prev_grad_y);
+  const int block_size = 256;
+  dim3 threads(block_size, 1);
+  dim3 grid(1, input1_height);
+  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
+    (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
+     input1_height, input2_height, scale);
+  CHECK_SYNC("hlCossimDerivative failed");
+}
+
+template <>
+void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix* out_grad,
+                                     const GpuMatrix* out_val,
+                                     const GpuMatrix* in1_val,
+                                     const GpuMatrix* in2_val,
+                                     GpuMatrix* in1_grad,
+                                     GpuMatrix* in2_grad,
+                                     real scale) {
+  CHECK(out_grad && out_val && in1_val && in2_val && in1_grad && in2_grad);
+  CHECK(out_grad->useGpu_ && out_val->useGpu_ && in1_val->useGpu_
+        && in2_val->useGpu_ && in1_grad->useGpu_ && in2_grad->useGpu_)
+      << "Matrix types are not all GPU";
+
+  size_t dim = in1_val->getWidth();
+  const real* grad = out_grad->getData();
+  const real* out = out_val->getData();
+  const real* prev_out_x = in1_val->getData();
+  const real* prev_out_y = in2_val->getData();
+  real* prev_grad_x = in1_grad->getData();
+  real* prev_grad_y = in2_grad->getData();
+  hlCossimDerivative(grad,
+                     out,
+                     prev_out_x,
+                     prev_out_y,
+                     prev_grad_x,
+                     prev_grad_y,
+                     dim,
+                     in1_val->getHeight(),
+                     in2_val->getHeight(),
+                     scale);
+}
+
 }  // namespace paddle
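One detail worth noting in the kernel above: the launch uses one block per sample (grid(1, input1_height)), so when input2_height == 1 every block accumulates into the same in2_grad row concurrently and a plain += would lose updates; that is why the single-row case goes through paddle::paddleAtomicAdd. A minimal host-side analogue of that race-free accumulation (a sketch using std::atomic in place of the device atomic, not code from the patch):

  #include <atomic>
  #include <cstdio>
  #include <thread>
  #include <vector>

  int main() {
    const int num_samples = 64;         // one "block" per sample
    std::atomic<float> shared_cell{0};  // stands in for one element of in2_grad
    std::vector<std::thread> blocks;
    for (int i = 0; i < num_samples; ++i) {
      blocks.emplace_back([&shared_cell] {
        float contribution = 1.0f;  // a sample's gradient contribution
        // fetch_add on std::atomic<float> needs C++20, so use a CAS loop.
        float old = shared_cell.load();
        while (!shared_cell.compare_exchange_weak(old, old + contribution)) {
        }
      });
    }
    for (auto& t : blocks) t.join();
    std::printf("accumulated %.1f from %d samples\n",
                shared_cell.load(), num_samples);
    return 0;
  }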
diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp
index 49c54620feb3557ae60c2ea624e5f0f2e5934149..0d2ece2b98625a528eecb53ef27182ae8d65eeb4 100644
--- a/paddle/function/CosSimOpTest.cpp
+++ b/paddle/function/CosSimOpTest.cpp
@@ -50,7 +50,7 @@ void testCosSimForward(size_t height_x,
   autotest::TensorCheckErr(cpu_out, gpu_out);
 }
 
-TEST(Matrix, cosSim) {
+TEST(Matrix, cosSimForward) {
   for (auto height_x : {10, 100, 1000}) {
     for (auto height_y : {1, height_x}) {
       for (auto width : {10, 100, 1000}) {
@@ -61,3 +61,71 @@
     }
   }
 }
+
+void testCosSimBackward(size_t height_x,
+                        size_t height_y,
+                        size_t width,
+                        real scale) {
+  FunctionCompare compare("CosSimBackward", FuncConfig().set("scale", scale));
+
+  CpuMatrix cpu_out_grad(height_x, 1);
+  CpuMatrix cpu_out_val(height_x, 1);
+  CpuMatrix cpu_in1_val(height_x, width);
+  CpuMatrix cpu_in2_val(height_x, width);
+  CpuMatrix cpu_in1_grad(height_x, width);
+  CpuMatrix cpu_in2_grad(height_x, width);
+
+  cpu_out_grad.randomizeUniform();
+  cpu_out_val.randomizeUniform();
+  cpu_in1_val.randomizeUniform();
+  cpu_in2_val.randomizeUniform();
+  cpu_in1_grad.randomizeUniform();
+  cpu_in2_grad.randomizeUniform();
+
+  GpuMatrix gpu_out_grad(height_x, 1);
+  GpuMatrix gpu_out_val(height_x, 1);
+  GpuMatrix gpu_in1_val(height_x, width);
+  GpuMatrix gpu_in2_val(height_x, width);
+  GpuMatrix gpu_in1_grad(height_x, width);
+  GpuMatrix gpu_in2_grad(height_x, width);
+
+  gpu_out_grad.copyFrom(cpu_out_grad);
+  gpu_out_val.copyFrom(cpu_out_val);
+  gpu_in1_val.copyFrom(cpu_in1_val);
+  gpu_in2_val.copyFrom(cpu_in2_val);
+  gpu_in1_grad.copyFrom(cpu_in1_grad);
+  gpu_in2_grad.copyFrom(cpu_in2_grad);
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_out_val.getData(), Dims{height_x, 1}),
+       Tensor(cpu_in1_val.getData(), Dims{height_x, width}),
+       Tensor(cpu_in2_val.getData(), Dims{height_x, width}),
+       Tensor(cpu_in1_grad.getData(), Dims{height_x, width}),
+       Tensor(cpu_in2_grad.getData(), Dims{height_x, width})},
+      {Tensor(cpu_out_grad.getData(), Dims{height_x, 1})},
+      {});
+
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_out_val.getData(), Dims{height_x, 1}),
+       Tensor(gpu_in1_val.getData(), Dims{height_x, width}),
+       Tensor(gpu_in2_val.getData(), Dims{height_x, width}),
+       Tensor(gpu_in1_grad.getData(), Dims{height_x, width}),
+       Tensor(gpu_in2_grad.getData(), Dims{height_x, width})},
+      {Tensor(gpu_out_grad.getData(), Dims{height_x, 1})},
+      {});
+
+  autotest::TensorCheckErr(cpu_in1_grad, gpu_in1_grad);
+  autotest::TensorCheckErr(cpu_in2_grad, gpu_in2_grad);
+}
+
+TEST(Matrix, cosSimBackward) {
+  for (auto height_x : {1, 10, 100}) {
+    for (auto height_y : {1, height_x}) {
+      for (auto width : {1, 10, 100}) {
+        for (auto scale : {1.0, 2.0}) {
+          testCosSimBackward(height_x, height_y, width, scale);
+        }
+      }
+    }
+  }
+}
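Finally, a standalone numeric sanity check of the scalar backward formula that the CPU and GPU kernels share, for a single 2-D sample with out_grad = 1 and scale = 1. This is a sketch for hand verification only (not part of the patch) and compiles with any C++ compiler:

  #include <cmath>
  #include <cstdio>

  int main() {
    // One sample: x = (1, 2), y = (2, 1), out_grad = 1, scale = 1.
    const double x[2] = {1.0, 2.0};
    const double y[2] = {2.0, 1.0};
    const double grad = 1.0;

    double xx = 0, yy = 0, xy = 0;
    for (int j = 0; j < 2; ++j) {
      xx += x[j] * x[j];  // 5
      yy += y[j] * y[j];  // 5
      xy += x[j] * y[j];  // 4
    }
    const double out = xy / (std::sqrt(xx) * std::sqrt(yy));  // cos = 0.8

    // Same form as the xy != 0 branch of the CPU loop.
    double dx[2], dy[2];
    for (int j = 0; j < 2; ++j) {
      dx[j] = out * grad * (y[j] / xy - x[j] / xx);
      dy[j] = out * grad * (x[j] / xy - y[j] / yy);
    }
    // Expected: dx = (0.24, -0.12), dy = (-0.12, 0.24).
    std::printf("cos = %.2f, dx = (%.2f, %.2f), dy = (%.2f, %.2f)\n",
                out, dx[0], dx[1], dy[0], dy[1]);
    return 0;
  }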