// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { namespace math = paddle::operators::math; template void HierarchicalSigmoidGradKernelImpl( const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, const DenseTensor& pre_out, const DenseTensor& out_grad, paddle::optional path, paddle::optional code, paddle::optional bias, int num_classes, bool remote_prefetch, int trainer_id, const std::vector& height_sections, const std::vector& epmap, const std::vector& table_names, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, DenseTensor* bias_grad, SelectedRows* w_grad_sr = nullptr) { funcs::SetConstant zero; DenseTensor pre_out_grad; pre_out_grad.Resize(pre_out.dims()); ctx.template Alloc(&pre_out_grad); ctx.template Alloc(x_grad); zero(ctx, x_grad, static_cast(0.0)); bool is_custom = false; if (path.get_ptr()) { is_custom = true; } std::unique_ptr> bit_code; if (!is_custom) { bit_code.reset(new math::MatrixBitCodeFunctor( num_classes, label.template data())); } else { bit_code.reset(new math::MatrixBitCodeFunctor( *(path.get_ptr()), *(code.get_ptr()), label.template data())); } // softrelu derivative auto blas = funcs::GetBlas(ctx); auto* pre_out_grad_data = pre_out_grad.data(); auto* pre_out_data = pre_out.template data(); auto n = pre_out.numel(); blas.VEXP(n, pre_out_data, pre_out_grad_data); blas.VINV(n, pre_out_grad_data, pre_out_grad_data); for (int64_t i = 0; i < n; ++i) { pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; } bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) auto* out_grad_data = out_grad.template data(); int64_t dim0 = pre_out_grad.dims()[0]; int64_t dim1 = pre_out_grad.dims()[1]; for (int64_t i = 0; i < dim0; ++i) { T tmp = out_grad_data[i]; blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); } // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. if (bias_grad) { ctx.template Alloc(bias_grad); zero(ctx, bias_grad, static_cast(0.0)); bit_code->AddGrad(pre_out_grad, bias_grad); } ctx.template Alloc(w_grad); zero(ctx, w_grad, static_cast(0.0)); if (!is_sparse) { bit_code->MulGradWeight(pre_out_grad, w_grad, x); } else { bit_code->MulGradWeight(pre_out_grad, w_grad_sr, x); } bit_code->MulGradError(pre_out_grad, w, x_grad); } } // namespace phi