hierarchical_sigmoid_op.h 9.2 KB
Newer Older
Y
Yancey1989 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
W
weixing02 已提交
16
#include <iostream>
J
JiabinYang 已提交
17
#include <set>
W
weixing02 已提交
18
#include <vector>
J
JiabinYang 已提交
19
#include "paddle/fluid/framework/mixed_vector.h"
W
weixing02 已提交
20 21 22 23 24
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
Y
Yancey1989 已提交
25 26 27
namespace paddle {
namespace operators {

Y
Yancey1989 已提交
28 29 30
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
Y
Yancey1989 已提交
31
using platform::Transform;
Y
Yancey1989 已提交
32

J
JiabinYang 已提交
33 34 35 36 37 38 39 40 41 42 43 44 45
/**
 * Collect the distinct non-negative entries of a 2-D path table.
 *
 * @param path  Tensor whose dims()[0] x dims()[1] entries are read as a
 *              row-major int64_t buffer. Must not be null: it is
 *              dereferenced unconditionally.
 * @return The unique entries that are >= 0, in ascending order (the order
 *         in which std::set iterates).
 *
 * Negative entries are skipped (presumably padding for paths shorter than the
 * table width -- confirm against the producer of PTable).
 *
 * Declared `inline` because the definition lives in a header: without it,
 * each translation unit that includes this file emits its own external
 * definition, which violates the one-definition rule at link time.
 */
inline std::vector<int64_t> cal_rows(const framework::LoDTensor* path) {
  const size_t num_rows = static_cast<size_t>(path->dims()[0]);
  const size_t num_cols = static_cast<size_t>(path->dims()[1]);
  const size_t total = num_rows * num_cols;

  std::set<int64_t> unique_ids;
  // Only touch the buffer when there is something to read, so an empty table
  // never triggers a data() access.
  if (total > 0) {
    const int64_t* ids = path->data<int64_t>();
    for (size_t i = 0; i < total; ++i) {
      if (ids[i] >= 0) {
        unique_ids.insert(ids[i]);
      }
    }
  }
  return std::vector<int64_t>(unique_ids.begin(), unique_ids.end());
}

Y
Yancey1989 已提交
50
template <typename DeviceContext, typename T>
Y
Yancey1989 已提交
51 52
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 public:
Y
Yancey1989 已提交
53
  void Compute(const framework::ExecutionContext& ctx) const override {
J
JiabinYang 已提交
54 55 56 57 58 59 60 61
    auto* in = ctx.Input<framework::LoDTensor>("X");
    auto* w = ctx.Input<framework::LoDTensor>("W");
    auto* path = ctx.Input<framework::LoDTensor>("PTable");
    auto* code = ctx.Input<framework::LoDTensor>("PCode");
    auto* label = ctx.Input<framework::LoDTensor>("Label");
    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
Y
Yancey1989 已提交
62
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
63 64 65 66 67 68 69 70
    bool is_custom = false;
    if (path) {
      is_custom = true;
    } else {
      is_custom = false;
    }
    int64_t code_length =
        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
Y
Yancey1989 已提交
71
    int64_t batch_size = in->dims()[0];
J
JiabinYang 已提交
72
    framework::LoDTensor sum;
W
weixing02 已提交
73
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
G
guosheng 已提交
74
    auto* pre_out_data = pre_out->mutable_data<T>(
Y
Yancey1989 已提交
75
        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
W
weixing02 已提交
76
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
G
guosheng 已提交
77 78
    // Not all class(leaf) nodes' path lengths equal code_length, thus init as
    // 0s can avoid out of path's loss.
79
    math::SetConstant<DeviceContext, T> zero;
W
weixing02 已提交
80
    zero(dev_ctx, pre_out, static_cast<T>(0.0));
Y
Yancey1989 已提交
81 82
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::RowwiseSum<DeviceContext, T> row_sum;
83 84 85 86 87 88 89 90 91

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
                                                       label->data<int64_t>()));
    } else {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(path, code,
                                                       label->data<int64_t>()));
    }
Y
Yancey1989 已提交
92

Y
Yancey1989 已提交
93 94
    std::vector<int64_t> sum_dims({batch_size, 1UL});
    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
Y
Yancey1989 已提交
95
    auto sum_mat = EigenMatrix<T>::From(sum);
Y
Yancey1989 已提交
96
    out->mutable_data<T>(ctx.GetPlace());
Y
Yancey1989 已提交
97
    auto out_mat = framework::EigenVector<T>::Flatten(*out);
Y
Yancey1989 已提交
98
    if (bias) {
99
      bit_code->Add(pre_out, *bias);
Y
Yancey1989 已提交
100
    }
101
    bit_code->Mul(pre_out, *w, *in);
G
guosheng 已提交
102
    // clip to [-40, 40]
Y
Yancey1989 已提交
103 104
    Transform<DeviceContext> trans;
    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
W
weixing02 已提交
105
          pre_out_data + pre_out->numel(), pre_out_data,
Y
Yancey1989 已提交
106
          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
107
    bit_code->Sum(*pre_out, out, static_cast<T>(-1));
G
guosheng 已提交
108
    // use softrelu to calculate cross entropy
Y
Yancey1989 已提交
109
    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
W
weixing02 已提交
110
    row_sum(dev_ctx, *pre_out, &sum);
111 112 113 114
    // TODO(guosheng): Subtract the out of path's loss, since not all
    // class(leaf) nodes' path lengths equal code_length. But it won't break the
    // gradient check since both have the out of path's loss and will cancel out
    // each other.
Y
Yancey1989 已提交
115
    out_mat.device(place) = sum_mat + out_mat;
Y
Yancey1989 已提交
116
  }
Y
Yancey1989 已提交
117 118
};

Y
Yancey1989 已提交
119
template <typename DeviceContext, typename T>
Y
Yancey1989 已提交
120 121
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
 public:
Y
Yancey1989 已提交
122
  void Compute(const framework::ExecutionContext& ctx) const override {
J
JiabinYang 已提交
123 124 125 126
    auto* in = ctx.Input<framework::LoDTensor>("X");
    auto* w = ctx.Input<framework::LoDTensor>("W");
    auto* path = ctx.Input<framework::LoDTensor>("PTable");
    auto* code = ctx.Input<framework::LoDTensor>("PCode");
J
JiabinYang 已提交
127
    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
J
JiabinYang 已提交
128 129 130 131 132 133 134
    auto* in_grad =
        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    bool is_sparse = ctx.Attr<bool>("is_sparse");
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> zero;
    auto* label = ctx.Input<framework::LoDTensor>("Label");
    auto* pre_out = ctx.Input<framework::LoDTensor>("PreOut");
W
weixing02 已提交
135
    auto* out_grad =
J
JiabinYang 已提交
136 137
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
    framework::LoDTensor pre_out_grad;
138 139 140 141

    pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
    in_grad->mutable_data<T>(ctx.GetPlace());
    zero(dev_ctx, in_grad, static_cast<T>(0.0));
W
weixing02 已提交
142

Y
Yancey1989 已提交
143
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159

    bool is_custom = false;
    if (path) {
      is_custom = true;
    } else {
      is_custom = false;
    }

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
                                                       label->data<int64_t>()));
    } else {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(path, code,
                                                       label->data<int64_t>()));
    }
160

Y
Yancey1989 已提交
161
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
W
weixing02 已提交
162 163
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
W
weixing02 已提交
164
    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
J
JiabinYang 已提交
165

J
JiabinYang 已提交
166
    Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};
167 168 169 170

    // softrelu derivative
    pre_out_grad_mat.device(place) =
        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
171
    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
W
weixing02 已提交
172
    pre_out_grad_mat.device(place) =
173
        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
G
guosheng 已提交
174 175
    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
    // be consistent with the clipping in forward.
J
JiabinYang 已提交
176

J
JiabinYang 已提交
177
    if (!is_sparse) {
J
JiabinYang 已提交
178 179 180 181 182 183 184
      auto* bias_grad =
          ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
      if (bias_grad) {
        bias_grad->mutable_data<T>(ctx.GetPlace());
        zero(dev_ctx, bias_grad, static_cast<T>(0.0));
        bit_code->AddGrad(pre_out_grad, bias_grad);
      }
J
JiabinYang 已提交
185 186 187 188 189 190 191 192 193 194 195 196
      auto* w_grad =
          ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
      w_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, w_grad, static_cast<T>(0.0));
      bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
    } else {
      framework::Vector<int64_t> real_rows = cal_rows(path);
      auto* w_grad =
          ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
      w_grad->set_rows(real_rows);
      // build ids -> rows index map
      w_grad->SyncIndex();
J
JiabinYang 已提交
197
      w_grad->set_height(w->dims()[0]);
J
JiabinYang 已提交
198 199 200 201 202 203
      auto* w_grad_value = w_grad->mutable_value();
      framework::DDim temp_dim(w->dims());
      set(temp_dim, 0, real_rows.size());

      w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
      zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
J
JiabinYang 已提交
204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
      auto* bias_grad =
          ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
      if (bias_grad) {
        bias_grad->set_rows(real_rows);
        // build ids -> rows index map
        bias_grad->SyncIndex();
        bias_grad->set_height(bias->dims()[0]);
        auto* bias_grad_value = bias_grad->mutable_value();
        std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
                                     bias->dims()[1]};
        bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
                                         ctx.GetPlace());
        zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
        bit_code->AddGrad(pre_out_grad, bias_grad);
      }
J
JiabinYang 已提交
219 220
      bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
    }
221
    bit_code->MulGradError(pre_out_grad, *w, in_grad);
Y
Yancey1989 已提交
222
  }
Y
Yancey1989 已提交
223 224 225 226
};

}  // namespace operators
}  // namespace paddle