hierarchical_sigmoid_op.h 9.1 KB
Newer Older
Y
Yancey1989 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
W
weixing02 已提交
16
#include <iostream>
#include <memory>
#include <set>
#include <vector>
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/transform.h"
Y
Yancey1989 已提交
25 26 27
namespace paddle {
namespace operators {

Y
Yancey1989 已提交
28 29 30
// Local shorthand for framework::EigenMatrix. Defaults to row-major layout and
// Eigen's DenseIndex so the kernels below can simply write
// EigenMatrix<T>::From(tensor).
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
Y
Yancey1989 已提交
31
// Make platform::Transform usable unqualified; the forward kernel uses it to
// run ClipFunctor over the PreOut buffer.
using platform::Transform;
Y
Yancey1989 已提交
32

33
// Collects the distinct non-negative entries of a 2-D int64 path table.
//
// `path` is read as a dense row-major matrix of shape
// [path.dims()[0], path.dims()[1]] holding int64_t values. Negative entries
// are skipped (presumably padding for paths shorter than the table width --
// confirm against the operator's PTable contract).
//
// Returns the remaining values de-duplicated and in ascending order (the
// iteration order of std::set).
//
// Declared `inline` because this is a non-template function defined in a
// header: without it, every translation unit including this file would emit
// its own external definition and the link would fail with duplicate symbols.
inline std::vector<int64_t> cal_rows(const framework::LoDTensor& path) {
  const size_t num_rows = static_cast<size_t>(path.dims()[0]);
  const size_t num_cols = static_cast<size_t>(path.dims()[1]);
  const size_t num_entries = num_rows * num_cols;
  if (num_entries == 0) {
    // Nothing to scan; do not touch the tensor's buffer at all.
    return std::vector<int64_t>();
  }

  // Fetch the buffer once instead of once per element.
  const int64_t* entries = path.data<int64_t>();
  std::set<int64_t> unique_ids;
  for (size_t idx = 0; idx < num_entries; ++idx) {
    const int64_t id = entries[idx];
    if (id >= 0) {
      unique_ids.insert(id);
    }
  }
  return std::vector<int64_t>(unique_ids.begin(), unique_ids.end());
}

Y
Yancey1989 已提交
49
template <typename DeviceContext, typename T>
Y
Yancey1989 已提交
50 51
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 public:
Y
Yancey1989 已提交
52
  void Compute(const framework::ExecutionContext& ctx) const override {
J
JiabinYang 已提交
53 54 55 56 57 58 59 60
    auto* in = ctx.Input<framework::LoDTensor>("X");
    auto* w = ctx.Input<framework::LoDTensor>("W");
    auto* path = ctx.Input<framework::LoDTensor>("PTable");
    auto* code = ctx.Input<framework::LoDTensor>("PCode");
    auto* label = ctx.Input<framework::LoDTensor>("Label");
    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
Y
Yancey1989 已提交
61
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
62 63 64 65 66 67
    bool is_custom = false;
    if (path) {
      is_custom = true;
    }
    int64_t code_length =
        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
Y
Yancey1989 已提交
68
    int64_t batch_size = in->dims()[0];
J
JiabinYang 已提交
69
    framework::LoDTensor sum;
W
weixing02 已提交
70
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
G
guosheng 已提交
71
    auto* pre_out_data = pre_out->mutable_data<T>(
Y
Yancey1989 已提交
72
        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
W
weixing02 已提交
73
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
G
guosheng 已提交
74 75
    // Not all class(leaf) nodes' path lengths equal code_length, thus init as
    // 0s can avoid out of path's loss.
76
    math::SetConstant<DeviceContext, T> zero;
W
weixing02 已提交
77
    zero(dev_ctx, pre_out, static_cast<T>(0.0));
Y
Yancey1989 已提交
78 79
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::RowwiseSum<DeviceContext, T> row_sum;
80 81 82 83 84 85 86 87 88

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
                                                       label->data<int64_t>()));
    } else {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(path, code,
                                                       label->data<int64_t>()));
    }
Y
Yancey1989 已提交
89

Y
Yancey1989 已提交
90 91
    std::vector<int64_t> sum_dims({batch_size, 1UL});
    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
Y
Yancey1989 已提交
92
    auto sum_mat = EigenMatrix<T>::From(sum);
Y
Yancey1989 已提交
93
    out->mutable_data<T>(ctx.GetPlace());
Y
Yancey1989 已提交
94
    auto out_mat = framework::EigenVector<T>::Flatten(*out);
Y
Yancey1989 已提交
95
    if (bias) {
96
      bit_code->Add(*bias, pre_out);
Y
Yancey1989 已提交
97
    }
98
    bit_code->Mul(pre_out, *w, *in);
G
guosheng 已提交
99
    // clip to [-40, 40]
Y
Yancey1989 已提交
100 101
    Transform<DeviceContext> trans;
    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
W
weixing02 已提交
102
          pre_out_data + pre_out->numel(), pre_out_data,
Y
Yancey1989 已提交
103
          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
104
    bit_code->Sum(*pre_out, out, static_cast<T>(-1));
G
guosheng 已提交
105
    // use softrelu to calculate cross entropy
Y
Yancey1989 已提交
106
    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
W
weixing02 已提交
107
    row_sum(dev_ctx, *pre_out, &sum);
108 109 110 111
    // TODO(guosheng): Subtract the out of path's loss, since not all
    // class(leaf) nodes' path lengths equal code_length. But it won't break the
    // gradient check since both have the out of path's loss and will cancel out
    // each other.
Y
Yancey1989 已提交
112
    out_mat.device(place) = sum_mat + out_mat;
Y
Yancey1989 已提交
113
  }
Y
Yancey1989 已提交
114 115
};

Y
Yancey1989 已提交
116
template <typename DeviceContext, typename T>
Y
Yancey1989 已提交
117 118
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
 public:
Y
Yancey1989 已提交
119
  void Compute(const framework::ExecutionContext& ctx) const override {
J
JiabinYang 已提交
120 121 122 123
    auto* in = ctx.Input<framework::LoDTensor>("X");
    auto* w = ctx.Input<framework::LoDTensor>("W");
    auto* path = ctx.Input<framework::LoDTensor>("PTable");
    auto* code = ctx.Input<framework::LoDTensor>("PCode");
J
JiabinYang 已提交
124
    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
J
JiabinYang 已提交
125 126 127 128 129 130 131
    auto* in_grad =
        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    bool is_sparse = ctx.Attr<bool>("is_sparse");
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> zero;
    auto* label = ctx.Input<framework::LoDTensor>("Label");
    auto* pre_out = ctx.Input<framework::LoDTensor>("PreOut");
W
weixing02 已提交
132
    auto* out_grad =
J
JiabinYang 已提交
133 134
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
    framework::LoDTensor pre_out_grad;
135 136 137 138

    pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
    in_grad->mutable_data<T>(ctx.GetPlace());
    zero(dev_ctx, in_grad, static_cast<T>(0.0));
W
weixing02 已提交
139

Y
Yancey1989 已提交
140
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
141 142 143 144 145 146 147 148 149 150 151 152 153 154

    bool is_custom = false;
    if (path) {
      is_custom = true;
    }

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
                                                       label->data<int64_t>()));
    } else {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(path, code,
                                                       label->data<int64_t>()));
    }
155

Y
Yancey1989 已提交
156
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
W
weixing02 已提交
157 158
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
W
weixing02 已提交
159
    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
J
JiabinYang 已提交
160

J
JiabinYang 已提交
161
    Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};
162 163 164 165

    // softrelu derivative
    pre_out_grad_mat.device(place) =
        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
166
    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
W
weixing02 已提交
167
    pre_out_grad_mat.device(place) =
168
        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
G
guosheng 已提交
169 170
    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
    // be consistent with the clipping in forward.
J
JiabinYang 已提交
171

J
JiabinYang 已提交
172
    if (!is_sparse) {
J
JiabinYang 已提交
173 174 175 176 177 178 179
      auto* bias_grad =
          ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
      if (bias_grad) {
        bias_grad->mutable_data<T>(ctx.GetPlace());
        zero(dev_ctx, bias_grad, static_cast<T>(0.0));
        bit_code->AddGrad(pre_out_grad, bias_grad);
      }
J
JiabinYang 已提交
180 181 182 183 184 185
      auto* w_grad =
          ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
      w_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, w_grad, static_cast<T>(0.0));
      bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
    } else {
186
      framework::Vector<int64_t> real_rows = cal_rows(*path);
J
JiabinYang 已提交
187 188 189
      auto* w_grad =
          ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
      w_grad->set_rows(real_rows);
190
      // Build a map of id -> row_index to speed up finding the index of one id
J
JiabinYang 已提交
191
      w_grad->SyncIndex();
J
JiabinYang 已提交
192
      w_grad->set_height(w->dims()[0]);
J
JiabinYang 已提交
193 194 195 196 197 198
      auto* w_grad_value = w_grad->mutable_value();
      framework::DDim temp_dim(w->dims());
      set(temp_dim, 0, real_rows.size());

      w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
      zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
J
JiabinYang 已提交
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
      auto* bias_grad =
          ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
      if (bias_grad) {
        bias_grad->set_rows(real_rows);
        // build ids -> rows index map
        bias_grad->SyncIndex();
        bias_grad->set_height(bias->dims()[0]);
        auto* bias_grad_value = bias_grad->mutable_value();
        std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
                                     bias->dims()[1]};
        bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
                                         ctx.GetPlace());
        zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
        bit_code->AddGrad(pre_out_grad, bias_grad);
      }
J
JiabinYang 已提交
214 215
      bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
    }
216
    bit_code->MulGradError(pre_out_grad, *w, in_grad);
Y
Yancey1989 已提交
217
  }
Y
Yancey1989 已提交
218 219 220 221
};

}  // namespace operators
}  // namespace paddle