hierarchical_sigmoid_op.h 8.4 KB
Newer Older
Y
Yancey1989 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
W
weixing02 已提交
16
#include <iostream>
#include <memory>
#include <set>
#include <vector>
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/transform.h"
Y
Yancey1989 已提交
25 26 27
namespace paddle {
namespace operators {

Y
Yancey1989 已提交
28 29 30
// Local shorthand for framework::EigenMatrix that defaults to a row-major
// layout and Eigen::DenseIndex indices, so the kernels below can simply write
// EigenMatrix<T>::From(tensor).
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
Y
Yancey1989 已提交
31
// platform::Transform is used by the forward kernel to apply ClipFunctor over
// the PreOut buffer.
using platform::Transform;
Y
Yancey1989 已提交
32

J
JiabinYang 已提交
33 34 35 36 37 38 39 40 41 42 43 44 45
// Collects the distinct non-negative entries of a path table.
//
// The table is read as a dims()[0] x dims()[1] block of int64_t values laid
// out contiguously; negative entries are skipped.
//
// @param path  Path table tensor. Must not be null; the caller is responsible
//              for checking this before calling.
// @return      The distinct non-negative values, in ascending order (they are
//              gathered through a std::set).
//
// Declared inline because this function is defined in a header: without it,
// every translation unit that includes the header would emit its own
// external definition and the link would fail with duplicate symbols.
inline std::vector<int64_t> cal_rows(const framework::LoDTensor* path) {
  const size_t num_rows = static_cast<size_t>(path->dims()[0]);
  const size_t num_cols = static_cast<size_t>(path->dims()[1]);
  const size_t count = num_rows * num_cols;
  if (count == 0) {
    // Nothing to scan; do not touch the data pointer of an empty table.
    return std::vector<int64_t>();
  }

  // Fetch the data pointer once instead of on every element access.
  const int64_t* ids = path->data<int64_t>();

  std::set<int64_t> unique_ids;
  for (size_t idx = 0; idx < count; ++idx) {
    if (ids[idx] >= 0) {
      unique_ids.insert(ids[idx]);
    }
  }
  return std::vector<int64_t>(unique_ids.begin(), unique_ids.end());
}

Y
Yancey1989 已提交
50
template <typename DeviceContext, typename T>
Y
Yancey1989 已提交
51 52
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 public:
Y
Yancey1989 已提交
53
  void Compute(const framework::ExecutionContext& ctx) const override {
J
JiabinYang 已提交
54 55 56 57 58 59 60 61
    auto* in = ctx.Input<framework::LoDTensor>("X");
    auto* w = ctx.Input<framework::LoDTensor>("W");
    auto* path = ctx.Input<framework::LoDTensor>("PTable");
    auto* code = ctx.Input<framework::LoDTensor>("PCode");
    auto* label = ctx.Input<framework::LoDTensor>("Label");
    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
Y
Yancey1989 已提交
62
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
63 64 65 66 67 68 69 70
    bool is_custom = false;
    if (path) {
      is_custom = true;
    } else {
      is_custom = false;
    }
    int64_t code_length =
        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
Y
Yancey1989 已提交
71
    int64_t batch_size = in->dims()[0];
J
JiabinYang 已提交
72
    framework::LoDTensor sum;
W
weixing02 已提交
73
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
G
guosheng 已提交
74
    auto* pre_out_data = pre_out->mutable_data<T>(
Y
Yancey1989 已提交
75
        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
W
weixing02 已提交
76
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
G
guosheng 已提交
77 78
    // Not all class(leaf) nodes' path lengths equal code_length, thus init as
    // 0s can avoid out of path's loss.
79
    math::SetConstant<DeviceContext, T> zero;
W
weixing02 已提交
80
    zero(dev_ctx, pre_out, static_cast<T>(0.0));
Y
Yancey1989 已提交
81 82
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::RowwiseSum<DeviceContext, T> row_sum;
83 84 85 86 87 88 89 90 91

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
                                                       label->data<int64_t>()));
    } else {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(path, code,
                                                       label->data<int64_t>()));
    }
Y
Yancey1989 已提交
92

Y
Yancey1989 已提交
93 94
    std::vector<int64_t> sum_dims({batch_size, 1UL});
    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
Y
Yancey1989 已提交
95
    auto sum_mat = EigenMatrix<T>::From(sum);
Y
Yancey1989 已提交
96
    out->mutable_data<T>(ctx.GetPlace());
Y
Yancey1989 已提交
97
    auto out_mat = framework::EigenVector<T>::Flatten(*out);
Y
Yancey1989 已提交
98
    if (bias) {
99
      bit_code->Add(pre_out, *bias);
Y
Yancey1989 已提交
100
    }
101
    bit_code->Mul(pre_out, *w, *in);
G
guosheng 已提交
102
    // clip to [-40, 40]
Y
Yancey1989 已提交
103 104
    Transform<DeviceContext> trans;
    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
W
weixing02 已提交
105
          pre_out_data + pre_out->numel(), pre_out_data,
Y
Yancey1989 已提交
106
          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
107
    bit_code->Sum(*pre_out, out, static_cast<T>(-1));
G
guosheng 已提交
108
    // use softrelu to calculate cross entropy
Y
Yancey1989 已提交
109
    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
W
weixing02 已提交
110
    row_sum(dev_ctx, *pre_out, &sum);
111 112 113 114
    // TODO(guosheng): Subtract the out of path's loss, since not all
    // class(leaf) nodes' path lengths equal code_length. But it won't break the
    // gradient check since both have the out of path's loss and will cancel out
    // each other.
Y
Yancey1989 已提交
115
    out_mat.device(place) = sum_mat + out_mat;
Y
Yancey1989 已提交
116
  }
Y
Yancey1989 已提交
117 118
};

Y
Yancey1989 已提交
119
template <typename DeviceContext, typename T>
Y
Yancey1989 已提交
120 121
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
 public:
Y
Yancey1989 已提交
122
  void Compute(const framework::ExecutionContext& ctx) const override {
J
JiabinYang 已提交
123 124 125 126 127 128 129 130 131
    auto* in = ctx.Input<framework::LoDTensor>("X");
    auto* w = ctx.Input<framework::LoDTensor>("W");
    auto* path = ctx.Input<framework::LoDTensor>("PTable");
    auto* code = ctx.Input<framework::LoDTensor>("PCode");
    auto* in_grad =
        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    bool is_sparse = ctx.Attr<bool>("is_sparse");
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> zero;
W
weixing02 已提交
132
    auto* bias_grad =
J
JiabinYang 已提交
133 134 135
        ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
    auto* label = ctx.Input<framework::LoDTensor>("Label");
    auto* pre_out = ctx.Input<framework::LoDTensor>("PreOut");
W
weixing02 已提交
136
    auto* out_grad =
J
JiabinYang 已提交
137 138
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
    framework::LoDTensor pre_out_grad;
139 140 141 142

    pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
    in_grad->mutable_data<T>(ctx.GetPlace());
    zero(dev_ctx, in_grad, static_cast<T>(0.0));
W
weixing02 已提交
143

Y
Yancey1989 已提交
144
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160

    bool is_custom = false;
    if (path) {
      is_custom = true;
    } else {
      is_custom = false;
    }

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
                                                       label->data<int64_t>()));
    } else {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(path, code,
                                                       label->data<int64_t>()));
    }
161

Y
Yancey1989 已提交
162
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
W
weixing02 已提交
163 164
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
W
weixing02 已提交
165
    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
J
JiabinYang 已提交
166

167 168 169 170 171
    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});

    // softrelu derivative
    pre_out_grad_mat.device(place) =
        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
172
    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
W
weixing02 已提交
173
    pre_out_grad_mat.device(place) =
174
        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
G
guosheng 已提交
175 176
    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
    // be consistent with the clipping in forward.
W
weixing02 已提交
177 178
    if (bias_grad) {
      bias_grad->mutable_data<T>(ctx.GetPlace());
179
      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
180
      bit_code->AddGrad(pre_out_grad, bias_grad);
Y
Yancey1989 已提交
181
    }
J
JiabinYang 已提交
182 183 184 185 186 187 188 189 190 191 192 193 194
    if (!is_sparse) {
      auto* w_grad =
          ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
      w_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, w_grad, static_cast<T>(0.0));
      bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
    } else {
      framework::Vector<int64_t> real_rows = cal_rows(path);
      auto* w_grad =
          ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
      w_grad->set_rows(real_rows);
      // build ids -> rows index map
      w_grad->SyncIndex();
J
JiabinYang 已提交
195
      w_grad->set_height(w->dims()[0]);
J
JiabinYang 已提交
196 197 198 199 200 201 202 203
      auto* w_grad_value = w_grad->mutable_value();
      framework::DDim temp_dim(w->dims());
      set(temp_dim, 0, real_rows.size());

      w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
      zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
      bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
    }
204
    bit_code->MulGradError(pre_out_grad, *w, in_grad);
Y
Yancey1989 已提交
205
  }
Y
Yancey1989 已提交
206 207 208 209
};

}  // namespace operators
}  // namespace paddle