hierarchical_sigmoid_op.h 8.8 KB
Newer Older
Y
Yancey1989 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
Q
Qiao Longfei 已提交
16

W
weixing02 已提交
17
#include <iostream>
18
#include <iterator>
Q
Qiao Longfei 已提交
19
#include <memory>
J
JiabinYang 已提交
20
#include <set>
21
#include <string>
W
weixing02 已提交
22
#include <vector>
Q
Qiao Longfei 已提交
23

J
JiabinYang 已提交
24
#include "paddle/fluid/framework/mixed_vector.h"
W
weixing02 已提交
25 26
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
J
JiabinYang 已提交
27
#include "paddle/fluid/operators/detail/safe_ref.h"
W
weixing02 已提交
28 29 30
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
J
JiabinYang 已提交
31

32 33 34 35
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif

Y
Yancey1989 已提交
36 37 38
namespace paddle {
namespace operators {

Y
Yancey1989 已提交
39 40 41
// File-local shorthand for framework::EigenMatrix whose layout defaults to
// Eigen::RowMajor and whose index type defaults to Eigen::DenseIndex.
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
Y
Yancey1989 已提交
42
using platform::Transform;
Y
Yancey1989 已提交
43

J
JiabinYang 已提交
44 45
// Returns the distinct non-negative int64 entries stored in `path`, sorted in
// ascending order. Negative entries are skipped.
static std::vector<int64_t> PathToRows(const framework::LoDTensor& path) {
  const int64_t* cursor = path.data<int64_t>();
  const int64_t* const end = cursor + path.numel();

  // std::set both de-duplicates and orders the collected ids.
  std::set<int64_t> unique_rows;
  for (; cursor != end; ++cursor) {
    if (*cursor >= 0) {
      unique_rows.emplace(*cursor);
    }
  }
  return std::vector<int64_t>(unique_rows.begin(), unique_rows.end());
}
Y
Yancey1989 已提交
56
template <typename DeviceContext, typename T>
Y
Yancey1989 已提交
57 58
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 public:
Y
Yancey1989 已提交
59
  void Compute(const framework::ExecutionContext& ctx) const override {
60 61
    auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
    auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
62
    auto* path = ctx.Input<framework::LoDTensor>("PathTable");
J
JiabinYang 已提交
63
    auto* code = ctx.Input<framework::LoDTensor>("PathCode");
64
    auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
J
JiabinYang 已提交
65 66 67
    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
Y
Yancey1989 已提交
68
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
69 70
    // for remote prefetch

71 72 73 74 75 76
    bool is_custom = false;
    if (path) {
      is_custom = true;
    }
    int64_t code_length =
        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
J
JiabinYang 已提交
77
    int64_t batch_size = in.dims()[0];
J
JiabinYang 已提交
78
    framework::LoDTensor sum;
W
weixing02 已提交
79
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
G
guosheng 已提交
80
    auto* pre_out_data = pre_out->mutable_data<T>(
Y
Yancey1989 已提交
81
        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
W
weixing02 已提交
82
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
G
guosheng 已提交
83 84
    // Not all class(leaf) nodes' path lengths equal code_length, thus init as
    // 0s can avoid out of path's loss.
85
    math::SetConstant<DeviceContext, T> zero;
W
weixing02 已提交
86
    zero(dev_ctx, pre_out, static_cast<T>(0.0));
Y
Yancey1989 已提交
87 88
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::RowwiseSum<DeviceContext, T> row_sum;
89 90 91 92

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
J
JiabinYang 已提交
93
                                                       label.data<int64_t>()));
94
    } else {
J
JiabinYang 已提交
95 96
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(*path, *code,
                                                       label.data<int64_t>()));
97
    }
Y
Yancey1989 已提交
98

Y
Yancey1989 已提交
99 100
    std::vector<int64_t> sum_dims({batch_size, 1UL});
    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
Y
Yancey1989 已提交
101
    auto sum_mat = EigenMatrix<T>::From(sum);
Y
Yancey1989 已提交
102
    out->mutable_data<T>(ctx.GetPlace());
103
    auto out_mat = framework::EigenMatrix<T>::From(*out);
Y
Yancey1989 已提交
104
    if (bias) {
105
      bit_code->Add(*bias, pre_out);
Y
Yancey1989 已提交
106
    }
J
JiabinYang 已提交
107
    bit_code->Mul(pre_out, w, in);
G
guosheng 已提交
108
    // clip to [-40, 40]
Y
Yancey1989 已提交
109 110
    Transform<DeviceContext> trans;
    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
W
weixing02 已提交
111
          pre_out_data + pre_out->numel(), pre_out_data,
Y
Yancey1989 已提交
112
          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
113
    bit_code->Sum(*pre_out, out, static_cast<T>(-1));
G
guosheng 已提交
114
    // use softrelu to calculate cross entropy
Y
Yancey1989 已提交
115
    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
W
weixing02 已提交
116
    row_sum(dev_ctx, *pre_out, &sum);
117 118 119 120
    // TODO(guosheng): Subtract the out of path's loss, since not all
    // class(leaf) nodes' path lengths equal code_length. But it won't break the
    // gradient check since both have the out of path's loss and will cancel out
    // each other.
Y
Yancey1989 已提交
121
    out_mat.device(place) = sum_mat + out_mat;
Y
Yancey1989 已提交
122
  }
Y
Yancey1989 已提交
123 124
};

Y
Yancey1989 已提交
125
template <typename DeviceContext, typename T>
Y
Yancey1989 已提交
126 127
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
 public:
Y
Yancey1989 已提交
128
  void Compute(const framework::ExecutionContext& ctx) const override {
129 130
    auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
    auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
131
    auto* path = ctx.Input<framework::LoDTensor>("PathTable");
J
JiabinYang 已提交
132
    auto* code = ctx.Input<framework::LoDTensor>("PathCode");
J
JiabinYang 已提交
133 134 135 136 137
    auto* in_grad =
        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    bool is_sparse = ctx.Attr<bool>("is_sparse");
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> zero;
138 139 140
    auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
    auto& pre_out = detail::Ref(ctx.Input<framework::LoDTensor>("PreOut"));
    auto& out_grad = detail::Ref(
J
JiabinYang 已提交
141
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out")));
J
JiabinYang 已提交
142
    framework::LoDTensor pre_out_grad;
143

J
JiabinYang 已提交
144
    pre_out_grad.mutable_data<T>(pre_out.dims(), ctx.GetPlace());
145 146
    in_grad->mutable_data<T>(ctx.GetPlace());
    zero(dev_ctx, in_grad, static_cast<T>(0.0));
W
weixing02 已提交
147

Y
Yancey1989 已提交
148
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
149 150 151 152 153 154 155 156 157

    bool is_custom = false;
    if (path) {
      is_custom = true;
    }

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
J
JiabinYang 已提交
158
                                                       label.data<int64_t>()));
159
    } else {
J
JiabinYang 已提交
160 161
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(*path, *code,
                                                       label.data<int64_t>()));
162
    }
163

Y
Use mkl  
Yu Yang 已提交
164
    // softrelu derivative
J
JiabinYang 已提交
165

Y
Use mkl  
Yu Yang 已提交
166
    auto blas = math::GetBlas<DeviceContext, T>(ctx);
167

Y
Use mkl  
Yu Yang 已提交
168 169 170 171 172 173 174 175
    auto* pre_out_grad_data = pre_out_grad.data<T>();
    auto* pre_out_data = pre_out.data<T>();
    auto n = pre_out.numel();
    blas.VEXP(n, pre_out_data, pre_out_grad_data);
    blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
    for (int64_t i = 0; i < n; ++i) {
      pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
    }
176
    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
Y
Use mkl  
Yu Yang 已提交
177 178 179 180 181 182 183 184
    auto* out_grad_data = out_grad.data<T>();

    int64_t dim0 = pre_out_grad.dims()[0];
    int64_t dim1 = pre_out_grad.dims()[1];
    for (int64_t i = 0; i < dim0; ++i) {
      T tmp = out_grad_data[i];
      blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1);
    }
G
guosheng 已提交
185 186
    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
    // be consistent with the clipping in forward.
187 188 189 190 191 192 193
    auto* bias_grad =
        ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
    if (bias_grad) {
      bias_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
      bit_code->AddGrad(pre_out_grad, bias_grad);
    }
J
JiabinYang 已提交
194 195 196 197 198
    if (!is_sparse) {
      auto* w_grad =
          ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
      w_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, w_grad, static_cast<T>(0.0));
J
JiabinYang 已提交
199
      bit_code->MulGradWeight(pre_out_grad, w_grad, in);
J
JiabinYang 已提交
200
    } else {
201 202
      PADDLE_ENFORCE(path != nullptr,
                     "Sparse mode should not be used without custom tree!");
J
JiabinYang 已提交
203
      framework::Vector<int64_t> real_rows = PathToRows(*path);
J
JiabinYang 已提交
204 205 206
      auto* w_grad =
          ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
      w_grad->set_rows(real_rows);
207
      // Build a map of id -> row_index to speed up finding the index of one id
J
JiabinYang 已提交
208
      w_grad->set_height(w.dims()[0]);
J
JiabinYang 已提交
209
      auto* w_grad_value = w_grad->mutable_value();
J
JiabinYang 已提交
210
      framework::DDim temp_dim(w.dims());
211
      temp_dim[0] = real_rows.size();
J
JiabinYang 已提交
212 213
      w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
      zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
J
JiabinYang 已提交
214
      bit_code->MulGradWeight(pre_out_grad, w_grad, in);
J
JiabinYang 已提交
215
    }
J
JiabinYang 已提交
216
    bit_code->MulGradError(pre_out_grad, w, in_grad);
Y
Yancey1989 已提交
217
  }
Y
Yancey1989 已提交
218 219 220 221
};

}  // namespace operators
}  // namespace paddle