/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <memory>

#include "paddle/fluid/operators/fc_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"

namespace paddle {
namespace operators {

using dnnl::inner_product_forward;
using dnnl::memory;
using dnnl::primitive;
using dnnl::prop_kind;
using dnnl::stream;
using framework::DDim;
using framework::ExecutionContext;
using LoDTensor = phi::DenseTensor;
using platform::GetMKLDNNFormat;
using platform::MKLDNNDeviceContext;
using platform::MKLDNNGetDataType;
using platform::to_void_cast;

template <typename T>
constexpr bool IsInt8() {
  return std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
}

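// Handler that builds the oneDNN inner_product (FC) forward primitive and
// provides the reorders needed to bring src/weights/bias into the memory
// layouts chosen by the primitive descriptor.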
template <typename T_in, typename T_w, typename T_out>
class FCMKLDNNHandler
    : public platform::MKLDNNHandlerNoCachingT<T_in,
                                               dnnl::inner_product_forward> {
 public:
  FCMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
                  const platform::MKLDNNDeviceContext& dev_ctx,
                  const phi::DenseTensor* x,
                  const phi::DenseTensor* weights,
                  const phi::DenseTensor* bias,
                  phi::DenseTensor* out,
                  const int in_num_col_dims,
                  dnnl::engine mkldnn_engine,
                  platform::Place cpu_place)
      : platform::MKLDNNHandlerNoCachingT<T_in, dnnl::inner_product_forward>(
            mkldnn_engine, cpu_place),
        dev_ctx_(dev_ctx) {
    this->memory_key_ = ctx.InputName("W");

    auto x_vec_dims = phi::vectorize(x->dims());
    auto weights_vec_dims = phi::vectorize(weights->dims());

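    // Flatten the input: the leading `in_num_col_dims` dims form the batch
    // size (MB) and the remaining dims form the input channels (IC).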
    int MB = 1;
    for (int i = 0; i < in_num_col_dims; ++i) {
      MB *= x_vec_dims[i];
    }

    int IC = 1;
    for (size_t i = in_num_col_dims; i < x_vec_dims.size(); ++i) {
      IC *= x_vec_dims[i];
    }

    int OC = weights_vec_dims[1];

    dnnl::memory::desc bias_md;

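    // format_tag::any lets oneDNN pick the optimal layouts for the primitive;
    // the actual data is brought into those layouts via reorders later on.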
    auto src_md = dnnl::memory::desc(
        {MB, IC}, MKLDNNGetDataType<T_in>(), dnnl::memory::format_tag::any);
    auto weights_md = dnnl::memory::desc(
        {OC, IC}, MKLDNNGetDataType<T_w>(), dnnl::memory::format_tag::any);
    auto dst_md = dnnl::memory::desc(
        {MB, OC}, MKLDNNGetDataType<T_out>(), dnnl::memory::format_tag::any);
    if (bias) {
      bias_md = dnnl::memory::desc({bias->numel()},
                                   MKLDNNGetDataType<float>(),
                                   dnnl::memory::format_tag::a);
    }

    const auto attrs = CreateFCAttrs(ctx);

    this->AcquireForwardPrimitiveDescriptor(attrs,
                                            prop_kind::forward_inference,
                                            src_md,
                                            weights_md,
                                            bias_md,
                                            dst_md);
  }

 private:
  dnnl::primitive_attr CreateFCAttrs(const ExecutionContext& ctx) {
    dnnl::primitive_attr attributes;
    dnnl::post_ops post_operations;

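    // For int8 kernels the input/weight scales are folded into oneDNN output
    // scales; fp32/bf16 kernels keep the default scale of 1.0f.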
    std::vector<float> output_shift_scale;
    float scale = 1.0f;
    if (IsInt8<T_w>()) {
      std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx);
      int mask = CreateMask(1, output_shift_scale.size() > 1);
      attributes.set_output_scales(mask, output_shift_scale);
    }

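    // With residual fusion the FC result is accumulated into the existing
    // destination data through a sum post-op.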
    float sum_scale = 1.0f;
    if (ctx.HasAttr("fuse_residual_connection") &&
        ctx.Attr<bool>("fuse_residual_connection")) {
      post_operations.append_sum(sum_scale);
    }

    // ReLU from "fc_fuse_pass"
    if (ctx.Attr<std::string>("activation_type") == "relu") {
      post_operations.append_eltwise(
          scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
    }
    platform::AppendActivation(ctx, post_operations, scale);

    if (ctx.HasAttr("fused_output_scale")) {
      float scale_alpha = ctx.Attr<float>("fused_output_scale");
      post_operations.append_eltwise(
          1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f);
    }

    attributes.set_post_ops(post_operations);
    return attributes;
  }

  // Compute the bias scales so that the bias values match the scale of the
  // data produced by the input and weights multiplication.
  std::vector<float> ComputeBiasScales(
      const float scale_in, const std::vector<float>& scale_weights) {
    std::vector<float> bias_scales(scale_weights.size());

    for (size_t i = 0; i < bias_scales.size(); ++i) {
      if (scale_weights[i] == 0.0)
        bias_scales[i] = 1.0f;
      else
        bias_scales[i] = scale_in * scale_weights[i];
    }

    return bias_scales;
  }

  // Correct the output scale to account for the scaling of input and weights.
  // Since the data produced by the input and weight multiplication carries
  // their combined scales, it has to be divided by those scales to bring it
  // back to its floating-point range, and is then multiplied by the desired
  // output scale.
  std::tuple<std::vector<float>, float> ComputeOutputShiftScale(
      const ExecutionContext& ctx) {
    auto scale_in_data = ctx.Attr<float>("Scale_in");
    auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
    bool has_activation = !ctx.Attr<std::string>("activation_type").empty();
    bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");

    // If the output will be in floats, we don't multiply by scale_out.

    float scale = (!force_fp32_output && has_activation)
                      ? ctx.Attr<float>("Scale_out")
                      : 1.0f;
    float inner_scale = (force_fp32_output || has_activation)
                            ? 1.0f
                            : ctx.Attr<float>("Scale_out");
    const size_t weight_scales_num = scale_weights_data.size();

    for (size_t i = 0; i < weight_scales_num; ++i) {
      if (scale_weights_data[i] == 0.0)
        scale_weights_data[i] = inner_scale;
      else
        scale_weights_data[i] =
            inner_scale / (scale_in_data * scale_weights_data[i]);
    }

    return make_tuple(scale_weights_data, scale);
  }

  // Computing MKL-DNN's scaling mask which determines along which dimension
  // slice the scaling should be applied. For more details please refer to:
  // https://intel.github.io/mkl-dnn/group__c__api__attributes.html
  // Section dnnl_status_t DNNL_API dnnl_primitive_attr_set_output_scales
  int CreateMask(int slice_dimension, bool is_multi_channel_quantized) {
    return is_multi_channel_quantized ? 1 << slice_dimension : 0;
  }

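  // Reorders the data at `ptr`, described by user_md, into a new memory with
  // target_md, applying the given attributes (e.g. output scales for int8).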
  std::shared_ptr<dnnl::memory> AcquireMemoryWithReorderAndAttrs(
      const dnnl::memory::desc& user_md,
      const dnnl::memory::desc& target_md,
      void* ptr,
      const dnnl::primitive_attr& attrs) {
    std::shared_ptr<dnnl::memory> target_memory_p;

    auto user_memory_p =
        std::make_shared<dnnl::memory>(user_md, this->engine_, ptr);
    target_memory_p = std::make_shared<dnnl::memory>(target_md, this->engine_);
    auto reorder_p = std::make_shared<dnnl::reorder>(
        *user_memory_p, *target_memory_p, attrs);

    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
    reorder_p->execute(
        astream,
        {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
    astream.wait();

    return target_memory_p;
  }

  std::string memory_key_;
  const platform::MKLDNNDeviceContext& dev_ctx_;

 public:
  std::shared_ptr<dnnl::memory> AcquireSrcMemoryWithReorder(
      const phi::DenseTensor* x) {
    const T_in* x_data = x->data<T_in>();

    auto user_md = x->mem_desc();
    if (x->dims().size() != 2) {
      // The reshape restrictions are always satisfied because for 3- or 4-dim
      // inputs a plain layout is enforced
      user_md = user_md.reshape(this->fwd_pd_->src_desc().dims());
    }

    return this->AcquireMemoryWithReorder(
        user_md, this->fwd_pd_->src_desc(), to_void_cast<T_in>(x_data));
  }

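  // For int8 the bias must be reordered with the bias scales applied; the
  // reordered memory is cached in the device context, keyed by the weight
  // name, so it is created only once. BF16/FP32 bias is used as-is.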
  std::shared_ptr<dnnl::memory> AcquireBiasMemoryWithReorder(
      const phi::DenseTensor* bias,
      const float scale_in,
      const std::vector<float>& scale_weights) {
    const float* bias_data = bias->data<float>();

    if (IsInt8<T_w>() == false) {
      // For BF16/FP32 the bias is 1D and has no scales, so no reorder is needed
      return this->AcquireMemoryFromPrimitive(this->fwd_pd_->bias_desc(),
                                              to_void_cast<float>(bias_data));
    } else {
      const std::string bias_key = this->memory_key_ + "@bias";
      auto memory_p = std::static_pointer_cast<dnnl::memory>(
          this->dev_ctx_.GetBlob(bias_key));

      if (!memory_p) {
        const auto& scale_data = ComputeBiasScales(scale_in, scale_weights);
        dnnl::primitive_attr attrs;

        int mask = CreateMask(0, scale_data.size() > 1);
        attrs.set_output_scales(mask, scale_data);

        auto user_md = dnnl::memory::desc({bias->dims()[0]},
                                          MKLDNNGetDataType<float>(),
                                          dnnl::memory::format_tag::a);

        memory_p = this->AcquireMemoryWithReorderAndAttrs(
            user_md,
            this->fwd_pd_->bias_desc(),
            to_void_cast<float>(bias_data),
            attrs);
        this->dev_ctx_.SetBlob(bias_key, memory_p);
      }
      return memory_p;
    }
  }

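  // Weights arrive as fp32 in a transposed (io) layout; they are reordered
  // once into the primitive's preferred layout (with quantization scales for
  // int8 kernels) and cached in the device context under the weight name.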
  std::shared_ptr<dnnl::memory> AcquireWeightsMemoryWithReorder(
      const phi::DenseTensor* weights, const std::vector<float>& scale_data) {
    const std::string weights_key = this->memory_key_ + "@weights";
    auto memory_p = std::static_pointer_cast<dnnl::memory>(
        this->dev_ctx_.GetBlob(weights_key));

    if (!memory_p) {
      const float* weights_data = weights->data<float>();
      auto weights_dims = this->fwd_pd_->weights_desc().dims();

      auto user_md = dnnl::memory::desc(weights_dims,
                                        MKLDNNGetDataType<float>(),
                                        dnnl::memory::format_tag::io);

      if (IsInt8<T_w>()) {
        dnnl::primitive_attr attrs;
        int mask = CreateMask(0, scale_data.size() > 1);
        attrs.set_output_scales(mask, scale_data);

        memory_p = this->AcquireMemoryWithReorderAndAttrs(
            user_md,
            this->fwd_pd_->weights_desc(),
            to_void_cast<float>(weights_data),
            attrs);
      } else {
        memory_p =
            this->AcquireMemoryWithReorder(user_md,
                                           this->fwd_pd_->weights_desc(),
                                           to_void_cast<float>(weights_data));
      }

      this->dev_ctx_.SetBlob(weights_key, memory_p);
    }
    return memory_p;
  }

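  // Destination memory: with residual fusion the output shares the residual
  // tensor's buffer, so the sum post-op accumulates into it in place.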
  std::shared_ptr<dnnl::memory> AcquireCustomDstMemory(
      const ExecutionContext& ctx, phi::DenseTensor* out) {
    if (ctx.HasAttr("fuse_residual_connection") &&
        ctx.Attr<bool>("fuse_residual_connection")) {
      auto* residual_param = ctx.Output<phi::DenseTensor>("ResidualData");

      PADDLE_ENFORCE_EQ(
          out->dims(),
          residual_param->dims(),
          platform::errors::InvalidArgument(
              "Output and elementwise parameter need to have the "
              "same dimension sizes, but got output's dimension = %d"
              " and residual param's dimension = %d.",
              out->dims().size(),
              residual_param->dims().size()));

      out->ShareDataWith(*residual_param);
    }
    return this->template AcquireDstMemory<T_out>(out);
  }
};

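// Dispatch helper: when the condition holds (uint8 input in this kernel) the
// weight type T_w becomes int8_t; otherwise T_w matches the input type T_in.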
#define IF_CHANGE_FC_TW_TYPENAME(condition, ...) \
  if (condition) {                               \
    using T_w = int8_t;                          \
    __VA_ARGS__();                               \
  } else {                                       \
    using T_w = T_in;                            \
    __VA_ARGS__();                               \
  }

template <typename T_in>
class FCMKLDNNKernel : public framework::OpKernel<T_in> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
    bool fuse_relu = ctx.Attr<std::string>("activation_type") == "relu";

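    // Pick the output type: force_fp32_output yields float; int8 inputs yield
    // uint8 when ReLU is fused (non-negative output) and int8 otherwise; all
    // remaining cases keep the input type.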
    IF_CHANGE_FC_TW_TYPENAME((std::is_same<T_in, uint8_t>::value), ([&] {
                               if (force_fp32_output) {
                                 this->RunKernel<float, T_w>(ctx);
                               } else if (IsInt8<T_in>()) {
                                 if (fuse_relu) {
                                   this->RunKernel<uint8_t, T_w>(ctx);
                                 } else {
                                   this->RunKernel<int8_t, T_w>(ctx);
                                 }
                               } else {
                                 this->RunKernel<T_in, T_w>(ctx);
                               }
                             }));
  }

  template <typename T_out, typename T_w>
  void RunKernel(const framework::ExecutionContext& ctx) const {
    const auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();

    const auto* x = ctx.Input<LoDTensor>("Input");
    const auto* weights = ctx.Input<phi::DenseTensor>("W");
    const auto* bias = ctx.Input<phi::DenseTensor>("Bias");
    auto out = ctx.Output<LoDTensor>("Out");

    auto in_col_dims = ctx.Attr<int>("in_num_col_dims");

    const float scale_in = ctx.Attr<float>("Scale_in");
    const auto& scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");

    RecomputeOutputDims(ctx, x, weights, out);

    FCMKLDNNHandler<T_in, T_w, T_out> handler(ctx,
                                              dev_ctx,
                                              x,
                                              weights,
                                              bias,
                                              out,
                                              in_col_dims,
                                              mkldnn_engine,
                                              ctx.GetPlace());

    auto src_memory_p = handler.AcquireSrcMemoryWithReorder(x);
    auto weights_memory_p =
        handler.AcquireWeightsMemoryWithReorder(weights, scale_weights);
    auto dst_memory_p = handler.AcquireCustomDstMemory(ctx, out);

    auto fc_p = handler.AcquireForwardPrimitive();
    auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();

    std::unordered_map<int, dnnl::memory> fc_args = {
        {DNNL_ARG_SRC, *src_memory_p},
        {DNNL_ARG_WEIGHTS, *weights_memory_p},
        {DNNL_ARG_DST, *dst_memory_p}};

    if (bias) {
      auto bias_memory_p =
          handler.AcquireBiasMemoryWithReorder(bias, scale_in, scale_weights);
      fc_args.insert({DNNL_ARG_BIAS, *bias_memory_p});
    }

    fc_p->execute(astream, fc_args);
    astream.wait();

    out->set_mem_desc(
        dst_memory_p->get_desc().reshape(phi::vectorize(out->dims())));
  }

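  // Recompute the output dims from the input and weight shapes (weight
  // padding is not supported by this kernel) and propagate the input LoD.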
  void RecomputeOutputDims(const ExecutionContext& ctx,
                           const LoDTensor* x,
                           const phi::DenseTensor* weights,
                           LoDTensor* out) const {
    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
    bool padding_weights = ctx.Attr<bool>("padding_weights");
    PADDLE_ENFORCE_EQ(padding_weights,
                      false,
                      platform::errors::PermissionDenied(
                          "Weight padding in fc can not be used in MKLDNN."));
    std::vector<int64_t> output_dims;
    FCOutputSize(x->dims(),
                 weights->dims(),
                 output_dims,
                 in_num_col_dims,
                 padding_weights);
    out->Resize(phi::make_ddim(output_dims));
    out->set_lod(x->lod());
  }
};

}  // namespace operators
}  // namespace paddle

// Weights of FC are stored as fp32 by default; the weight data type template
// argument specifies their destination data type, i.e. the type they are
// eventually converted to for the kernel's computations.
namespace ops = paddle::operators;

REGISTER_OP_KERNEL(fc,
                   MKLDNN,
                   ::paddle::platform::CPUPlace,
                   ops::FCMKLDNNKernel<float>,
                   ops::FCMKLDNNKernel<paddle::platform::bfloat16>,
                   ops::FCMKLDNNKernel<uint8_t>,
                   ops::FCMKLDNNKernel<int8_t>);