/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <algorithm>
#include "paddle/utils/Logging.h"

namespace paddle {

H
hedaoyuan 已提交
22 23
/**
 * \brief Tensor Assign Expression(return by lazyAssign,
H
hedaoyuan 已提交
24
 * and evaluated by AssignEvaluate)
H
hedaoyuan 已提交
25
 */
H
hedaoyuan 已提交
26
template <typename LhsType, typename RhsType, class T>
H
hedaoyuan 已提交
27 28 29
class TensorAssignOp {
public:
  explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
H
hedaoyuan 已提交
30 31 32 33 34 35
      : lhs_(lhs), rhs_(rhs) {
#ifndef __CUDA_ARCH__
    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
H
hedaoyuan 已提交
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
  }

  INLINE void apply(const int i, const int j) {
    lhs_.applyRef(i, j) = rhs_.apply(i, j);
  }
  INLINE void apply(const int index) {
    lhs_.applyRef(index) = rhs_.apply(index);
  }

  INLINE size_t getWidth() const { return lhs_.getWidth(); }
  INLINE size_t getHeight() const { return rhs_.getHeight(); }
  INLINE bool isContiguous() const {
    return lhs_.isContiguous() && rhs_.isContiguous();
  }
  INLINE bool useGpu() const { return lhs_.useGpu(); }

private:
  TensorApply<LhsType, T> lhs_;
  TensorApply<const RhsType, T> rhs_;
};

/**
 * \brief Evaluate one or more assignment expressions on the CPU.
 *
 * For every element, assign is applied first and then every expression in
 * args, in the order they were passed.
 *
 * \param height        number of rows to visit.
 * \param width         number of columns to visit.
 * \param isContiguous  when true, elements are visited through the
 *                      single-index apply(index) for index in
 *                      [0, height * width); otherwise through
 *                      apply(i, j) for every row i and column j.
 * \param assign        first expression; must provide apply(int) and
 *                      apply(int, int).
 * \param args          optional further expressions with the same interface.
 */
template <typename Assign, typename... AssignOp>
void AssignCpuEvaluate(int height,
                       int width,
                       bool isContiguous,
                       Assign&& assign,
                       AssignOp&&... args) {
  if (isContiguous) {
    const int size = height * width;
    for (int index = 0; index < size; index++) {
      assign.apply(index);
      // Expand the pack through a braced initializer list, which guarantees
      // left-to-right evaluation. The leading 0 keeps the array non-empty
      // when args is an empty pack (a zero-length array is not valid ISO
      // C++), and the void cast protects against an overloaded operator,.
      __attribute__((unused)) int dummy[] = {
          0, ((void)(args).apply(index), 0)...};
    }
  } else {
    for (int i = 0; i < height; i++) {
      for (int j = 0; j < width; j++) {
        assign.apply(i, j);
        // Same pack-expansion idiom as in the contiguous branch.
        __attribute__((unused)) int dummy[] = {
            0, ((void)(args).apply(i, j), 0)...};
      }
    }
  }
}

#ifdef __NVCC__
/**
 * \brief Kernel that evaluates the expressions by flat element index.
 *
 * Each thread derives one index from blockIdx.x, blockDim.x and
 * threadIdx.x (only the x components of the launch configuration are
 * used) and applies every expression to that index. Threads whose index
 * is not below border do nothing.
 *
 * \param border  exclusive upper bound of the indices to evaluate.
 * \param assign  first expression, passed by value to the kernel.
 * \param args    optional further expressions, passed by value.
 */
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate1(const int border,
                                   Assign assign,
                                   AssignOp... args) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < border) {
    assign.apply(idx);
    // Pack expansion through a braced initializer list. The leading 0 keeps
    // the array non-empty when args is an empty pack (a zero-length array
    // is not valid ISO C++).
    __attribute__((unused)) int dummy[] = {
        0, ((void)(args).apply(idx), 0)...};
  }
}

/**
 * \brief Kernel that evaluates the expressions by (row, column) position.
 *
 * Rows are mapped to the y dimension of the launch and columns to the x
 * dimension. Each thread starts at its own (row, column) and advances by
 * the total number of threads along each dimension, so any grid size
 * covers the whole height x width range.
 *
 * \param height  number of rows to evaluate.
 * \param width   number of columns to evaluate.
 * \param assign  first expression, passed by value to the kernel.
 * \param args    optional further expressions, passed by value.
 */
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate2(const int height,
                                   const int width,
                                   Assign assign,
                                   AssignOp... args) {
  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
  // Loop-invariant strides: total thread count along each dimension.
  const int colStride = gridDim.x * blockDim.x;
  const int rowStride = gridDim.y * blockDim.y;
  for (int i = rowIdx; i < height; i += rowStride) {
    for (int j = colIdx; j < width; j += colStride) {
      assign.apply(i, j);
      // Pack expansion through a braced initializer list. The leading 0
      // keeps the array non-empty when args is an empty pack (a zero-length
      // array is not valid ISO C++).
      __attribute__((unused)) int dummy[] = {
          0, ((void)(args).apply(i, j), 0)...};
    }
  }
}
#endif

/**
 * \brief Evaluate one or more TensorAssignOp objects.
 *
 * Every expression must report the same useGpu(), height and width as the
 * first one; this is enforced with CHECK_EQ. The flat-index path is taken
 * only when every expression reports isContiguous(); otherwise elements
 * are addressed by (row, column).
 *
 * \param assign  first assignment expression; it determines the device
 *                placement and the shape that is evaluated.
 * \param args    optional further assignment expressions evaluated in the
 *                same pass.
 *
 * \note At least one assignment expression is required
 */
template <typename Assign, typename... AssignOp>
void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
  const bool useGpu_ = assign.useGpu();
  bool isContiguous_ = assign.isContiguous();
  const size_t height = assign.getHeight();
  const size_t width = assign.getWidth();

  // Slot 0 of every array holds the value of `assign` itself, so the arrays
  // never have zero length when `args` is an empty pack (a zero-length
  // array is not valid ISO C++). The loop below starts at slot 1.
  const int packSize = sizeof...(args) + 1;
  const bool packUseGpu[] = {useGpu_, ((args)).useGpu()...};
  const bool packIsContiguous[] = {isContiguous_, ((args)).isContiguous()...};
  const size_t packHeight[] = {height, ((args)).getHeight()...};
  const size_t packWidth[] = {width, ((args)).getWidth()...};

  for (int i = 1; i < packSize; i++) {
    CHECK_EQ(useGpu_, packUseGpu[i]);
    CHECK_EQ(height, packHeight[i]);
    CHECK_EQ(width, packWidth[i]);
    isContiguous_ = isContiguous_ && packIsContiguous[i];
  }

  // Nothing to assign. Returning here also keeps the GPU path away from a
  // division by zero (32 / blockSizeY) and from launching a kernel with a
  // zero-sized grid or block.
  if (height == 0 || width == 0) {
    return;
  }

  if (useGpu_) {
    // NOTE(review): when this header is compiled without __NVCC__, a GPU
    // expression reaches this branch and nothing is evaluated.
#ifdef __NVCC__
    if (isContiguous_) {
      // One thread per element, at most 1024 threads per block.
      int size = height * width;
      int blockSize = size <= 1024 ? size : 1024;
      int gridSize = (size + 1024 - 1) / 1024;
      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
          size, assign, args...);
    } else {
      // Up to 32 rows per block; the x extent is chosen so a block holds at
      // most 1024 threads. The grid is capped at 32 x 32 blocks; the kernel
      // strides over the remaining rows and columns.
      int blockSizeY = std::min(32, (int)height);
      int blockSizeX = (32 / blockSizeY) * 32;
      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
      dim3 threads(blockSizeX, blockSizeY);
      dim3 grid(gridSizeX, gridSizeY);
      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
          height, width, assign, args...);
    }

    CHECK_SYNC("AssignEvaluate failed");
#endif
  } else {
    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
  }
}

}  // namespace paddle