/**
 * TensorEvaluate.h
 *
 * Author: hedaoyuan (hedaoyuan@baidu.com)
 * Created on: 2016-06-06
 *
 * Copyright (c) Baidu.com, Inc. All Rights Reserved
 *
 */

#pragma once

#include <algorithm>
#include "paddle/utils/Logging.h"
#include "hl_base.h"

namespace paddle {

/**
 * \brief The tensor CPU evaluate API: assigns each element of the rhs
 *        tensor expression to the corresponding element of lhs.
 */
template<class T, typename LeftType, typename RightType>
inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
  TensorApply<LeftType, T> lhs_(lhs);
  TensorApply<const RightType, T> rhs_(rhs);
  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());

  if (lhs_.isContiguous() && rhs_.isContiguous()) {
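    // Fast path: both operands are contiguous in memory, so a single
    // flat-index loop covers every element.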
    int size = lhs_.getHeight() * lhs_.getWidth();
    for (int index = 0; index < size; index++) {
      lhs_.applyRef(index) = rhs_.apply(index);
    }
  } else {
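    // General path: walk the 2-D index space so strided (non-contiguous)
    // tensors are evaluated correctly.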
    for (size_t i = 0; i < lhs_.getHeight(); i++) {
      for (size_t j = 0; j < lhs_.getWidth(); j++) {
        lhs_.applyRef(i, j) = rhs_.apply(i, j);
      }
    }
  }
}
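
// A minimal usage sketch (names are illustrative; any pair of tensors or
// tensor expressions with matching shapes and TensorApply support will do):
//
//   CpuMatrix a(height, width);
//   CpuMatrix b(height, width);
//   TensorCpuApply<real>(a, b);  // element-wise: a(i, j) = b(i, j)
//
// TensorGpuApply below is used the same way when the operands live on the GPU.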

#ifdef __NVCC__
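// 1-D kernel for contiguous tensors: one thread per element, with `border`
// (the total element count) guarding the partially filled last block.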
template<typename LeftType, typename RightType>
__global__
void TensorElementWiseOp(LeftType lhs, RightType rhs, const int border) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < border) {
    lhs.applyRef(idx) = rhs.apply(idx);
  }
}

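// 2-D kernel for non-contiguous tensors: grid-stride loops over rows and
// columns, so tensors larger than the launched grid are still fully covered.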
template<typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
  for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
    for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
      lhs.applyRef(i, j) = rhs.apply(i, j);
    }
  }
}

/**
 * \brief The tensor GPU evaluate API: launches a CUDA kernel that assigns
 *        each element of the rhs tensor expression to the corresponding
 *        element of lhs.
 */
template<class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
  TensorApply<LeftType, T> lhs_(lhs);
  TensorApply<const RightType, T> rhs_(rhs);
  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());

  int dimM = lhs_.getHeight();
  int dimN = lhs_.getWidth();

  if (lhs_.isContiguous() && rhs_.isContiguous()) {
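    // Contiguous case: 1-D launch with up to 1024 threads per block and
    // enough blocks to cover all elements.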
    int size = dimM * dimN;
    int blockSize = size <= 1024 ? size : 1024;
    int gridSize = (size + 1024 - 1) / 1024;
    TensorElementWiseOp
      <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(lhs_, rhs_, size);
  } else {
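    // Non-contiguous case: 2-D launch, with the block shaped so that
    // blockSizeX * blockSizeY stays within 1024 threads and the grid capped
    // at 32 x 32; the kernel's grid-stride loops pick up any remainder.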
    int blockSizeY = std::min(32, dimM);
    int blockSizeX = (32 / blockSizeY) * 32;
    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
    dim3 threads(blockSizeX, blockSizeY);
    dim3 grid(gridSizeX, gridSizeY);
    TensorElementWiseOp
      <<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
  }

  CHECK_SYNC("TensorGpuApply failed");
}
#else
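// Built without nvcc, this translation unit cannot launch kernels, so a
// no-op stub keeps CPU-only builds compiling and linking.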
template<class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {
}
#endif

}  // namespace paddle