/**
 * TensorEvaluate.h
 *
 * Author: hedaoyuan (hedaoyuan@baidu.com)
 * Created on: 2016-06-06
 *
 * Copyright (c) Baidu.com, Inc. All Rights Reserved
 *
 */

#pragma once

#include <algorithm>
#include "paddle/utils/Logging.h"
#include "hl_base.h"

namespace paddle {

/**
 * \brief The tensor cpu evaluate api.
 *
 * Evaluates the element-wise expression `lhs = rhs` on the host.
 * Both operands must have identical height/width and agree on useGpu()
 * (enforced by CHECK_EQ). When both sides are contiguous the assignment
 * runs as a single flat 1D loop; otherwise it falls back to row/column
 * indexing so arbitrary strides are handled.
 *
 * \tparam T         element type forwarded to TensorApply.
 * \tparam LeftType  destination tensor/expression type.
 * \tparam RightType source tensor/expression type.
 */
template <class T, typename LeftType, typename RightType>
inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
  TensorApply<LeftType, T> lhs_(lhs);
  TensorApply<const RightType, T> rhs_(rhs);
  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());

  if (lhs_.isContiguous() && rhs_.isContiguous()) {
    // Fast path: one flat index over the whole buffer.
    int size = lhs_.getHeight() * lhs_.getWidth();
    for (int index = 0; index < size; index++) {
      lhs_.applyRef(index) = rhs_.apply(index);
    }
  } else {
    // Strided path: address each element by (row, col).
    for (size_t i = 0; i < lhs_.getHeight(); i++) {
      for (size_t j = 0; j < lhs_.getWidth(); j++) {
        lhs_.applyRef(i, j) = rhs_.apply(i, j);
      }
    }
  }
}

#ifdef __NVCC__
/**
 * \brief 1D element-wise assignment kernel for contiguous tensors.
 *
 * Expects a 1D grid of 1D blocks; each thread writes at most one element,
 * guarded by the `border` bounds check so partially-filled tail blocks
 * are safe.
 */
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs,
                                    RightType rhs,
                                    const int border) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < border) {
    lhs.applyRef(idx) = rhs.apply(idx);
  }
}

/**
 * \brief 2D element-wise assignment kernel for non-contiguous tensors.
 *
 * Expects a 2D grid of 2D blocks. Both loops stride by the total grid
 * extent in their dimension, so any launch configuration covers the full
 * (height x width) domain regardless of tensor size.
 */
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
  for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
    for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
      lhs.applyRef(i, j) = rhs.apply(i, j);
    }
  }
}

/**
 * \brief The tensor gpu evaluate api.
 *
 * Evaluates `lhs = rhs` element-wise on the device. Contiguous operands
 * use a flat 1D launch (up to 1024 threads per block); strided operands
 * use a 2D launch capped at a 32x32 grid, relying on the kernel's
 * grid-stride loops to cover larger tensors. Kernels are enqueued on
 * STREAM_DEFAULT (from hl_base.h); CHECK_SYNC surfaces any launch or
 * execution error.
 *
 * \tparam T         element type forwarded to TensorApply.
 * \tparam LeftType  destination tensor/expression type.
 * \tparam RightType source tensor/expression type.
 */
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
  TensorApply<LeftType, T> lhs_(lhs);
  TensorApply<const RightType, T> rhs_(rhs);
  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());

  int dimM = lhs_.getHeight();
  int dimN = lhs_.getWidth();

  if (lhs_.isContiguous() && rhs_.isContiguous()) {
    int size = dimM * dimN;
    int blockSize = size <= 1024 ? size : 1024;
    int gridSize = (size + 1024 - 1) / 1024;  // ceil-div by max block size
    TensorElementWiseOp
        <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(lhs_, rhs_, size);
  } else {
    // Block shape: up to 32 rows, with 32*(32/rows) columns so the block
    // stays at most 1024 threads. Grid clamped to 32x32; the kernel's
    // grid-stride loops handle anything larger.
    int blockSizeY = std::min(32, dimM);
    int blockSizeX = (32 / blockSizeY) * 32;
    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
    dim3 threads(blockSizeX, blockSizeY);
    dim3 grid(gridSizeX, gridSizeY);
    TensorElementWiseOp
        <<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
  }
  CHECK_SYNC("TensorGpuApply failed");
}
#else
// Host-only build (no NVCC): GPU evaluation is a deliberate no-op stub so
// callers compile unchanged.
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {}
#endif

}  // namespace paddle