未验证 提交 72c11758 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] Fix layout, target pass for OpenCL, add macro of...

[LITE][OPENCL] Fix layout, target pass for OpenCL, add macro of CONVERT_TYPE_TO and READ/WRITE image, memory reuse in ResetLazyImage2D (#2170)

* add macro of CONVERT_TYPE_TO and READ/WRITE image. test=develop

* add data type control. test=develop

* fix io op as general layout and precision. test=develop

* Fix memory reuse strategy for opencl image2d. test=develop

* remove std::array, std::map in the opencl backend. test=develop
上级 7a731b7f
......@@ -31,8 +31,8 @@ static void CopyImageData(CLContext* context,
float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{
cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {
static_cast<size_t>(width), static_cast<size_t>(height), 1};
cl_int err = context->GetCommandQueue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <array>
#include <memory>
#include <random>
#include <vector>
......@@ -395,51 +394,74 @@ TEST(cl_test, target_wrapper_buffer_test) {
}
TEST(cl_test, target_wrapper_image_test) {
const std::array<size_t, 2> image_shape{28, 32};
std::array<size_t, 2> image_pitch;
const size_t cl_image2d_width = 28;
const size_t cl_image2d_height = 32;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
auto *d_image = static_cast<cl::Image2D *>(
TargetWrapperCL::MallocImage<float>(image_shape));
TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
// Map/Unmap test
auto *h_image = static_cast<float *>(
TargetWrapperCL::MapImage(d_image, image_shape, &image_pitch));
// row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes)
// slice_pitch = 0
size_t row_pitch = image_pitch[0];
size_t slice_pitch = image_pitch[1];
CHECK_EQ(row_pitch, 448);
CHECK_EQ(slice_pitch, 0);
LOG(INFO) << "row_pitch = " << row_pitch << ", slice_pitch " << slice_pitch;
auto *h_image =
static_cast<float *>(TargetWrapperCL::MapImage(d_image,
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch));
CHECK_EQ(
cl_image2d_row_pitch,
cl_image2d_width * 4 *
4); // row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes)
CHECK_EQ(cl_image2d_slice_pitch, 0); // slice_pitch = 0
LOG(INFO) << "cl_image2d_row_pitch = " << cl_image2d_row_pitch
<< ", cl_image2d_slice_pitch " << cl_image2d_slice_pitch;
for (int i = 0; i < 10; i++) {
h_image[i] = 3.14f * i;
}
TargetWrapperCL::Unmap(d_image, h_image);
auto *h_ptr = static_cast<float *>(
TargetWrapperCL::MapImage(d_image, image_shape, &image_pitch));
auto *h_ptr =
static_cast<float *>(TargetWrapperCL::MapImage(d_image,
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch));
for (int i = 0; i < 10; i++) {
EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6);
}
TargetWrapperCL::Unmap(d_image, h_ptr);
// Imagecpy test
std::vector<float> h_image_cpy(28 * 4 * 32);
for (int i = 0; i < 28 * 4 * 32; i++) {
std::vector<float> h_image_cpy(cl_image2d_width * 4 *
cl_image2d_height); // 4 for RGBA channels
for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
h_image_cpy[i] = 3.14f;
}
TargetWrapperCL::ImgcpySync(
d_image, h_image_cpy.data(), image_shape, image_pitch, IoDirection::HtoD);
TargetWrapperCL::ImgcpySync(d_image,
h_image_cpy.data(),
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::HtoD);
auto *d_image_cpy = static_cast<cl::Image2D *>(
TargetWrapperCL::MallocImage<float>(image_shape));
TargetWrapperCL::ImgcpySync(
d_image_cpy, d_image, image_shape, image_pitch, IoDirection::DtoD);
TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
TargetWrapperCL::ImgcpySync(d_image_cpy,
d_image,
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoD);
std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0);
TargetWrapperCL::ImgcpySync(h_image_cpy.data(),
d_image_cpy,
image_shape,
image_pitch,
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
for (int i = 0; i < 28 * 4 * 32; i++) {
for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6);
}
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/opencl/cl_image.h"
#include <array>
#include "lite/backends/opencl/cl_runtime.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/utils/cp_logging.h"
......@@ -27,8 +26,9 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{
cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {
static_cast<size_t>(width), static_cast<size_t>(height), 1};
cl_int err = CLRuntime::Global()->command_queue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
......
......@@ -16,10 +16,35 @@ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Data type selection: the host passes one of the macros
// [CL_DTYPE_float, CL_DTYPE_half] when building the program.
// CL_DTYPE is the scalar element type and CL_DTYPE_CHAR its one-letter
// suffix (f / h). If neither macro is passed, both stay undefined here.
// NOTE(review): CL_DTYPE_CHAR is presumably what callers pass as the
// type_char argument of READ_IMG_TYPE / WRITE_IMG_TYPE below -- confirm.
#ifdef CL_DTYPE_float
#define CL_DTYPE float
#define CL_DTYPE_CHAR f
#endif
#ifdef CL_DTYPE_half
#define CL_DTYPE half
#define CL_DTYPE_CHAR h
#endif
// Note: pasting a macro's value with ## needs two macro levels. The outer
// macro lets its arguments (e.g. CL_DTYPE) expand first; the inner macro
// then concatenates the already-expanded tokens.
#define GET_VEC_TYPE(type__, size__) type__##size__
#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
// 4-component vector form of CL_DTYPE (CL_DTYPE followed by 4).
#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
// CONVERT_TYPE_TO(value, type) expands to convert_<type>(value); the extra
// level lets a macro such as CL_DTYPE4 be given as `type`.
#define _CONVERT_TYPE_TO(value, type) convert_##type(value)
#define CONVERT_TYPE_TO(value, type) _CONVERT_TYPE_TO(value, type)
// WRITE_IMG_TYPE(type_char, img, pos, value) expands to
// write_image<type_char>(img, pos, value).
#define _WRITE_IMG_TYPE(type_char, img, pos, value) \
write_image##type_char(img, pos, value)
#define WRITE_IMG_TYPE(type_char, img, pos, value) \
_WRITE_IMG_TYPE(type_char, img, pos, value)
// READ_IMG_TYPE(type_char, img, pos, sampler) expands to
// read_image<type_char>(img, sampler, pos). Note that the macro takes
// (pos, sampler) but the generated call passes (sampler, pos).
#define _READ_IMG_TYPE(type_char, img, pos, sampler) \
read_image##type_char(img, sampler, pos)
#define READ_IMG_TYPE(type_char, img, pos, sampler) \
_READ_IMG_TYPE(type_char, img, pos, sampler)
inline CL_DTYPE activation(CL_DTYPE in
#ifdef PRELU
,
......
......@@ -14,7 +14,6 @@
#include "lite/backends/opencl/target_wrapper.h"
#include <algorithm>
#include <array>
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_runtime.h"
#include "lite/backends/opencl/cl_utility.h"
......@@ -58,18 +57,16 @@ void TargetWrapperCL::Free(void *ptr) {
}
template <>
void *TargetWrapperCL::MallocImage<float>(
const std::array<size_t, 2> &image_shape) {
void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
const size_t cl_image2d_height) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFloat)));
cl_int status;
size_t width = image_shape[0];
size_t height = image_shape[1];
cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
width,
height,
cl_image2d_width,
cl_image2d_height,
0,
nullptr,
&status);
......@@ -82,18 +79,16 @@ void *TargetWrapperCL::MallocImage<float>(
}
template <>
void *TargetWrapperCL::MallocImage<int8_t>(
const std::array<size_t, 2> &image_shape) {
void *TargetWrapperCL::MallocImage<int8_t>(const size_t cl_image2d_width,
const size_t cl_image2d_height) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt8)));
cl_int status;
size_t width = image_shape[0];
size_t height = image_shape[1];
cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
width,
height,
cl_image2d_width,
cl_image2d_height,
0,
nullptr,
&status);
......@@ -106,18 +101,16 @@ void *TargetWrapperCL::MallocImage<int8_t>(
}
template <>
void *TargetWrapperCL::MallocImage<int32_t>(
const std::array<size_t, 2> &image_shape) {
void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
const size_t cl_image2d_height) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt32)));
cl_int status;
size_t width = image_shape[0];
size_t height = image_shape[1];
cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
width,
height,
cl_image2d_width,
cl_image2d_height,
0,
nullptr,
&status);
......@@ -156,15 +149,13 @@ void *TargetWrapperCL::Map(void *buffer, size_t offset, size_t size) {
}
void *TargetWrapperCL::MapImage(void *image,
const std::array<size_t, 2> &image_shape,
std::array<size_t, 2> *image_pitch) {
const size_t cl_image2d_width,
const size_t cl_image2d_height,
size_t cl_image2d_row_pitch,
size_t cl_image2d_slice_pitch) {
cl::Image2D *cl_image = static_cast<cl::Image2D *>(image);
size_t width = image_shape[0];
size_t height = image_shape[1];
size_t *row_pitch = image_pitch->data();
size_t *slice_pitch = image_pitch->data() + 1;
std::array<size_t, 3> origin{{0, 0, 0}};
std::array<size_t, 3> region{{width, height, 1}};
cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
cl_int status;
void *mapped_ptr = CLRuntime::Global()->command_queue().enqueueMapImage(
*cl_image,
......@@ -172,8 +163,8 @@ void *TargetWrapperCL::MapImage(void *image,
CL_MAP_READ | CL_MAP_WRITE,
origin,
region,
row_pitch,
slice_pitch,
&cl_image2d_row_pitch,
&cl_image2d_slice_pitch,
nullptr,
nullptr,
&status);
......@@ -279,15 +270,13 @@ void TargetWrapperCL::MemcpyAsync(void *dst,
void TargetWrapperCL::ImgcpySync(void *dst,
const void *src,
const std::array<size_t, 2> &image_shape,
const std::array<size_t, 2> &image_pitch,
const size_t cl_image2d_width,
const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch,
IoDirection dir) {
size_t width = image_shape[0];
size_t height = image_shape[1];
size_t row_pitch = image_pitch[0];
size_t slice_pitch = image_pitch[1];
std::array<size_t, 3> origin{{0, 0, 0}};
std::array<size_t, 3> region{{width, height, 1}};
cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue();
......@@ -308,8 +297,8 @@ void TargetWrapperCL::ImgcpySync(void *dst,
CL_TRUE,
origin,
region,
row_pitch,
slice_pitch,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
src,
nullptr,
nullptr);
......@@ -320,8 +309,8 @@ void TargetWrapperCL::ImgcpySync(void *dst,
CL_TRUE,
origin,
region,
row_pitch,
slice_pitch,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
dst,
nullptr,
nullptr);
......@@ -334,16 +323,14 @@ void TargetWrapperCL::ImgcpySync(void *dst,
void TargetWrapperCL::ImgcpyAsync(void *dst,
const void *src,
const std::array<size_t, 2> &image_shape,
const std::array<size_t, 2> &image_pitch,
const size_t cl_image2d_width,
const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch,
IoDirection dir,
const stream_t &stream) {
size_t width = image_shape[0];
size_t height = image_shape[1];
size_t row_pitch = image_pitch[0];
size_t slice_pitch = image_pitch[1];
std::array<size_t, 3> origin{{0, 0, 0}};
std::array<size_t, 3> region{{width, height, 1}};
cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
cl_int status;
switch (dir) {
case IoDirection::DtoD:
......@@ -361,8 +348,8 @@ void TargetWrapperCL::ImgcpyAsync(void *dst,
CL_FALSE,
origin,
region,
row_pitch,
slice_pitch,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
src,
nullptr,
nullptr);
......@@ -373,8 +360,8 @@ void TargetWrapperCL::ImgcpyAsync(void *dst,
CL_FALSE,
origin,
region,
row_pitch,
slice_pitch,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
dst,
nullptr,
nullptr);
......
......@@ -14,7 +14,6 @@
#pragma once
#include <array>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/target_wrapper.h"
......@@ -48,13 +47,16 @@ class TargetWrapper<TARGET(kOpenCL), cl::CommandQueue, cl::Event> {
static void Free(void* ptr);
template <typename R>
static void* MallocImage(const std::array<size_t, 2>& image_shape);
static void* MallocImage(const size_t cl_image2d_width,
const size_t cl_image2d_height);
static void FreeImage(void* image);
static void* Map(void* buffer, size_t offset, size_t size);
static void* MapImage(void* image,
const std::array<size_t, 2>& image_shape,
std::array<size_t, 2>* image_pitch);
const size_t cl_image2d_width,
const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch);
static void Unmap(void* cl_obj, void* mapped_ptr);
static void MemcpySync(void* dst,
......@@ -68,13 +70,17 @@ class TargetWrapper<TARGET(kOpenCL), cl::CommandQueue, cl::Event> {
const stream_t& stream);
static void ImgcpySync(void* dst,
const void* src,
const std::array<size_t, 2>& image_shape,
const std::array<size_t, 2>& image_pitch,
const size_t cl_image2d_width,
const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch,
IoDirection dir);
static void ImgcpyAsync(void* dst,
const void* src,
const std::array<size_t, 2>& image_shape,
const std::array<size_t, 2>& image_pitch,
const size_t cl_image2d_width,
const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch,
IoDirection dir,
const stream_t& stream);
};
......
......@@ -109,10 +109,17 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
void TargetCopyImage2D(TargetType target,
void* dst,
const void* src,
const std::array<size_t, 2>& image_shape,
const std::array<size_t, 2>& image_pitch) {
TargetWrapperCL::ImgcpySync(
dst, src, image_shape, image_pitch, IoDirection::DtoD);
const size_t cl_image2d_width,
const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch) {
TargetWrapperCL::ImgcpySync(dst,
src,
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoD);
}
#endif
......
......@@ -42,8 +42,10 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size);
void TargetCopyImage2D(TargetType target,
void* dst,
const void* src,
const std::array<size_t, 2>& image_shape,
const std::array<size_t, 2>& image_pitch);
const size_t cl_image2d_width,
const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch);
#endif // LITE_WITH_OPENCL
template <TargetType Target>
......@@ -97,32 +99,20 @@ class Buffer {
#ifdef LITE_WITH_OPENCL
template <typename T>
void ResetLazyImage2D(TargetType target,
const std::array<size_t, 2>& image2d_shape) {
size_t size =
sizeof(T) * image2d_shape[0] * image2d_shape[1] * 4; // 4 for RGBA
VLOG(4) << "image2d_shape:" << image2d_shape[0] << " " << image2d_shape[1];
if (target != target_) {
const size_t img_w,
const size_t img_h) {
size_t size = sizeof(T) * img_w * img_h *
4; // 4 for RGBA, un-used for opencl Image2D
if (target != target_ || cl_image2d_width_ < img_w ||
cl_image2d_height_ < img_h) {
Free();
data_ = TargetWrapperCL::MallocImage<T>(image2d_shape);
data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h);
target_ = target;
space_ = size;
space_ = size; // un-used for opencl Image2D
cl_image2d_width_ = img_w;
cl_image2d_height_ = img_h;
}
}
template <typename T>
void ResizeLazyImage2D(const std::array<size_t, 2>& image2d_shape) {
ResetLazyImage2D<T>(target_, image2d_shape);
}
template <typename T>
void CopyImage2DFrom(const Buffer& other,
const std::array<size_t, 2>& image2d_shape,
const std::array<size_t, 2>& image2d_pitch) {
target_ = other.target_;
ResizeLazyImage2D<T>(image2d_shape, image2d_pitch);
TargetCopyImage2D(
target_, data_, other.data_, image2d_shape, image2d_pitch);
}
#endif
void Free() {
......@@ -145,6 +135,8 @@ class Buffer {
private:
// memory it actually malloced.
size_t space_{0};
size_t cl_image2d_width_{0}; // only used for OpenCL Image2D
size_t cl_image2d_height_{0}; // only used for OpenCL Image2D
void* data_{nullptr};
TargetType target_{TargetType::kHost};
};
......
......@@ -62,11 +62,13 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
CHECK(in->IsRoleSet());
CHECK(in->IsArg());
auto in_arg_name = in->AsArg().name;
std::string tmp;
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
std::string inst_in_tensor_name;
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name));
auto decl_arg_type =
inst.picked_kernel().GetInputDeclType(inst_in_tensor_name);
CHECK(in->AsArg().type);
VLOG(4) << "\n tmp:" << tmp << "\n in->AsArg().name:" << in->AsArg().name
VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name
<< "\n in->AsArg().name:" << in->AsArg().name
<< "\n *in->AsArg().type:" << *in->AsArg().type
<< "\n *decl_arg_type:" << *decl_arg_type
<< "\n inst.op()->DebugString():" << inst.op()->DebugString();
......@@ -125,12 +127,13 @@ void TypeLayoutTransformPass::AddLayoutInst(
for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
// const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); // unused variable
#ifdef LITE_WITH_OPENCL
// ignore [layout check] for layout trans from image2d to buffer
// layout kernel choose
// must ignore [layout check] for layout of kernels's input and output
if (TargetCompatibleTo(*in_arg_ty, from) &&
PrecisionCompatibleTo(*in_arg_ty, from) &&
DeviceCompatibleTo(*in_arg_ty, from)) {
DeviceCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) {
#else
if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) {
......@@ -142,12 +145,12 @@ void TypeLayoutTransformPass::AddLayoutInst(
break;
}
}
CHECK(is_found) << "Can't find a layout kernel for layout op: " << from
<< ":" << in->AsArg().name << "->" << to << ":"
CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":"
<< in->AsArg().name << "->" << to << ":"
<< inst_node->AsStmt().op_info()->Type();
VLOG(4) << "========= final picked kernel [info]:"
<< layout_inst->AsStmt().picked_kernel().name()
<< " [summary]:" << layout_inst->AsStmt().picked_kernel().summary()
VLOG(4) << "========= final picked layout kernel ========= ";
VLOG(4) << "[info]:" << layout_inst->AsStmt().picked_kernel().name();
VLOG(4) << "[summary]:" << layout_inst->AsStmt().picked_kernel().summary()
<< "\n";
// Remove the old link
......
......@@ -120,16 +120,36 @@ void TypeTargetTransformPass::AddIoCopyInst(
std::vector<std::unique_ptr<KernelBase>> selected_kernels;
for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
VLOG(4) << "------ kernel info -------";
VLOG(4) << "*in_arg_ty(io_copy kernel input):" << *in_arg_ty;
VLOG(4) << "from(last kernel output):" << from;
VLOG(4) << "to:" << to;
// kernel choose branch for opencl backend
// judge inst's target whether is kOpenCL
// Note: to == *decl_arg_type == in of inst, not output of last inst
#ifdef LITE_WITH_OPENCL
// ignore [layout check] for layout trans from buffer to image2d
// ignore [layout check] for the layout between [to] and [from]:
// none of the original opencl insts in the model use the default layout
// NCHW,
// so the layout check is skipped.
// detailed node info is given below:
// [*in->AsArg().type] -> [from]: output of inst's previous kernel
// [*decl_arg_type] -> [to]: input of inst, not output of the previous kernel
// [in_arg_ty]: input of io_copy
// [out_arg_ty]: output of io_copy
if (TargetCompatibleTo(*in_arg_ty, from) &&
PrecisionCompatibleTo(*in_arg_ty, from) &&
DeviceCompatibleTo(*in_arg_ty, from)) {
DeviceCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) {
VLOG(4) << "do nothing. opencl found";
#else
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) {
#endif
VLOG(4) << "picked";
is_found = true;
selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel
......@@ -137,6 +157,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
io_copy_type, std::move(selected_kernels), io_copy_op);
break;
}
VLOG(4) << "not picked";
}
CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from
<< ":" << in->AsArg().name << " -> " << to << ":"
......@@ -147,9 +168,12 @@ void TypeTargetTransformPass::AddIoCopyInst(
// Update the original instruction OpDesc.
// Update its input to the io_copy_output_name
// Add new link, var -> new_inst, new_inst->newarg, newarg->inst
DirectedLink(in, io_copy_inst);
DirectedLink(io_copy_inst, io_copy_output_arg);
DirectedLink(io_copy_output_arg, inst_node);
DirectedLink(in, io_copy_inst); // [last kernel]'s output -> [io_copy kernel]
DirectedLink(
io_copy_inst,
io_copy_output_arg); // [io_copy kernel] -> [io_copy kernel]'s output
DirectedLink(io_copy_output_arg,
inst_node); // [io_copy kernel]'s output -> [current kernel]
// reset opdesc and update kernel information
UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(),
......
......@@ -149,8 +149,7 @@ class TensorLite {
template <typename T, typename R = T>
R *mutable_data(const size_t img_w, const size_t img_h) {
target_ = TARGET(kOpenCL);
std::array<size_t, 2> image2d_shape{img_w, img_h};
buffer_->ResetLazyImage2D<T>(target_, image2d_shape);
buffer_->ResetLazyImage2D<T>(target_, img_w, img_h);
return static_cast<cl::Image2D *>(buffer_->data());
}
#endif
......
......@@ -35,7 +35,7 @@ void CopyToHostSync(void* target, const void* source, size_t size) {
* This kernel copies a tensor from host to OpenCL space.
*/
class IoCopyHostToOpenCLCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
......@@ -83,7 +83,7 @@ class IoCopyHostToOpenCLCompute
* This kernel copies a tensor from OpenCL to host space.
*/
class IoCopykOpenCLToHostCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
......@@ -128,64 +128,40 @@ class IoCopykOpenCLToHostCompute
REGISTER_LITE_KERNEL(io_copy,
kOpenCL,
kFloat,
kNCHW,
kAny,
kAny,
paddle::lite::kernels::opencl::IoCopyHostToOpenCLCompute,
host_to_device)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy,
kOpenCL,
kFloat,
kNCHW,
kAny,
kAny,
paddle::lite::kernels::opencl::IoCopykOpenCLToHostCompute,
device_to_host)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy_once,
kOpenCL,
kFloat,
kNCHW,
kAny,
kAny,
paddle::lite::kernels::opencl::IoCopyHostToOpenCLCompute,
host_to_device)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy_once,
kOpenCL,
kFloat,
kNCHW,
kAny,
kAny,
paddle::lite::kernels::opencl::IoCopykOpenCLToHostCompute,
device_to_host)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
......@@ -79,5 +79,5 @@ TEST(io_copy, compute) {
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(io_copy, kOpenCL, kFloat, kNCHW, host_to_device);
USE_LITE_KERNEL(io_copy, kOpenCL, kFloat, kNCHW, device_to_host);
USE_LITE_KERNEL(io_copy, kOpenCL, kAny, kAny, host_to_device);
USE_LITE_KERNEL(io_copy, kOpenCL, kAny, kAny, device_to_host);
......@@ -29,7 +29,7 @@ namespace kernels {
namespace opencl {
class LayoutComputeBufferChwToImage2DHwc
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::LayoutParam;
......@@ -122,12 +122,12 @@ class LayoutComputeBufferChwToImage2DHwc
private:
std::string kernel_func_name_{"buffer_to_image2d"};
std::string build_options_{"-DCL_DTYPE=float"};
std::string build_options_{"-DCL_DTYPE_float "};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class LayoutComputeImage2DHwcToBufferChw
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::LayoutParam;
......@@ -211,7 +211,7 @@ class LayoutComputeImage2DHwcToBufferChw
private:
std::string kernel_func_name_{"image2d_to_buffer"};
std::string build_options_{"-DCL_DTYPE=float"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......@@ -225,17 +225,17 @@ class LayoutComputeImage2DHwcToBufferChw
REGISTER_LITE_KERNEL(
layout,
kOpenCL,
kFloat,
kAny,
kNHWC,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
buffer_chw_to_image2d_hwc_opencl_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kAny),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -243,17 +243,17 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kFloat,
kAny,
kNHWC,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
buffer_chw_to_image2d_hwc_opencl_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kAny),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -262,17 +262,17 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout,
kOpenCL,
kFloat,
kAny,
kNCHW,
paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
image2d_hwc_to_buffer_chw_opencl_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kAny),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.Finalize();
......@@ -280,16 +280,16 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kFloat,
kAny,
kNCHW,
paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
image2d_hwc_to_buffer_chw_opencl_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kAny),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.Finalize();
......@@ -44,9 +44,9 @@ TEST(layout, compute) {
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC));
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty());
......@@ -149,6 +149,6 @@ TEST(layout, compute) {
} // namespace paddle
USE_LITE_KERNEL(
layout, kOpenCL, kFloat, kNHWC, buffer_chw_to_image2d_hwc_opencl_fp32);
layout, kOpenCL, kAny, kNHWC, buffer_chw_to_image2d_hwc_opencl_fp32);
USE_LITE_KERNEL(
layout, kOpenCL, kFloat, kNCHW, image2d_hwc_to_buffer_chw_opencl_fp32);
layout, kOpenCL, kAny, kNCHW, image2d_hwc_to_buffer_chw_opencl_fp32);
......@@ -135,7 +135,7 @@ class ReluComputeFloatImage
private:
std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE=float -DRELU"};
std::string build_options_{"-DCL_DTYPE_float -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册