未验证 提交 72c11758 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] Fix layout, target pass for OpenCL, add macro of...

[LITE][OPENCL] Fix layout, target pass for OpenCL, add macro of CONVERT_TYPE_TO and READ/WRITE image, memory reuse in ResetLazyImage2D (#2170)

* add macro of CONVERT_TYPE_TO and READ/WRITE image. test=develop

* add data type control. test=develop

* fix io op as general layout and precision. test=develop

* Fix memory reuse strategy for opencl image2d. test=develop

* remove std::array and std::map from the opencl backend. test=develop
上级 7a731b7f
...@@ -31,8 +31,8 @@ static void CopyImageData(CLContext* context, ...@@ -31,8 +31,8 @@ static void CopyImageData(CLContext* context,
float* image_data = new float[height * width * 4]; float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
const std::array<size_t, 3> region{ cl::array<size_t, 3> region = {
static_cast<size_t>(width), static_cast<size_t>(height), 1}; static_cast<size_t>(width), static_cast<size_t>(height), 1};
cl_int err = context->GetCommandQueue().enqueueReadImage( cl_int err = context->GetCommandQueue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr); *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
......
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm> #include <algorithm>
#include <array>
#include <memory> #include <memory>
#include <random> #include <random>
#include <vector> #include <vector>
...@@ -395,51 +394,74 @@ TEST(cl_test, target_wrapper_buffer_test) { ...@@ -395,51 +394,74 @@ TEST(cl_test, target_wrapper_buffer_test) {
} }
TEST(cl_test, target_wrapper_image_test) { TEST(cl_test, target_wrapper_image_test) {
const std::array<size_t, 2> image_shape{28, 32}; const size_t cl_image2d_width = 28;
std::array<size_t, 2> image_pitch; const size_t cl_image2d_height = 32;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
auto *d_image = static_cast<cl::Image2D *>( auto *d_image = static_cast<cl::Image2D *>(
TargetWrapperCL::MallocImage<float>(image_shape)); TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
// Map/Unmap test // Map/Unmap test
auto *h_image = static_cast<float *>( auto *h_image =
TargetWrapperCL::MapImage(d_image, image_shape, &image_pitch)); static_cast<float *>(TargetWrapperCL::MapImage(d_image,
// row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes) cl_image2d_width,
// slice_pitch = 0 cl_image2d_height,
size_t row_pitch = image_pitch[0]; cl_image2d_row_pitch,
size_t slice_pitch = image_pitch[1]; cl_image2d_slice_pitch));
CHECK_EQ(row_pitch, 448); CHECK_EQ(
CHECK_EQ(slice_pitch, 0); cl_image2d_row_pitch,
LOG(INFO) << "row_pitch = " << row_pitch << ", slice_pitch " << slice_pitch; cl_image2d_width * 4 *
4); // row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes)
CHECK_EQ(cl_image2d_slice_pitch, 0); // slice_pitch = 0
LOG(INFO) << "cl_image2d_row_pitch = " << cl_image2d_row_pitch
<< ", cl_image2d_slice_pitch " << cl_image2d_slice_pitch;
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
h_image[i] = 3.14f * i; h_image[i] = 3.14f * i;
} }
TargetWrapperCL::Unmap(d_image, h_image); TargetWrapperCL::Unmap(d_image, h_image);
auto *h_ptr = static_cast<float *>( auto *h_ptr =
TargetWrapperCL::MapImage(d_image, image_shape, &image_pitch)); static_cast<float *>(TargetWrapperCL::MapImage(d_image,
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch));
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6); EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6);
} }
TargetWrapperCL::Unmap(d_image, h_ptr); TargetWrapperCL::Unmap(d_image, h_ptr);
// Imagecpy test // Imagecpy test
std::vector<float> h_image_cpy(28 * 4 * 32); std::vector<float> h_image_cpy(cl_image2d_width * 4 *
for (int i = 0; i < 28 * 4 * 32; i++) { cl_image2d_height); // 4 for RGBA channels
for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
h_image_cpy[i] = 3.14f; h_image_cpy[i] = 3.14f;
} }
TargetWrapperCL::ImgcpySync( TargetWrapperCL::ImgcpySync(d_image,
d_image, h_image_cpy.data(), image_shape, image_pitch, IoDirection::HtoD); h_image_cpy.data(),
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::HtoD);
auto *d_image_cpy = static_cast<cl::Image2D *>( auto *d_image_cpy = static_cast<cl::Image2D *>(
TargetWrapperCL::MallocImage<float>(image_shape)); TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
TargetWrapperCL::ImgcpySync( TargetWrapperCL::ImgcpySync(d_image_cpy,
d_image_cpy, d_image, image_shape, image_pitch, IoDirection::DtoD); d_image,
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoD);
std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0); std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0);
TargetWrapperCL::ImgcpySync(h_image_cpy.data(), TargetWrapperCL::ImgcpySync(h_image_cpy.data(),
d_image_cpy, d_image_cpy,
image_shape, cl_image2d_width,
image_pitch, cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH); IoDirection::DtoH);
for (int i = 0; i < 28 * 4 * 32; i++) { for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6); EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6);
} }
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "lite/backends/opencl/cl_image.h" #include "lite/backends/opencl/cl_image.h"
#include <array>
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#include "lite/backends/opencl/cl_utility.h" #include "lite/backends/opencl/cl_utility.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -27,8 +26,9 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { ...@@ -27,8 +26,9 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
float* image_data = new float[height * width * 4]; float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{ cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {
static_cast<size_t>(width), static_cast<size_t>(height), 1}; static_cast<size_t>(width), static_cast<size_t>(height), 1};
cl_int err = CLRuntime::Global()->command_queue().enqueueReadImage( cl_int err = CLRuntime::Global()->command_queue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr); *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
......
...@@ -16,10 +16,35 @@ limitations under the License. */ ...@@ -16,10 +16,35 @@ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Element data type selection. The host is expected to define one of the
// macros [CL_DTYPE_float, CL_DTYPE_half] when compiling this code.
//   CL_DTYPE      - scalar element type (float or half).
//   CL_DTYPE_CHAR - one-letter suffix (f or h) that READ_IMG_TYPE and
//                   WRITE_IMG_TYPE paste onto read_image / write_image.
// NOTE(review): if both macros are defined, the second block redefines
// CL_DTYPE and CL_DTYPE_CHAR - presumably only one is ever passed; confirm.
#ifdef CL_DTYPE_float
#define CL_DTYPE float
#define CL_DTYPE_CHAR f
#endif
#ifdef CL_DTYPE_half
#define CL_DTYPE half
#define CL_DTYPE_CHAR h
#endif
// Note: two macro levels are needed so that macro arguments (e.g. CL_DTYPE)
// are fully expanded before they are pasted together with ##.
#define GET_VEC_TYPE(type__, size__) type__##size__ #define GET_VEC_TYPE(type__, size__) type__##size__
#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__) #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4) #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
// CONVERT_TYPE_TO(value, type): expands to convert_<type>(value).
// The underscore-prefixed helper does the ## paste; the outer macro exists so
// that a macro passed as `type` is expanded before pasting.
#define _CONVERT_TYPE_TO(value, type) convert_##type(value)
#define CONVERT_TYPE_TO(value, type) _CONVERT_TYPE_TO(value, type)
// WRITE_IMG_TYPE(type_char, img, pos, value): expands to
// write_image<type_char>(img, pos, value), e.g. write_imagef when type_char
// is CL_DTYPE_CHAR and CL_DTYPE_float is defined.
#define _WRITE_IMG_TYPE(type_char, img, pos, value) \
write_image##type_char(img, pos, value)
#define WRITE_IMG_TYPE(type_char, img, pos, value) \
_WRITE_IMG_TYPE(type_char, img, pos, value)
// READ_IMG_TYPE(type_char, img, pos, sampler): expands to
// read_image<type_char>(img, sampler, pos).
// Note the argument order: the macro takes pos before sampler, while the
// generated call passes sampler before pos.
#define _READ_IMG_TYPE(type_char, img, pos, sampler) \
read_image##type_char(img, sampler, pos)
#define READ_IMG_TYPE(type_char, img, pos, sampler) \
_READ_IMG_TYPE(type_char, img, pos, sampler)
inline CL_DTYPE activation(CL_DTYPE in inline CL_DTYPE activation(CL_DTYPE in
#ifdef PRELU #ifdef PRELU
, ,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "lite/backends/opencl/target_wrapper.h" #include "lite/backends/opencl/target_wrapper.h"
#include <algorithm> #include <algorithm>
#include <array>
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#include "lite/backends/opencl/cl_utility.h" #include "lite/backends/opencl/cl_utility.h"
...@@ -58,18 +57,16 @@ void TargetWrapperCL::Free(void *ptr) { ...@@ -58,18 +57,16 @@ void TargetWrapperCL::Free(void *ptr) {
} }
template <> template <>
void *TargetWrapperCL::MallocImage<float>( void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
const std::array<size_t, 2> &image_shape) { const size_t cl_image2d_height) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFloat))); cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFloat)));
cl_int status; cl_int status;
size_t width = image_shape[0];
size_t height = image_shape[1];
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format, img_format,
width, cl_image2d_width,
height, cl_image2d_height,
0, 0,
nullptr, nullptr,
&status); &status);
...@@ -82,18 +79,16 @@ void *TargetWrapperCL::MallocImage<float>( ...@@ -82,18 +79,16 @@ void *TargetWrapperCL::MallocImage<float>(
} }
template <> template <>
void *TargetWrapperCL::MallocImage<int8_t>( void *TargetWrapperCL::MallocImage<int8_t>(const size_t cl_image2d_width,
const std::array<size_t, 2> &image_shape) { const size_t cl_image2d_height) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt8))); cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt8)));
cl_int status; cl_int status;
size_t width = image_shape[0];
size_t height = image_shape[1];
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format, img_format,
width, cl_image2d_width,
height, cl_image2d_height,
0, 0,
nullptr, nullptr,
&status); &status);
...@@ -106,18 +101,16 @@ void *TargetWrapperCL::MallocImage<int8_t>( ...@@ -106,18 +101,16 @@ void *TargetWrapperCL::MallocImage<int8_t>(
} }
template <> template <>
void *TargetWrapperCL::MallocImage<int32_t>( void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
const std::array<size_t, 2> &image_shape) { const size_t cl_image2d_height) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt32))); cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt32)));
cl_int status; cl_int status;
size_t width = image_shape[0];
size_t height = image_shape[1];
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format, img_format,
width, cl_image2d_width,
height, cl_image2d_height,
0, 0,
nullptr, nullptr,
&status); &status);
...@@ -156,15 +149,13 @@ void *TargetWrapperCL::Map(void *buffer, size_t offset, size_t size) { ...@@ -156,15 +149,13 @@ void *TargetWrapperCL::Map(void *buffer, size_t offset, size_t size) {
} }
void *TargetWrapperCL::MapImage(void *image, void *TargetWrapperCL::MapImage(void *image,
const std::array<size_t, 2> &image_shape, const size_t cl_image2d_width,
std::array<size_t, 2> *image_pitch) { const size_t cl_image2d_height,
size_t cl_image2d_row_pitch,
size_t cl_image2d_slice_pitch) {
cl::Image2D *cl_image = static_cast<cl::Image2D *>(image); cl::Image2D *cl_image = static_cast<cl::Image2D *>(image);
size_t width = image_shape[0]; cl::array<size_t, 3> origin = {0, 0, 0};
size_t height = image_shape[1]; cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
size_t *row_pitch = image_pitch->data();
size_t *slice_pitch = image_pitch->data() + 1;
std::array<size_t, 3> origin{{0, 0, 0}};
std::array<size_t, 3> region{{width, height, 1}};
cl_int status; cl_int status;
void *mapped_ptr = CLRuntime::Global()->command_queue().enqueueMapImage( void *mapped_ptr = CLRuntime::Global()->command_queue().enqueueMapImage(
*cl_image, *cl_image,
...@@ -172,8 +163,8 @@ void *TargetWrapperCL::MapImage(void *image, ...@@ -172,8 +163,8 @@ void *TargetWrapperCL::MapImage(void *image,
CL_MAP_READ | CL_MAP_WRITE, CL_MAP_READ | CL_MAP_WRITE,
origin, origin,
region, region,
row_pitch, &cl_image2d_row_pitch,
slice_pitch, &cl_image2d_slice_pitch,
nullptr, nullptr,
nullptr, nullptr,
&status); &status);
...@@ -279,15 +270,13 @@ void TargetWrapperCL::MemcpyAsync(void *dst, ...@@ -279,15 +270,13 @@ void TargetWrapperCL::MemcpyAsync(void *dst,
void TargetWrapperCL::ImgcpySync(void *dst, void TargetWrapperCL::ImgcpySync(void *dst,
const void *src, const void *src,
const std::array<size_t, 2> &image_shape, const size_t cl_image2d_width,
const std::array<size_t, 2> &image_pitch, const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch,
IoDirection dir) { IoDirection dir) {
size_t width = image_shape[0]; cl::array<size_t, 3> origin = {0, 0, 0};
size_t height = image_shape[1]; cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
size_t row_pitch = image_pitch[0];
size_t slice_pitch = image_pitch[1];
std::array<size_t, 3> origin{{0, 0, 0}};
std::array<size_t, 3> region{{width, height, 1}};
cl_int status; cl_int status;
cl::Event event; cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
...@@ -308,8 +297,8 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -308,8 +297,8 @@ void TargetWrapperCL::ImgcpySync(void *dst,
CL_TRUE, CL_TRUE,
origin, origin,
region, region,
row_pitch, cl_image2d_row_pitch,
slice_pitch, cl_image2d_slice_pitch,
src, src,
nullptr, nullptr,
nullptr); nullptr);
...@@ -320,8 +309,8 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -320,8 +309,8 @@ void TargetWrapperCL::ImgcpySync(void *dst,
CL_TRUE, CL_TRUE,
origin, origin,
region, region,
row_pitch, cl_image2d_row_pitch,
slice_pitch, cl_image2d_slice_pitch,
dst, dst,
nullptr, nullptr,
nullptr); nullptr);
...@@ -334,16 +323,14 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -334,16 +323,14 @@ void TargetWrapperCL::ImgcpySync(void *dst,
void TargetWrapperCL::ImgcpyAsync(void *dst, void TargetWrapperCL::ImgcpyAsync(void *dst,
const void *src, const void *src,
const std::array<size_t, 2> &image_shape, const size_t cl_image2d_width,
const std::array<size_t, 2> &image_pitch, const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch,
IoDirection dir, IoDirection dir,
const stream_t &stream) { const stream_t &stream) {
size_t width = image_shape[0]; cl::array<size_t, 3> origin = {0, 0, 0};
size_t height = image_shape[1]; cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
size_t row_pitch = image_pitch[0];
size_t slice_pitch = image_pitch[1];
std::array<size_t, 3> origin{{0, 0, 0}};
std::array<size_t, 3> region{{width, height, 1}};
cl_int status; cl_int status;
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -361,8 +348,8 @@ void TargetWrapperCL::ImgcpyAsync(void *dst, ...@@ -361,8 +348,8 @@ void TargetWrapperCL::ImgcpyAsync(void *dst,
CL_FALSE, CL_FALSE,
origin, origin,
region, region,
row_pitch, cl_image2d_row_pitch,
slice_pitch, cl_image2d_slice_pitch,
src, src,
nullptr, nullptr,
nullptr); nullptr);
...@@ -373,8 +360,8 @@ void TargetWrapperCL::ImgcpyAsync(void *dst, ...@@ -373,8 +360,8 @@ void TargetWrapperCL::ImgcpyAsync(void *dst,
CL_FALSE, CL_FALSE,
origin, origin,
region, region,
row_pitch, cl_image2d_row_pitch,
slice_pitch, cl_image2d_slice_pitch,
dst, dst,
nullptr, nullptr,
nullptr); nullptr);
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#pragma once #pragma once
#include <array>
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/target_wrapper.h" #include "lite/core/target_wrapper.h"
...@@ -48,13 +47,16 @@ class TargetWrapper<TARGET(kOpenCL), cl::CommandQueue, cl::Event> { ...@@ -48,13 +47,16 @@ class TargetWrapper<TARGET(kOpenCL), cl::CommandQueue, cl::Event> {
static void Free(void* ptr); static void Free(void* ptr);
template <typename R> template <typename R>
static void* MallocImage(const std::array<size_t, 2>& image_shape); static void* MallocImage(const size_t cl_image2d_width,
const size_t cl_image2d_height);
static void FreeImage(void* image); static void FreeImage(void* image);
static void* Map(void* buffer, size_t offset, size_t size); static void* Map(void* buffer, size_t offset, size_t size);
static void* MapImage(void* image, static void* MapImage(void* image,
const std::array<size_t, 2>& image_shape, const size_t cl_image2d_width,
std::array<size_t, 2>* image_pitch); const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch);
static void Unmap(void* cl_obj, void* mapped_ptr); static void Unmap(void* cl_obj, void* mapped_ptr);
static void MemcpySync(void* dst, static void MemcpySync(void* dst,
...@@ -68,13 +70,17 @@ class TargetWrapper<TARGET(kOpenCL), cl::CommandQueue, cl::Event> { ...@@ -68,13 +70,17 @@ class TargetWrapper<TARGET(kOpenCL), cl::CommandQueue, cl::Event> {
const stream_t& stream); const stream_t& stream);
static void ImgcpySync(void* dst, static void ImgcpySync(void* dst,
const void* src, const void* src,
const std::array<size_t, 2>& image_shape, const size_t cl_image2d_width,
const std::array<size_t, 2>& image_pitch, const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch,
IoDirection dir); IoDirection dir);
static void ImgcpyAsync(void* dst, static void ImgcpyAsync(void* dst,
const void* src, const void* src,
const std::array<size_t, 2>& image_shape, const size_t cl_image2d_width,
const std::array<size_t, 2>& image_pitch, const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch,
IoDirection dir, IoDirection dir,
const stream_t& stream); const stream_t& stream);
}; };
......
...@@ -109,10 +109,17 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { ...@@ -109,10 +109,17 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
void TargetCopyImage2D(TargetType target, void TargetCopyImage2D(TargetType target,
void* dst, void* dst,
const void* src, const void* src,
const std::array<size_t, 2>& image_shape, const size_t cl_image2d_width,
const std::array<size_t, 2>& image_pitch) { const size_t cl_image2d_height,
TargetWrapperCL::ImgcpySync( const size_t cl_image2d_row_pitch,
dst, src, image_shape, image_pitch, IoDirection::DtoD); const size_t cl_image2d_slice_pitch) {
TargetWrapperCL::ImgcpySync(dst,
src,
cl_image2d_width,
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoD);
} }
#endif #endif
......
...@@ -42,8 +42,10 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size); ...@@ -42,8 +42,10 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size);
void TargetCopyImage2D(TargetType target, void TargetCopyImage2D(TargetType target,
void* dst, void* dst,
const void* src, const void* src,
const std::array<size_t, 2>& image_shape, const size_t cl_image2d_width,
const std::array<size_t, 2>& image_pitch); const size_t cl_image2d_height,
const size_t cl_image2d_row_pitch,
const size_t cl_image2d_slice_pitch);
#endif // LITE_WITH_OPENCL #endif // LITE_WITH_OPENCL
template <TargetType Target> template <TargetType Target>
...@@ -97,32 +99,20 @@ class Buffer { ...@@ -97,32 +99,20 @@ class Buffer {
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
template <typename T> template <typename T>
void ResetLazyImage2D(TargetType target, void ResetLazyImage2D(TargetType target,
const std::array<size_t, 2>& image2d_shape) { const size_t img_w,
size_t size = const size_t img_h) {
sizeof(T) * image2d_shape[0] * image2d_shape[1] * 4; // 4 for RGBA size_t size = sizeof(T) * img_w * img_h *
VLOG(4) << "image2d_shape:" << image2d_shape[0] << " " << image2d_shape[1]; 4; // 4 for RGBA, un-used for opencl Image2D
if (target != target_) { if (target != target_ || cl_image2d_width_ < img_w ||
cl_image2d_height_ < img_h) {
Free(); Free();
data_ = TargetWrapperCL::MallocImage<T>(image2d_shape); data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h);
target_ = target; target_ = target;
space_ = size; space_ = size; // un-used for opencl Image2D
cl_image2d_width_ = img_w;
cl_image2d_height_ = img_h;
} }
} }
template <typename T>
void ResizeLazyImage2D(const std::array<size_t, 2>& image2d_shape) {
ResetLazyImage2D<T>(target_, image2d_shape);
}
template <typename T>
void CopyImage2DFrom(const Buffer& other,
const std::array<size_t, 2>& image2d_shape,
const std::array<size_t, 2>& image2d_pitch) {
target_ = other.target_;
ResizeLazyImage2D<T>(image2d_shape, image2d_pitch);
TargetCopyImage2D(
target_, data_, other.data_, image2d_shape, image2d_pitch);
}
#endif #endif
void Free() { void Free() {
...@@ -145,6 +135,8 @@ class Buffer { ...@@ -145,6 +135,8 @@ class Buffer {
private: private:
// memory it actually malloced. // memory it actually malloced.
size_t space_{0}; size_t space_{0};
size_t cl_image2d_width_{0}; // only used for OpenCL Image2D
size_t cl_image2d_height_{0}; // only used for OpenCL Image2D
void* data_{nullptr}; void* data_{nullptr};
TargetType target_{TargetType::kHost}; TargetType target_{TargetType::kHost};
}; };
......
...@@ -62,11 +62,13 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, ...@@ -62,11 +62,13 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
CHECK(in->IsRoleSet()); CHECK(in->IsRoleSet());
CHECK(in->IsArg()); CHECK(in->IsArg());
auto in_arg_name = in->AsArg().name; auto in_arg_name = in->AsArg().name;
std::string tmp; std::string inst_in_tensor_name;
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name));
auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); auto decl_arg_type =
inst.picked_kernel().GetInputDeclType(inst_in_tensor_name);
CHECK(in->AsArg().type); CHECK(in->AsArg().type);
VLOG(4) << "\n tmp:" << tmp << "\n in->AsArg().name:" << in->AsArg().name VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name
<< "\n in->AsArg().name:" << in->AsArg().name
<< "\n *in->AsArg().type:" << *in->AsArg().type << "\n *in->AsArg().type:" << *in->AsArg().type
<< "\n *decl_arg_type:" << *decl_arg_type << "\n *decl_arg_type:" << *decl_arg_type
<< "\n inst.op()->DebugString():" << inst.op()->DebugString(); << "\n inst.op()->DebugString():" << inst.op()->DebugString();
...@@ -125,12 +127,13 @@ void TypeLayoutTransformPass::AddLayoutInst( ...@@ -125,12 +127,13 @@ void TypeLayoutTransformPass::AddLayoutInst(
for (auto& kernel : kernels) { for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
// const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); // unused variable
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
// ignore [layout check] for layout trans from image2d to buffer // layout kernel choose
// the [layout check] must be ignored for the layout of the kernel's input and output
if (TargetCompatibleTo(*in_arg_ty, from) && if (TargetCompatibleTo(*in_arg_ty, from) &&
PrecisionCompatibleTo(*in_arg_ty, from) && PrecisionCompatibleTo(*in_arg_ty, from) &&
DeviceCompatibleTo(*in_arg_ty, from)) { DeviceCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) {
#else #else
if (TypeCompatible(*in_arg_ty, from) && if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) { out_arg_ty->layout() == to.layout()) {
...@@ -142,12 +145,12 @@ void TypeLayoutTransformPass::AddLayoutInst( ...@@ -142,12 +145,12 @@ void TypeLayoutTransformPass::AddLayoutInst(
break; break;
} }
} }
CHECK(is_found) << "Can't find a layout kernel for layout op: " << from CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":"
<< ":" << in->AsArg().name << "->" << to << ":" << in->AsArg().name << "->" << to << ":"
<< inst_node->AsStmt().op_info()->Type(); << inst_node->AsStmt().op_info()->Type();
VLOG(4) << "========= final picked kernel [info]:" VLOG(4) << "========= final picked layout kernel ========= ";
<< layout_inst->AsStmt().picked_kernel().name() VLOG(4) << "[info]:" << layout_inst->AsStmt().picked_kernel().name();
<< " [summary]:" << layout_inst->AsStmt().picked_kernel().summary() VLOG(4) << "[summary]:" << layout_inst->AsStmt().picked_kernel().summary()
<< "\n"; << "\n";
// Remove the old link // Remove the old link
......
...@@ -120,16 +120,36 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -120,16 +120,36 @@ void TypeTargetTransformPass::AddIoCopyInst(
std::vector<std::unique_ptr<KernelBase>> selected_kernels; std::vector<std::unique_ptr<KernelBase>> selected_kernels;
for (auto& kernel : kernels) { for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
VLOG(4) << "------ kernel info -------";
VLOG(4) << "*in_arg_ty(io_copy kernel input):" << *in_arg_ty;
VLOG(4) << "from(last kernel output):" << from;
VLOG(4) << "to:" << to;
// kernel selection branch for the OpenCL backend
// checks whether the inst's target is kOpenCL
// Note: to == *decl_arg_type == input of inst, not the output of the last inst
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
// ignore [layout check] for layout trans from buffer to image2d // ignore [layout check] for layout between [to] and [from]
// Because none of the original OpenCL insts in the model use the default
// layout NCHW,
// the layout check is skipped.
// detailed node info see below:
// [*in->AsArg().type] -> [from]: out of inst's previous kernel
// [*decl_arg_type] -> [to]: input of inst, not output of last
// [in_arg_ty]: in of io_copy
// [out_arg_ty]: out of io_copy
if (TargetCompatibleTo(*in_arg_ty, from) && if (TargetCompatibleTo(*in_arg_ty, from) &&
PrecisionCompatibleTo(*in_arg_ty, from) && PrecisionCompatibleTo(*in_arg_ty, from) &&
DeviceCompatibleTo(*in_arg_ty, from)) { DeviceCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) {
VLOG(4) << "do nothing. opencl found";
#else #else
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (TypeCompatible(*in_arg_ty, from) && if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) { out_arg_ty->target() == to.target()) {
#endif #endif
VLOG(4) << "picked";
is_found = true; is_found = true;
selected_kernels.emplace_back(std::move(kernel)); selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel // we pick the kernel
...@@ -137,6 +157,7 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -137,6 +157,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
io_copy_type, std::move(selected_kernels), io_copy_op); io_copy_type, std::move(selected_kernels), io_copy_op);
break; break;
} }
VLOG(4) << "not picked";
} }
CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from
<< ":" << in->AsArg().name << " -> " << to << ":" << ":" << in->AsArg().name << " -> " << to << ":"
...@@ -147,9 +168,12 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -147,9 +168,12 @@ void TypeTargetTransformPass::AddIoCopyInst(
// Update the original instruction OpDesc. // Update the original instruction OpDesc.
// Update its input to the io_copy_output_name // Update its input to the io_copy_output_name
// Add new link, var -> new_inst, new_inst->newarg, newarg->inst // Add new link, var -> new_inst, new_inst->newarg, newarg->inst
DirectedLink(in, io_copy_inst); DirectedLink(in, io_copy_inst); // [last kernel]'s output -> [io_copy kernel]
DirectedLink(io_copy_inst, io_copy_output_arg); DirectedLink(
DirectedLink(io_copy_output_arg, inst_node); io_copy_inst,
io_copy_output_arg); // [io_copy kernel] -> [io_copy kernel]'s output
DirectedLink(io_copy_output_arg,
inst_node); // [io_copy kernel]'s output -> [current kernel]
// reset opdesc and update kernel information // reset opdesc and update kernel information
UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(),
......
...@@ -149,8 +149,7 @@ class TensorLite { ...@@ -149,8 +149,7 @@ class TensorLite {
template <typename T, typename R = T> template <typename T, typename R = T>
R *mutable_data(const size_t img_w, const size_t img_h) { R *mutable_data(const size_t img_w, const size_t img_h) {
target_ = TARGET(kOpenCL); target_ = TARGET(kOpenCL);
std::array<size_t, 2> image2d_shape{img_w, img_h}; buffer_->ResetLazyImage2D<T>(target_, img_w, img_h);
buffer_->ResetLazyImage2D<T>(target_, image2d_shape);
return static_cast<cl::Image2D *>(buffer_->data()); return static_cast<cl::Image2D *>(buffer_->data());
} }
#endif #endif
......
...@@ -35,7 +35,7 @@ void CopyToHostSync(void* target, const void* source, size_t size) { ...@@ -35,7 +35,7 @@ void CopyToHostSync(void* target, const void* source, size_t size) {
* This kernel copies a tensor from host to OpenCL space. * This kernel copies a tensor from host to OpenCL space.
*/ */
class IoCopyHostToOpenCLCompute class IoCopyHostToOpenCLCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> { : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
public: public:
void Run() override { void Run() override {
auto& param = Param<operators::IoCopyParam>(); auto& param = Param<operators::IoCopyParam>();
...@@ -83,7 +83,7 @@ class IoCopyHostToOpenCLCompute ...@@ -83,7 +83,7 @@ class IoCopyHostToOpenCLCompute
* This kernel copies a tensor from OpenCL to host space. * This kernel copies a tensor from OpenCL to host space.
*/ */
class IoCopykOpenCLToHostCompute class IoCopykOpenCLToHostCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> { : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
public: public:
void Run() override { void Run() override {
auto& param = Param<operators::IoCopyParam>(); auto& param = Param<operators::IoCopyParam>();
...@@ -128,64 +128,40 @@ class IoCopykOpenCLToHostCompute ...@@ -128,64 +128,40 @@ class IoCopykOpenCLToHostCompute
REGISTER_LITE_KERNEL(io_copy, REGISTER_LITE_KERNEL(io_copy,
kOpenCL, kOpenCL,
kFloat, kAny,
kNCHW, kAny,
paddle::lite::kernels::opencl::IoCopyHostToOpenCLCompute, paddle::lite::kernels::opencl::IoCopyHostToOpenCLCompute,
host_to_device) host_to_device)
.BindInput("Input", .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
{LiteType::GetTensorTy(TARGET(kHost), .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(io_copy, REGISTER_LITE_KERNEL(io_copy,
kOpenCL, kOpenCL,
kFloat, kAny,
kNCHW, kAny,
paddle::lite::kernels::opencl::IoCopykOpenCLToHostCompute, paddle::lite::kernels::opencl::IoCopykOpenCLToHostCompute,
device_to_host) device_to_host)
.BindInput("Input", .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
{LiteType::GetTensorTy(TARGET(kOpenCL), .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(io_copy_once, REGISTER_LITE_KERNEL(io_copy_once,
kOpenCL, kOpenCL,
kFloat, kAny,
kNCHW, kAny,
paddle::lite::kernels::opencl::IoCopyHostToOpenCLCompute, paddle::lite::kernels::opencl::IoCopyHostToOpenCLCompute,
host_to_device) host_to_device)
.BindInput("Input", .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
{LiteType::GetTensorTy(TARGET(kHost), .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(io_copy_once, REGISTER_LITE_KERNEL(io_copy_once,
kOpenCL, kOpenCL,
kFloat, kAny,
kNCHW, kAny,
paddle::lite::kernels::opencl::IoCopykOpenCLToHostCompute, paddle::lite::kernels::opencl::IoCopykOpenCLToHostCompute,
device_to_host) device_to_host)
.BindInput("Input", .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
{LiteType::GetTensorTy(TARGET(kOpenCL), .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize(); .Finalize();
...@@ -79,5 +79,5 @@ TEST(io_copy, compute) { ...@@ -79,5 +79,5 @@ TEST(io_copy, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(io_copy, kOpenCL, kFloat, kNCHW, host_to_device); USE_LITE_KERNEL(io_copy, kOpenCL, kAny, kAny, host_to_device);
USE_LITE_KERNEL(io_copy, kOpenCL, kFloat, kNCHW, device_to_host); USE_LITE_KERNEL(io_copy, kOpenCL, kAny, kAny, device_to_host);
...@@ -29,7 +29,7 @@ namespace kernels { ...@@ -29,7 +29,7 @@ namespace kernels {
namespace opencl { namespace opencl {
class LayoutComputeBufferChwToImage2DHwc class LayoutComputeBufferChwToImage2DHwc
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> { : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC)> {
public: public:
using param_t = operators::LayoutParam; using param_t = operators::LayoutParam;
...@@ -122,12 +122,12 @@ class LayoutComputeBufferChwToImage2DHwc ...@@ -122,12 +122,12 @@ class LayoutComputeBufferChwToImage2DHwc
private: private:
std::string kernel_func_name_{"buffer_to_image2d"}; std::string kernel_func_name_{"buffer_to_image2d"};
std::string build_options_{"-DCL_DTYPE=float"}; std::string build_options_{"-DCL_DTYPE_float "};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class LayoutComputeImage2DHwcToBufferChw class LayoutComputeImage2DHwcToBufferChw
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> { : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> {
public: public:
using param_t = operators::LayoutParam; using param_t = operators::LayoutParam;
...@@ -211,7 +211,7 @@ class LayoutComputeImage2DHwcToBufferChw ...@@ -211,7 +211,7 @@ class LayoutComputeImage2DHwcToBufferChw
private: private:
std::string kernel_func_name_{"image2d_to_buffer"}; std::string kernel_func_name_{"image2d_to_buffer"};
std::string build_options_{"-DCL_DTYPE=float"}; std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -225,17 +225,17 @@ class LayoutComputeImage2DHwcToBufferChw ...@@ -225,17 +225,17 @@ class LayoutComputeImage2DHwcToBufferChw
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
layout, layout,
kOpenCL, kOpenCL,
kFloat, kAny,
kNHWC, kNHWC,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc, paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
buffer_chw_to_image2d_hwc_opencl_fp32) buffer_chw_to_image2d_hwc_opencl_fp32)
.BindInput("Input", .BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kAny),
DATALAYOUT(kNCHW))}) DATALAYOUT(kNCHW))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kAny),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
...@@ -243,17 +243,17 @@ REGISTER_LITE_KERNEL( ...@@ -243,17 +243,17 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
layout_once, layout_once,
kOpenCL, kOpenCL,
kFloat, kAny,
kNHWC, kNHWC,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc, paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
buffer_chw_to_image2d_hwc_opencl_fp32) buffer_chw_to_image2d_hwc_opencl_fp32)
.BindInput("Input", .BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kAny),
DATALAYOUT(kNCHW))}) DATALAYOUT(kNCHW))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kAny),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
...@@ -262,17 +262,17 @@ REGISTER_LITE_KERNEL( ...@@ -262,17 +262,17 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
layout, layout,
kOpenCL, kOpenCL,
kFloat, kAny,
kNCHW, kNCHW,
paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw, paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
image2d_hwc_to_buffer_chw_opencl_fp32) image2d_hwc_to_buffer_chw_opencl_fp32)
.BindInput("Input", .BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kAny),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kAny),
DATALAYOUT(kNCHW))}) DATALAYOUT(kNCHW))})
.Finalize(); .Finalize();
...@@ -280,16 +280,16 @@ REGISTER_LITE_KERNEL( ...@@ -280,16 +280,16 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
layout_once, layout_once,
kOpenCL, kOpenCL,
kFloat, kAny,
kNCHW, kNCHW,
paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw, paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
image2d_hwc_to_buffer_chw_opencl_fp32) image2d_hwc_to_buffer_chw_opencl_fp32)
.BindInput("Input", .BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kAny),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kAny),
DATALAYOUT(kNCHW))}) DATALAYOUT(kNCHW))})
.Finalize(); .Finalize();
...@@ -44,9 +44,9 @@ TEST(layout, compute) { ...@@ -44,9 +44,9 @@ TEST(layout, compute) {
<< h << " " << w << " ========"; << h << " " << w << " ========";
// set layout kernels // set layout kernels
auto buf_to_img_kernels = KernelRegistry::Global().Create( auto buf_to_img_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)); "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC));
auto img_to_buf_kernels = KernelRegistry::Global().Create( auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
...@@ -149,6 +149,6 @@ TEST(layout, compute) { ...@@ -149,6 +149,6 @@ TEST(layout, compute) {
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL( USE_LITE_KERNEL(
layout, kOpenCL, kFloat, kNHWC, buffer_chw_to_image2d_hwc_opencl_fp32); layout, kOpenCL, kAny, kNHWC, buffer_chw_to_image2d_hwc_opencl_fp32);
USE_LITE_KERNEL( USE_LITE_KERNEL(
layout, kOpenCL, kFloat, kNCHW, image2d_hwc_to_buffer_chw_opencl_fp32); layout, kOpenCL, kAny, kNCHW, image2d_hwc_to_buffer_chw_opencl_fp32);
...@@ -135,7 +135,7 @@ class ReluComputeFloatImage ...@@ -135,7 +135,7 @@ class ReluComputeFloatImage
private: private:
std::string kernel_func_name_{"relu"}; std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE=float -DRELU"}; std::string build_options_{"-DCL_DTYPE_float -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册