diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl index 0d61ec32bab089abdda11ddfd7cc7d4357cf6f90..f65e3e06d6d8e63a7b7c0159c14e2a0d5ce27e74 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl @@ -1,5 +1,5 @@ -__kernel void MaxPooling2d(__global float4 *input, __global float4 *output, const int4 input_shape, - const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { +__kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, + const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate int X = get_global_id(0); int Y = get_global_id(1); @@ -31,38 +31,37 @@ __kernel void MaxPooling2d(__global float4 *input, __global float4 *output, cons output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum; } -// __constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; +__constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; -//__kernel void MaxPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, -// const int4 output_shape, const int2 stride, const int2 kernel_size, -// const int2 padding) { -// // axis to dst tensor coordinate -// int X = get_global_id(0); -// int Y = get_global_id(1); -// int Z = get_global_id(2); -// -// // boundary check -// if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { -// return; -// } -// -// float4 maximum = (float4)(-10000.0f); -// int xs = X * stride.x + padding.x; -// int ys = Y * stride.y + padding.y; -// -// for (int ky = 0; ky < kernel_size.y; ++ky) { -// int y_c = ys + ky; -// if (y_c < 0 || y_c >= input_shape.y) { -// continue; -// } -// for (int kx = 0; kx < kernel_size.x; ++kx) { -// int x_c = xs + kx; -// if (x_c < 0 || x_c >= input_shape.x) { -// continue; -// } -// float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z)); -// maximum = max(src, maximum); -// } -// } -// write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum); -//} \ No newline at end of file +__kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, + const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { + // axis to dst tensor coordinate + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + + // boundary check + if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { + return; + } + + float4 maximum = (float4)(-10000.0f); + int xs = X * stride.x + padding.x; + int ys = Y * stride.y + padding.y; + + for (int kx = 0; kx < kernel_size.x; ++kx) { + int x_c = xs + kx; + if (x_c < 0 || x_c >= input_shape.x) { + continue; + } + for (int ky = 0; ky < kernel_size.y; ++ky) { + int y_c = ys + ky; + if (y_c < 0 || y_c >= input_shape.y) { + continue; + } + float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z)); + maximum = max(src, maximum); + } + } + write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index b6b9131db2e237799cc0c9c24f0bfc3dd50d8628..8cd3113797d3401dc0e4038d3d38f174bb2b55a8 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -13,13 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +#include "src/runtime/kernel/opencl/kernel/arithmetic.h" #include #include #include #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "src/runtime/kernel/opencl/utils.h" -#include "src/runtime/kernel/opencl/kernel/arithmetic.h" #ifndef PROGRAM_WITH_IL #include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc" #include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc" @@ -41,8 +42,8 @@ std::vector ArithmeticOpenCLKernel::InitGlobalSize() const { void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { global_size_ = InitGlobalSize(); int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())()); - local_size_ = GetLocalSize(global_size_, max_work_group_size); - global_size_ = GetGlobalSize(local_size_, global_size_); + local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); + global_size_ = GetCommonGlobalSize(local_size_, global_size_); } void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index fa969c31f5d269cbd1f24a92a975040fba102f95..e7a89821c05862e1149dcb163ec0070de06ad9d7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -31,12 +31,11 @@ #endif - -using mindspore::schema::PrimitiveType_DepthwiseConv2D; using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { @@ -117,11 +116,9 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() { return RET_OK; } -int DepthwiseConv2dOpenCLKernel::ReSize() { - return RET_OK; -} +int DepthwiseConv2dOpenCLKernel::ReSize() { return RET_OK; } -int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector* img_size) { +int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); size_t im_dst_x, im_dst_y; if (inputs_[0]->GetFormat() == schema::Format_NHWC4) { @@ -141,16 +138,18 @@ int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector* i *img_size = vec; return RET_OK; } -int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector* global_size) { + +int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector *global_size) { size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); - std::vector global = {(size_t) outputs_[0]->Width(), (size_t) outputs_[0]->Height(), CO4}; + std::vector global = {(size_t)outputs_[0]->Width(), (size_t)outputs_[0]->Height(), CO4}; *global_size = std::move(global); return RET_OK; } -int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector& global_size, - std::vector* local_size) { + +int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector &global_size, + std::vector *local_size) { size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); - std::vector local = {1, 1, CO4}; + std::vector local = {1, 1, CO4}; *local_size = std::move(local); return RET_OK; } @@ -161,8 +160,8 @@ int DepthwiseConv2dOpenCLKernel::Run() { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); size_t CI4 = UP_DIV(inputs_[0]->Channel(), C4NUM); - std::vector global = {(size_t) outputs_[0]->Width(), (size_t) outputs_[0]->Height(), CO4}; - std::vector local; + std::vector global = {(size_t)outputs_[0]->Width(), (size_t)outputs_[0]->Height(), CO4}; + std::vector local; GetLocalSize(0, global, &local); float relu_clip1 = 6.0; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h index 90055dd1c331103f70d90ebca2487c2fc46e2850..adad41f6c82d031e90d43ebb5d3cc0b57f58394b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h @@ -28,11 +28,10 @@ namespace mindspore::kernel { class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { public: explicit DepthwiseConv2dOpenCLKernel(OpParameter *parameter, const std::vector &inputs, - const std::vector &outputs) - : OpenCLKernel(parameter, inputs, outputs), - packed_weight_(nullptr), bias_data_(nullptr), kernel_(nullptr) {} + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs), packed_weight_(nullptr), bias_data_(nullptr), kernel_(nullptr) {} - ~DepthwiseConv2dOpenCLKernel() override {}; + ~DepthwiseConv2dOpenCLKernel() override{}; int Init() override; @@ -42,20 +41,16 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { int InitBuffer(); - int GetImageSize(size_t idx, std::vector* img_size) override; - int GetGlobalSize(size_t idx, std::vector* global_size) override; - int GetLocalSize(size_t idx, const std::vector& global_size, - std::vector* local_size) override; + int GetImageSize(size_t idx, std::vector *img_size) override; + int GetGlobalSize(size_t idx, std::vector *global_size) override; + int GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) override; private: FLOAT_t *packed_weight_; FLOAT_t *bias_data_; cl::Kernel kernel_; - enum class MEM_TYPE { - BUF, IMG - } mem_type_{MEM_TYPE::IMG}; + enum class MEM_TYPE { BUF, IMG } mem_type_{MEM_TYPE::IMG}; }; } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_DEPTHWISE_H_ - diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index 805376690e7344adff62bcfcfbae9d9f7539e67c..476ff23dbc901716a14765263373060ca79b8285 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -64,12 +64,18 @@ int PoolingOpenCLKernel::Init() { #ifdef PROGRAM_WITH_IL ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); #else + if (mem_type_ == MEM_TYPE::BUF) { + kernel_name += "_BUF"; + } else { + kernel_name += "_IMG"; + } std::set build_options; ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif outputs_[0]->SetFormat(schema::Format_NHWC4); MS_LOG(DEBUG) << kernel_name << " Init Done!"; + return RET_OK; } @@ -81,8 +87,30 @@ std::vector PoolingOpenCLKernel::InitGlobalSize() const { return global; } -int PoolingOpenCLKernel::InitBuffer() { return 0; } -int PoolingOpenCLKernel::ReSize() { return 0; } +int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); + size_t im_dst_x, im_dst_y; + if (inputs_[0]->GetFormat() == schema::Format_NHWC4) { + im_dst_x = outputs_[0]->Height(); + im_dst_y = outputs_[0]->Width() * CO4; + } else { + im_dst_y = outputs_[0]->Width(); + im_dst_x = outputs_[0]->Height() * CO4; + } +#ifdef ENABLE_FP16 + size_t img_dtype = CL_HALF_FLOAT; +#else + size_t img_dtype = CL_FLOAT; +#endif + img_size->clear(); + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} + +int PoolingOpenCLKernel::InitBuffer() { return RET_OK; } + +int PoolingOpenCLKernel::ReSize() { return RET_OK; } int PoolingOpenCLKernel::Run() { MS_LOG(DEBUG) << this->Name() << " Running!"; @@ -110,12 +138,11 @@ int PoolingOpenCLKernel::Run() { std::vector local_size; std::vector global_size = InitGlobalSize(); int max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime->Device())()); - local_size = GetLocalSize(global_size, max_work_group_size); - global_size = GetGlobalSize(local_size, global_size); + local_size = GetCommonLocalSize(global_size, max_work_group_size); + global_size = GetCommonGlobalSize(local_size, global_size); // run opengl kernel ocl_runtime->RunKernel(kernel_, global_size, local_size, nullptr); - return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h index d81ce4c56911d3528a76002f06f4c49b07360a8f..b08b0003080d1821fe3bf25251ae60b9343ef9d2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h @@ -25,11 +25,11 @@ namespace mindspore::kernel { -class PoolingOpenCLKernel : public LiteKernel { +class PoolingOpenCLKernel : public OpenCLKernel { public: explicit PoolingOpenCLKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs) - : LiteKernel(parameter, inputs, outputs) { + : OpenCLKernel(parameter, inputs, outputs) { parameter_ = reinterpret_cast(parameter); } ~PoolingOpenCLKernel() override{}; @@ -38,10 +38,11 @@ class PoolingOpenCLKernel : public LiteKernel { int ReSize() override; int Run() override; int InitBuffer(); + int GetImageSize(size_t idx, std::vector *img_size) override; private: std::vector InitGlobalSize() const; - + enum class MEM_TYPE { BUF, IMG } mem_type_{MEM_TYPE::IMG}; PoolingParameter *parameter_; cl::Kernel kernel_; }; @@ -49,4 +50,3 @@ class PoolingOpenCLKernel : public LiteKernel { } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_POOLING_H_ - diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc index 3940f0db886c7d612c92a35f30719cc271f7d052..1ce176ca22a3407c3f8c807a232ebac7df8f2d0c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc @@ -22,7 +22,7 @@ namespace mindspore { namespace kernel { -std::vector GetGlobalSize(const std::vector &local, const std::vector &global) { +std::vector GetCommonGlobalSize(const std::vector &local, const std::vector &global) { std::vector result(3, 1); for (int i = 0; i < 3; ++i) { result[i] = AlignByN(global[i], local[i]); @@ -30,7 +30,7 @@ std::vector GetGlobalSize(const std::vector &local, const std::v return result; } -std::vector GetLocalSize(const std::vector &global, int max_size) { +std::vector GetCommonLocalSize(const std::vector &global, int max_size) { size_t wg_z = GetBiggestDividerWithPriority(global[2], 8); size_t wg_xy_size = max_size / wg_z; size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size); diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index 83536823eec97897263242fdae62fa9c2987dbbd..1593a93ee1d465e2881cec87e78e8930dea28caa 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -75,10 +75,10 @@ T AlignByN(T number, N n) { } // GetGlobalSize -std::vector GetGlobalSize(const std::vector &local, const std::vector &global); +std::vector GetCommonGlobalSize(const std::vector &local, const std::vector &global); // GetLocalSize -std::vector GetLocalSize(const std::vector &global, int max_size); +std::vector GetCommonLocalSize(const std::vector &global, int max_size); std::string CLErrorCode(cl_int error_code); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc index 365e87793452d9da6386a4e576a771cfc877bc5c..d90be66a4c914d1eaad9b1675dc9ec1823d3368e 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc @@ -43,33 +43,42 @@ TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) { MS_LOG(INFO) << "ocl runtime"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); MS_LOG(INFO) << "PoolingParameter"; auto param = new PoolingParameter; InitParameter(param); // define tensor - MS_LOG(INFO) << "define tensor"; + MS_LOG(INFO) << "define tensor1"; std::vector input_shape = {1, 16, 256, 192}; std::vector output_shape = {1, 8, 128, 192}; auto data_type = kNumberTypeFloat32; auto tensorType = schema::NodeType_ValueNode; + MS_LOG(INFO) << "define tensor2"; auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC4, tensorType); auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensorType); + MS_LOG(INFO) << "define input"; std::vector inputs{input_tensor}; std::vector outputs{output_tensor}; // run + MS_LOG(INFO) << "pooling_kernel"; auto *pooling_kernel = new kernel::PoolingOpenCLKernel(reinterpret_cast(param), inputs, outputs); + MS_LOG(INFO) << "pooling_kernel init"; pooling_kernel->Init(); + std::vector kernels{pooling_kernel}; + inputs[0]->MallocData(allocator); auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + MS_LOG(INFO) << "pGraph init"; pGraph->Init(); // load data - MS_LOG(INFO) << "load data"; + MS_LOG(INFO) << "load data1"; std::string input_file = "maxpool_in.bin"; std::string expect_file = "maxpool_out.bin"; + MS_LOG(INFO) << "load data2"; LoadTestData(input_tensor->Data(), input_tensor->Size(), input_file); auto *input_data = reinterpret_cast(input_tensor->Data()); printf("input[0:10]:"); @@ -81,6 +90,7 @@ TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) { pGraph->Run(); MS_LOG(INFO) << "compare result"; + std::cout << "compare result" << std::endl; CompareOutput(output_tensor, expect_file); } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc index f928d4f60c9b58f87820aaf7cb70fdae61fcf643..e834e29b079dfc556a783d152c039fbe200b78eb 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc @@ -24,9 +24,14 @@ namespace mindspore { void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) { if (file_path.empty()) { - memset(dst, dst_size, dst_size); + memset(dst, 0x00, dst_size); } else { - memcpy(dst, reinterpret_cast(dst_size), dst_size); + auto src_data = reinterpret_cast(mindspore::lite::ReadFile(file_path.c_str(), &dst_size)); + if (src_data != nullptr) { + memcpy(dst, src_data, dst_size); + } else { + MS_LOG(ERROR) << "read file empty."; + } } }