diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl index 4e9a8422be6b3044a2fa8d4863ba98a75039dab3..691d91ef9beec3451099950b24f1d044a36c6fc2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl @@ -26,26 +26,9 @@ __kernel void ElementDiv(__global float *input_a, __global float *input_b, __glo output[idx] = input_a[idx] * input_b[idx]; } -__kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) { +__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output, + const unsigned int n) { int idx = get_global_id(0); if (idx >= n) return; - output[idx] = input_a[idx] + input_b; -} - -__kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] - input_b; -} - -__kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] * input_b; -} - -__kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] * input_b; + output[idx] = weight * input_a[idx] + bias; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl index 12f75438f96a8e24d0af817b6e9f3754ae5a2707..33d6143f9d4bf91be493a9685881887870f52a22 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl @@ -1,15 +1,65 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; -__kernel void ElementAdd(__read_only image2d_t *input_a, __read_only image2d_t *input_b, __write_only image2d_t *output, - const int4 output_shape) { +__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape) { int X = get_global_id(0); int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return; + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } - if (idx >= n) return; - float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z)); - float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z)); - src = a + b; - write_imagef(output, (int2)(0, 0), src); + float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); + float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); + write_imagef(output, (int2)(X, Y), a + b); +} + +__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); + float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); + write_imagef(output, (int2)(X, Y), a - b); +} + +__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); + float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); + write_imagef(output, (int2)(X, Y), a * b); +} + +__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); + float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); + write_imagef(output, (int2)(X, Y), a / b); +} + +__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); + write_imagef(output, (int2)(X, Y), weight * a + bias); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 8cd3113797d3401dc0e4038d3d38f174bb2b55a8..1deeaf51b44092a7d2b2419e3bbd08a995502594 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -40,10 +40,10 @@ std::vector ArithmeticOpenCLKernel::InitGlobalSize() const { } void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { - global_size_ = InitGlobalSize(); - int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())()); - local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); - global_size_ = GetCommonGlobalSize(local_size_, global_size_); + size_t H = outputs_[0]->Batch() * outputs_[0]->Height(); + size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); + local_size_ = {16, 16}; + global_size_ = {H, W}; } void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() { @@ -51,63 +51,75 @@ void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() { global_size_ = {element_num}; } +int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector* img_size) { + size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); + int H = outputs_[0]->Batch() * outputs_[0]->Height(); + int W = outputs_[0]->Width() * CO4; + size_t im_dst_x, im_dst_y; + if (inputs_[0]->GetFormat() == schema::Format_NHWC4) { + im_dst_x = W; + im_dst_y = H; + } else { + im_dst_y = outputs_[0]->Batch() * outputs_[0]->Height() * CO4; + im_dst_x = outputs_[0]->Width(); + } +#ifdef ENABLE_FP16 + size_t img_dtype = CL_HALF_FLOAT; +#else + size_t img_dtype = CL_FLOAT; +#endif + img_size->clear(); + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return 0; +} + int ArithmeticOpenCLKernel::Init() { runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); - std::string element_name; - std::string boardcast_name; + std::string kernel_name; if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) { element_flag_ = false; + kernel_name = "BoardcastArith"; } else { element_flag_ = true; + switch (opParameter->type_) { + case PrimitiveType_Mul: + kernel_name = "ElementMul"; + break; + case PrimitiveType_Add: + kernel_name = "ElementAdd"; + break; + case PrimitiveType_Sub: + kernel_name = "ElementSub"; + break; + case PrimitiveType_Div: + kernel_name = "ElementDiv"; + break; + default: + MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; + break; + } } - switch (opParameter->type_) { - case PrimitiveType_Mul: - element_name = "ElementMul"; - boardcast_name = "BoardcastMul"; - break; - case PrimitiveType_Add: - element_name = "ElementAdd"; - boardcast_name = "BoardcastAdd"; - break; - case PrimitiveType_Sub: - element_name = "ElementSub"; - boardcast_name = "BoardcastSub"; - break; - case PrimitiveType_Div: - element_name = "ElementDiv"; - boardcast_name = "BoardcastDiv"; - break; - default: - MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; - break; - } #ifdef PROGRAM_WITH_IL runtime_->CreateKernelFromIL(kernel_(), kernel_name); #else std::string program_name = "Arithmetic"; std::set build_options; - std::string source = arithmetic_buffer_source_fp32; + std::string source = arithmetic_image2d_source_fp32; runtime_->LoadSource(program_name, source); - - if (element_flag_) { - runtime_->BuildKernel(kernel_, program_name, element_name, build_options); - MS_LOG(DEBUG) << element_name << " Init Done!"; - } else { - runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options); - MS_LOG(DEBUG) << boardcast_name << " Init Done!"; - } + runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif outputs_[0]->SetFormat(schema::Format_NHWC4); + Image2dGetWorkGroupSize(); return 0; } int ArithmeticOpenCLKernel::Run() { MS_LOG(DEBUG) << this->Name() << " Running!"; auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); - BufferGetWorkGroupSize(); int arg_idx = 0; uint32_t element_num = outputs_[0]->ElementsC4Num(); @@ -116,11 +128,34 @@ int ArithmeticOpenCLKernel::Run() { if (element_flag_) { runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data()); } else { - runtime_->SetKernelArg(kernel_, arg_idx++, static_cast(inputs_[1]->Data())[0]); + float value = static_cast(inputs_[1]->Data())[0]; + switch (opParameter->type_) { + case PrimitiveType_Mul: + weight_ = value; + break; + case PrimitiveType_Add: + bias_ = value; + break; + case PrimitiveType_Sub: + bias_ = -1 * value; + break; + case PrimitiveType_Div: + bias_ = 1 / value; + break; + default: + MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; + break; + } + runtime_->SetKernelArg(kernel_, arg_idx++, weight_); + runtime_->SetKernelArg(kernel_, arg_idx++, bias_); + MS_LOG(DEBUG) << arg_idx-2 << " " << weight_; + MS_LOG(DEBUG) << arg_idx-1 << " " << bias_; } runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data()); - runtime_->SetKernelArg(kernel_, arg_idx++, element_num); - + int H = outputs_[0]->Batch() * outputs_[0]->Height(); + int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); + cl_int2 output_shape{H, W}; + runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); return 0; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index 37143775b0c1bac1baf5fefcd591e62ba6246645..5210f8e771885b3db0373c1179b574e62bca43a6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -24,15 +24,16 @@ namespace mindspore::kernel { -class ArithmeticOpenCLKernel : public ArithmeticCPUKernel { +class ArithmeticOpenCLKernel : public OpenCLKernel { public: explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const lite::Context *ctx) - : ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {} + : OpenCLKernel(parameter, inputs, outputs) {} ~ArithmeticOpenCLKernel() override{}; int Init() override; int Run() override; + int GetImageSize(size_t idx, std::vector* img_size) override; private: std::vector InitGlobalSize() const; @@ -42,6 +43,8 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel { cl::Kernel kernel_; lite::opencl::OpenCLRuntime *runtime_; bool element_flag_{true}; + float weight_{1.f}; + float bias_{.0f}; std::vector local_size_; std::vector global_size_; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc index e7fccfe1a3edc43763d85bbe729e8a4c51ef7e0d..5d9e99dd0b81cab6cbbd5d4b74985ab09180c5bb 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc @@ -61,13 +61,12 @@ void LogData(void *data, const int size, const std::string prefix) { } void TestCase(const std::vector &shape_a, const std::vector &shape_b) { - std::cout << "TestCase" << std::endl; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto allocator = ocl_runtime->GetAllocator(); bool is_bias_add = shape_b.empty(); auto tensorType = schema::NodeType_ValueNode; - std::cout << "TestCase tensor" << std::endl; lite::tensor::Tensor *tensor_a = new lite::tensor::Tensor(kNumberTypeFloat32, shape_a, schema::Format_NHWC4, tensorType); lite::tensor::Tensor *tensor_b = @@ -77,7 +76,6 @@ void TestCase(const std::vector &shape_a, const std::vector &shape_b) int64_t element_num = tensor_a->ElementsC4Num(); int64_t element_num_b = is_bias_add ? 1 : tensor_b->ElementsC4Num(); - std::cout << "TestCase new data" << std::endl; float *data_a = new float[element_num]; float *data_b = new float[element_num_b]; float *data_c_cpu = new float[element_num]; @@ -87,14 +85,12 @@ void TestCase(const std::vector &shape_a, const std::vector &shape_b) InitData(data_b, element_num_b); memset(data_c_ocl, 0, sizeof(float) * element_num); - std::cout << "TestCase run cpu" << std::endl; if (is_bias_add) { BoardcaseAdd(data_a, static_cast(data_b)[0], data_c_cpu, element_num); } else { ElementAdd(data_a, data_b, data_c_cpu, element_num); } - std::cout << "TestCase set data" << std::endl; std::vector inputs = {tensor_a}; if (!is_bias_add) { inputs.push_back(tensor_b); @@ -114,9 +110,10 @@ void TestCase(const std::vector &shape_a, const std::vector &shape_b) new kernel::ArithmeticOpenCLKernel(reinterpret_cast(param), arithmetic_inputs, outputs, &ctx); arith_kernel->Init(); + tensor_a->MallocData(allocator); + tensor_b->MallocData(allocator); std::vector kernels{arith_kernel}; auto *kernel = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - std::cout << "TestCase Init" << std::endl; kernel->Init(); memcpy(inputs[0]->Data(), data_a, sizeof(float) * element_num); @@ -124,7 +121,6 @@ void TestCase(const std::vector &shape_a, const std::vector &shape_b) memcpy(inputs[1]->Data(), data_b, sizeof(float) * element_num_b); } - std::cout << "TestCase Run" << std::endl; kernel->Run(); memcpy(data_c_ocl, outputs[0]->Data(), sizeof(float) * element_num); @@ -136,7 +132,6 @@ void TestCase(const std::vector &shape_a, const std::vector &shape_b) LogData(outputs[0]->Data(), 10, "OpenCL compute : "); bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num); MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!"); - std::cout << "TestCase End" << std::endl; // free delete[] data_a; @@ -162,15 +157,15 @@ class TestArithmeticOpenCL : public mindspore::Common { }; TEST_F(TestArithmeticOpenCL, AddElementwiseTest) { - const std::vector &shape_a = {1, 32, 32, 4}; - const std::vector &shape_b = {1, 32, 32, 4}; + const std::vector &shape_a = {1, 1024, 1024, 4}; + const std::vector &shape_b = {1, 1024, 1024, 4}; TestCase(shape_a, shape_b); } -// TEST_F(TestOpenCLKernel, AddBoardcaseTest) { -// const std::vector &shape_a = {1, 4, 128, 128}; -// const std::vector &shape_b = {}; -// TestCase(shape_a, shape_b); -//} +TEST_F(TestArithmeticOpenCL, AddBoardcaseTest) { + const std::vector &shape_a = {1, 128, 128, 4}; + const std::vector &shape_b = {}; + TestCase(shape_a, shape_b); +} } // namespace mindspore