Commit a6dc9490, authored by mindspore-ci-bot, committed by Gitee

!4138 change buffer to image2d for arithmetic

Merge pull request !4138 from liuchao/arith_image
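
Note: with a constant scalar operand, all four elementwise ops reduce to the single affine form output = weight * a + bias, which is why the four Boardcast* kernels below collapse into one BoardcastArith kernel. A minimal host-side sketch of that fold (FoldScalarOp is a hypothetical helper, not part of this patch):

  #include <utility>
  // Fold "a <op> b" with scalar b into (weight, bias) so that
  // result == weight * a + bias, the BoardcastArith contract.
  std::pair<float, float> FoldScalarOp(char op, float b) {
    switch (op) {
      case '+': return {1.0f, b};         // a + b
      case '-': return {1.0f, -b};        // a - b
      case '*': return {b, 0.0f};         // a * b
      case '/': return {1.0f / b, 0.0f};  // a / b == (1 / b) * a
      default:  return {1.0f, 0.0f};      // identity
    }
  }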
...@@ -26,26 +26,9 @@ __kernel void ElementDiv(__global float *input_a, __global float *input_b, __glo
   output[idx] = input_a[idx] * input_b[idx];
 }
-__kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] + input_b;
-}
-__kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] - input_b;
-}
-__kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b;
-}
-__kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b;
-}
+__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
+                             const unsigned int n) {
+  int idx = get_global_id(0);
+  if (idx >= n) return;
+  output[idx] = weight * input_a[idx] + bias;
+}
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-__kernel void ElementAdd(__read_only image2d_t *input_a, __read_only image2d_t *input_b, __write_only image2d_t *output,
-                         const int4 output_shape) {
+__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return;
-  if (idx >= n) return;
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z));
-  src = a + b;
-  write_imagef(output, (int2)(0, 0), src);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a + b);
+}
+__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a - b);
+}
+__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a * b);
+}
+__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a / b);
+}
+__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
+                             const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), weight * a + bias);
 }
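
Note: the image2d kernels assume the NHWC4 layout that GetImageSize below reports: each RGBA texel packs four consecutive channels, so the image is W * ceil(C / 4) texels wide and N * H texels tall. A sketch of that index math (illustration only, under the stated layout assumption):

  // Map an NHWC coordinate to the texel that holds it in the NHWC4 image.
  struct Texel { int x; int y; };
  Texel Nhwc4ToImage2d(int n, int h, int w, int c, int H, int C) {
    int c4 = (C + 3) / 4;    // number of 4-channel slices
    return {w * c4 + c / 4,  // x: pixel column, then channel slice
            n * H + h};      // y: batches stacked over height
  }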
...@@ -40,10 +40,10 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
 }

 void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
-  global_size_ = InitGlobalSize();
-  int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
-  local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
-  global_size_ = GetCommonGlobalSize(local_size_, global_size_);
+  size_t H = outputs_[0]->Batch() * outputs_[0]->Height();
+  size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
+  local_size_ = {16, 16};
+  global_size_ = {H, W};
 }

 void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
...@@ -51,63 +51,75 @@ void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
   global_size_ = {element_num};
 }
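
Note: a fixed local size of {16, 16} with global size {H, W} only satisfies core OpenCL (which requires each global dimension to be a multiple of the local one) when H and W happen to be multiples of 16; presumably RunKernel aligns the global size internally. The usual alignment, as a sketch:

  #include <cstddef>
  // Round a global dimension up to the next multiple of the local size; the
  // kernels' bounds checks (X >= output_shape.x ...) absorb the padding threads.
  size_t AlignGlobal(size_t global, size_t local) {
    return (global + local - 1) / local * local;
  }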
+int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t>* img_size) {
+  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
+  int H = outputs_[0]->Batch() * outputs_[0]->Height();
+  int W = outputs_[0]->Width() * CO4;
+  size_t im_dst_x, im_dst_y;
+  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
+    im_dst_x = W;
+    im_dst_y = H;
+  } else {
+    im_dst_y = outputs_[0]->Batch() * outputs_[0]->Height() * CO4;
+    im_dst_x = outputs_[0]->Width();
+  }
+#ifdef ENABLE_FP16
+  size_t img_dtype = CL_HALF_FLOAT;
+#else
+  size_t img_dtype = CL_FLOAT;
+#endif
+  img_size->clear();
+  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
+  *img_size = vec;
+  return 0;
+}
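
Note: GetImageSize hands the allocator a (width, height, dtype) triple; with ENABLE_FP16 the image holds half floats, otherwise floats. For illustration, creating such an image with the standard OpenCL C++ bindings would look like this (not patch code; MindSpore's allocator wraps the equivalent call):

  #include <CL/cl2.hpp>
  // RGBA image: one texel per 4-channel slice, dtype CL_FLOAT or CL_HALF_FLOAT.
  cl::Image2D MakeImage(const cl::Context &ctx, size_t w, size_t h, cl_channel_type dtype) {
    return cl::Image2D(ctx, CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, dtype), w, h);
  }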
 int ArithmeticOpenCLKernel::Init() {
   runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
-  std::string element_name;
-  std::string boardcast_name;
+  std::string kernel_name;
   if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) {
     element_flag_ = false;
+    kernel_name = "BoardcastArith";
   } else {
     element_flag_ = true;
-  }
-  switch (opParameter->type_) {
-    case PrimitiveType_Mul:
-      element_name = "ElementMul";
-      boardcast_name = "BoardcastMul";
-      break;
-    case PrimitiveType_Add:
-      element_name = "ElementAdd";
-      boardcast_name = "BoardcastAdd";
-      break;
-    case PrimitiveType_Sub:
-      element_name = "ElementSub";
-      boardcast_name = "BoardcastSub";
-      break;
-    case PrimitiveType_Div:
-      element_name = "ElementDiv";
-      boardcast_name = "BoardcastDiv";
-      break;
-    default:
-      MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
-      break;
+    switch (opParameter->type_) {
+      case PrimitiveType_Mul:
+        kernel_name = "ElementMul";
+        break;
+      case PrimitiveType_Add:
+        kernel_name = "ElementAdd";
+        break;
+      case PrimitiveType_Sub:
+        kernel_name = "ElementSub";
+        break;
+      case PrimitiveType_Div:
+        kernel_name = "ElementDiv";
+        break;
+      default:
+        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
+        break;
+    }
   }
 #ifdef PROGRAM_WITH_IL
   runtime_->CreateKernelFromIL(kernel_(), kernel_name);
 #else
   std::string program_name = "Arithmetic";
   std::set<std::string> build_options;
-  std::string source = arithmetic_buffer_source_fp32;
+  std::string source = arithmetic_image2d_source_fp32;
   runtime_->LoadSource(program_name, source);
-  if (element_flag_) {
-    runtime_->BuildKernel(kernel_, program_name, element_name, build_options);
-    MS_LOG(DEBUG) << element_name << " Init Done!";
-  } else {
-    runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options);
-    MS_LOG(DEBUG) << boardcast_name << " Init Done!";
-  }
+  runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
   outputs_[0]->SetFormat(schema::Format_NHWC4);
+  Image2dGetWorkGroupSize();
   return 0;
 }
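
Note: the PROGRAM_WITH_IL branch loads a precompiled kernel, while the default branch compiles the .cl source at runtime via LoadSource/BuildKernel. In raw OpenCL terms the two paths correspond to the following standard entry points (illustration only, not patch code):

  #include <CL/cl.h>
  #include <cstring>
  // Runtime-compile path: build a program from .cl source text.
  cl_program BuildFromSource(cl_context ctx, cl_device_id dev, const char *src) {
    cl_int err = CL_SUCCESS;
    size_t len = strlen(src);
    cl_program p = clCreateProgramWithSource(ctx, 1, &src, &len, &err);
    clBuildProgram(p, 1, &dev, "", nullptr, nullptr);  // IL path would use clCreateProgramWithIL
    return p;
  }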
 int ArithmeticOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->Name() << " Running!";
   auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
-  BufferGetWorkGroupSize();
   int arg_idx = 0;
   uint32_t element_num = outputs_[0]->ElementsC4Num();
...@@ -116,11 +128,34 @@ int ArithmeticOpenCLKernel::Run() {
   if (element_flag_) {
     runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data());
   } else {
-    runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float *>(inputs_[1]->Data())[0]);
+    float value = static_cast<float *>(inputs_[1]->Data())[0];
+    switch (opParameter->type_) {
+      case PrimitiveType_Mul:
+        weight_ = value;
+        break;
+      case PrimitiveType_Add:
+        bias_ = value;
+        break;
+      case PrimitiveType_Sub:
+        bias_ = -1 * value;
+        break;
+      case PrimitiveType_Div:
+        weight_ = 1.f / value;  // a / b == (1 / b) * a
+        break;
+      default:
+        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
+        break;
+    }
+    runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
+    runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
+    MS_LOG(DEBUG) << arg_idx - 2 << " " << weight_;
+    MS_LOG(DEBUG) << arg_idx - 1 << " " << bias_;
   }
   runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
-  runtime_->SetKernelArg(kernel_, arg_idx++, element_num);
+  int H = outputs_[0]->Batch() * outputs_[0]->Height();
+  int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
+  cl_int2 output_shape{H, W};
+  runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
   runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
   return 0;
 }
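
Note: cl_int2 on the host is layout-compatible with the kernel's int2 parameter, so the {H, W} bounds can be passed by value through SetKernelArg. Illustration of the host-side type (not patch code):

  #include <CL/cl_platform.h>
  // cl_int2 exposes its components portably through the .s array.
  cl_int2 MakeShape(int h, int w) {
    cl_int2 shape;
    shape.s[0] = h;  // matches output_shape.x in the kernel
    shape.s[1] = w;  // matches output_shape.y in the kernel
    return shape;
  }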
...
...@@ -24,15 +24,16 @@
 namespace mindspore::kernel {

-class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
+class ArithmeticOpenCLKernel : public OpenCLKernel {
  public:
   explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                   const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
-      : ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {}
+      : OpenCLKernel(parameter, inputs, outputs) {}
   ~ArithmeticOpenCLKernel() override{};

   int Init() override;
   int Run() override;
+  int GetImageSize(size_t idx, std::vector<size_t>* img_size) override;

  private:
   std::vector<size_t> InitGlobalSize() const;
...@@ -42,6 +43,8 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
   cl::Kernel kernel_;
   lite::opencl::OpenCLRuntime *runtime_;
   bool element_flag_{true};
+  float weight_{1.f};
+  float bias_{.0f};
   std::vector<size_t> local_size_;
   std::vector<size_t> global_size_;
...
...@@ -61,13 +61,12 @@ void LogData(void *data, const int size, const std::string prefix) {
 }

 void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
-  std::cout << "TestCase" << std::endl;
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  auto allocator = ocl_runtime->GetAllocator();
   bool is_bias_add = shape_b.empty();
   auto tensorType = schema::NodeType_ValueNode;

-  std::cout << "TestCase tensor" << std::endl;
   lite::tensor::Tensor *tensor_a =
     new lite::tensor::Tensor(kNumberTypeFloat32, shape_a, schema::Format_NHWC4, tensorType);
   lite::tensor::Tensor *tensor_b =
...@@ -77,7 +76,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
   int64_t element_num = tensor_a->ElementsC4Num();
   int64_t element_num_b = is_bias_add ? 1 : tensor_b->ElementsC4Num();

-  std::cout << "TestCase new data" << std::endl;
   float *data_a = new float[element_num];
   float *data_b = new float[element_num_b];
   float *data_c_cpu = new float[element_num];
...@@ -87,14 +85,12 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
   InitData(data_b, element_num_b);
   memset(data_c_ocl, 0, sizeof(float) * element_num);

-  std::cout << "TestCase run cpu" << std::endl;
   if (is_bias_add) {
     BoardcaseAdd(data_a, static_cast<float *>(data_b)[0], data_c_cpu, element_num);
   } else {
     ElementAdd(data_a, data_b, data_c_cpu, element_num);
   }

-  std::cout << "TestCase set data" << std::endl;
   std::vector<lite::tensor::Tensor *> inputs = {tensor_a};
   if (!is_bias_add) {
     inputs.push_back(tensor_b);
...@@ -114,9 +110,10 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
     new kernel::ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(param), arithmetic_inputs, outputs, &ctx);
   arith_kernel->Init();

+  tensor_a->MallocData(allocator);
+  tensor_b->MallocData(allocator);

   std::vector<kernel::LiteKernel *> kernels{arith_kernel};
   auto *kernel = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  std::cout << "TestCase Init" << std::endl;
   kernel->Init();

   memcpy(inputs[0]->Data(), data_a, sizeof(float) * element_num);
...@@ -124,7 +121,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
     memcpy(inputs[1]->Data(), data_b, sizeof(float) * element_num_b);
   }

-  std::cout << "TestCase Run" << std::endl;
   kernel->Run();

   memcpy(data_c_ocl, outputs[0]->Data(), sizeof(float) * element_num);
...@@ -136,7 +132,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
   LogData(outputs[0]->Data(), 10, "OpenCL compute : ");
   bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num);
   MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!");
-  std::cout << "TestCase End" << std::endl;

   // free
   delete[] data_a;
...@@ -162,15 +157,15 @@ class TestArithmeticOpenCL : public mindspore::Common {
 };

 TEST_F(TestArithmeticOpenCL, AddElementwiseTest) {
-  const std::vector<int> &shape_a = {1, 32, 32, 4};
-  const std::vector<int> &shape_b = {1, 32, 32, 4};
+  const std::vector<int> &shape_a = {1, 1024, 1024, 4};
+  const std::vector<int> &shape_b = {1, 1024, 1024, 4};
   TestCase(shape_a, shape_b);
 }

-// TEST_F(TestOpenCLKernel, AddBoardcaseTest) {
-//  const std::vector<int> &shape_a = {1, 4, 128, 128};
-//  const std::vector<int> &shape_b = {};
-//  TestCase(shape_a, shape_b);
-//}
+TEST_F(TestArithmeticOpenCL, AddBoardcaseTest) {
+  const std::vector<int> &shape_a = {1, 128, 128, 4};
+  const std::vector<int> &shape_b = {};
+  TestCase(shape_a, shape_b);
+}

 }  // namespace mindspore
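
Note: the CPU baselines ElementAdd and BoardcaseAdd that TestCase compares against are defined earlier in this test file; assuming they are plain reference loops, they would look roughly like this (sketch only; the Ref-suffixed names are hypothetical):

  #include <cstdint>
  // Elementwise add and scalar-broadcast add over n floats.
  void ElementAddRef(const float *a, const float *b, float *out, int64_t n) {
    for (int64_t i = 0; i < n; ++i) out[i] = a[i] + b[i];
  }
  void BoardcaseAddRef(const float *a, float b, float *out, int64_t n) {
    for (int64_t i = 0; i < n; ++i) out[i] = a[i] + b;
  }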