提交 c01f0b66 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!4623 opencl convolution kernel support winograd

Merge pull request !4623 from 王东旭/opencl_winograd
...@@ -40,12 +40,59 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { ...@@ -40,12 +40,59 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
private: private:
int CI_SLICES;
int CO_SLICES;
float *packed_weight_ = nullptr; float *packed_weight_ = nullptr;
float *packed_bias_ = nullptr; float *packed_bias_ = nullptr;
cl::Kernel kernel_;
std::string CodeGen(); bool use_winograd_ = false;
int GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local); int TILES_X;
int TILES_Y;
int TILES_XY;
void *winograd_mem0_ = nullptr;
void *winograd_mem1_ = nullptr;
cl::Kernel kernel_4x4to36;
cl::Kernel kernel_conv;
cl::Kernel kernel_36to4x4;
std::string CodeGenConvolution();
std::string CodeGenWinograd4x4To36();
std::string CodeGenWinogradConvolution();
std::string CodeGenWinograd36To4x4();
int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
bool UseWinograd4x4To6x6() {
  // Decide whether the Winograd F(4x4, 3x3) path applies: it is only valid
  // for 3x3, stride-1, dilation-1 convolutions, and only profitable when
  // there are enough channel slices and spatial tiles to amortize the
  // input/output transforms.
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
  const bool kernel_is_3x3 = conv_param->kernel_h_ == 3 && conv_param->kernel_w_ == 3;
  const bool no_dilation = conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1;
  const bool unit_stride = conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1;
  // NOTE(review): thresholds (16 slices, 32 tiles) look empirically tuned — confirm against benchmarks.
  const bool enough_channels = CO_SLICES % 4 == 0 && CI_SLICES >= 16 && CO_SLICES >= 16;
  const bool enough_tiles = TILES_X * TILES_Y >= 32;
  return kernel_is_3x3 && no_dilation && unit_stride && enough_channels && enough_tiles;
}
std::vector<float> MatrixMultiply(const std::vector<float> &A, const std::vector<float> &B, int M, int N, int K) {
  // Naive row-major GEMM used to precompute Winograd transform matrices:
  // A is MxN, B is NxK, and the returned C is MxK. Sizes here are tiny
  // (transform matrices), so the O(M*N*K) triple loop is fine.
  std::vector<float> C(M * K);
  for (int row = 0; row < M; ++row) {
    const float *a_row = A.data() + row * N;
    for (int col = 0; col < K; ++col) {
      float acc = 0.0f;
      for (int t = 0; t < N; ++t) {
        acc += a_row[t] * B[t * K + col];
      }
      C[row * K + col] = acc;
    }
  }
  return C;
}
static int GetBiggestDivider(int x, int y) {
  // Returns the largest positive divisor of x that is <= y, or 1 if none.
  // The loop is bounded to i > 0: the previous condition (i != 0) never
  // terminated for y < 0, decrementing past zero through negative values,
  // and could return a negative "divisor". With i > 0, any y <= 0 falls
  // through to the safe default of 1.
  for (int i = y; i > 0; i--) {
    if (x % i == 0) {
      return i;
    }
  }
  return 1;
}
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel
......
...@@ -113,6 +113,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size) ...@@ -113,6 +113,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
UnLock(); UnLock();
return nullptr; return nullptr;
} }
MS_LOG(DEBUG) << "Malloc a new Image2D, width=" << img_size[0] << ", height=" << img_size[1];
image_ptr = static_cast<void *>(image); image_ptr = static_cast<void *>(image);
} }
} }
......
...@@ -71,6 +71,10 @@ void OpenCLRuntime::DeleteInstance() { ...@@ -71,6 +71,10 @@ void OpenCLRuntime::DeleteInstance() {
OpenCLRuntime::OpenCLRuntime() { default_build_opts_ = " -cl-mad-enable -cl-fast-relaxed-math -Werror"; } OpenCLRuntime::OpenCLRuntime() { default_build_opts_ = " -cl-mad-enable -cl-fast-relaxed-math -Werror"; }
void printf_callback(const char *buffer, size_t length, size_t final, void *user_data) {
  // Relays device-side printf text verbatim to the host's stdout. Matches
  // the CL_PRINTF_CALLBACK_ARM callback signature; `final` and `user_data`
  // are required by that signature but unused here.
  (void)final;
  (void)user_data;
  fwrite(buffer, 1, length, stdout);
}
// Init will get platforms info, get devices info, create opencl context. // Init will get platforms info, get devices info, create opencl context.
int OpenCLRuntime::Init() { int OpenCLRuntime::Init() {
std::unique_lock<std::mutex> lck(g_init_mtx); std::unique_lock<std::mutex> lck(g_init_mtx);
...@@ -147,6 +151,9 @@ int OpenCLRuntime::Init() { ...@@ -147,6 +151,9 @@ int OpenCLRuntime::Init() {
} }
#else #else
MS_LOG(INFO) << "Create common opencl context"; MS_LOG(INFO) << "Create common opencl context";
// cl_context_properties context_prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](),
// CL_PRINTF_CALLBACK_ARM, (cl_context_properties)printf_callback, 0};
// context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, context_prop, nullptr, nullptr, &err);
context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &err); context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &err);
#endif #endif
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
......
...@@ -63,9 +63,26 @@ void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &fil ...@@ -63,9 +63,26 @@ void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &fil
printf("compare success!\n\n\n"); printf("compare success!\n\n\n");
} }
void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::string &input_file, void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::string &data_path,
const std::string &weight_file, const std::string &bias_file, const std::string &expect_file) { std::string attr_str) {
assert(data_format == schema::Format_NHWC || data_format == schema::Format_NHWC4); assert(data_format == schema::Format_NHWC || data_format == schema::Format_NHWC4);
auto param = new ConvParameter;
sscanf(attr_str.c_str(),
"inputNHWC_%dx%dx%dx%d_outputNHWC_%dx%dx%dx%d_kernelHW_%dx%d_strideHW_%dx%d_padTopBottomLeftRight_%dx%dx%dx%d_"
"dilationHW_%dx%d",
&param->input_batch_, &param->input_h_, &param->input_w_, &param->input_channel_, &param->output_batch_,
&param->output_h_, &param->output_w_, &param->output_channel_, &param->kernel_h_, &param->kernel_w_,
&param->stride_h_, &param->stride_w_, &param->pad_u_, &param->pad_d_, &param->pad_l_, &param->pad_r_,
&param->dilation_h_, &param->dilation_w_);
auto testcase_path = data_path + "/" + attr_str + "/";
auto input_file = testcase_path + (input_format == schema::Format_NHWC4 ? "input_NHWC4.bin" : "input_NHWC.bin");
auto weight_file = testcase_path + "weight_OHWI.bin";
auto bias_file = testcase_path + "bias_C4.bin";
auto expect_file = testcase_path + (output_format == schema::Format_NHWC4 ? "expect_NHWC4.bin" : "expect_NHWC.bin");
std::cout << input_file << std::endl;
std::cout << weight_file << std::endl;
std::cout << bias_file << std::endl;
std::cout << expect_file << std::endl;
std::cout << "initialize OpenCLRuntime"; std::cout << "initialize OpenCLRuntime";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
...@@ -79,10 +96,10 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri ...@@ -79,10 +96,10 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri
std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_}; std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
auto data_type = kNumberTypeFloat32; auto data_type = kNumberTypeFloat32;
auto tensorType = schema::NodeType_ValueNode; auto tensorType = schema::NodeType_ValueNode;
auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, data_format, tensorType); auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, input_format, tensorType);
auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensorType); auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensorType);
auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensorType); auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensorType);
auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, data_format, tensorType); auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, output_format, tensorType);
std::vector<lite::tensor::Tensor *> inputs{input_tensor, weight_tensor, bias_tensor}; std::vector<lite::tensor::Tensor *> inputs{input_tensor, weight_tensor, bias_tensor};
std::vector<lite::tensor::Tensor *> outputs{output_tensor}; std::vector<lite::tensor::Tensor *> outputs{output_tensor};
...@@ -114,7 +131,6 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri ...@@ -114,7 +131,6 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri
std::cout << "sub_graph->Run()"; std::cout << "sub_graph->Run()";
sub_graph->Run(); sub_graph->Run();
printf("output_tensor->Size() =%zu\n", output_tensor->Size());
std::cout << "compare result"; std::cout << "compare result";
MyCompareOutput(output_tensor, expect_file); MyCompareOutput(output_tensor, expect_file);
...@@ -131,57 +147,35 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri ...@@ -131,57 +147,35 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri
mindspore::lite::opencl::OpenCLRuntime::DeleteInstance(); mindspore::lite::opencl::OpenCLRuntime::DeleteInstance();
} }
std::array<std::string, 4> GenFilenames(ConvParameter *param, schema::Format data_format, const std::string &path) { TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
auto full_path = path + "inputNHWC_" + std::to_string(param->input_batch_) + "x" + std::to_string(param->input_h_) + TEST_MAIN(
"x" + std::to_string(param->input_w_) + "x" + std::to_string(param->input_channel_) + schema::Format_NHWC, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/",
"_outputNHWC_" + std::to_string(param->output_batch_) + "x" + std::to_string(param->output_h_) + "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_"
"x" + std::to_string(param->output_w_) + "x" + std::to_string(param->output_channel_) + "1x1");
"_kernelHW_" + std::to_string(param->kernel_h_) + "x" + std::to_string(param->kernel_w_) +
"_strideHW_" + std::to_string(param->stride_h_) + "x" + std::to_string(param->stride_w_) +
"_padTopBottomLeftRight_" + std::to_string(param->pad_u_) + "x" + std::to_string(param->pad_d_) +
"x" + std::to_string(param->pad_l_) + "x" + std::to_string(param->pad_r_) + "_dilationHW_1x1/";
if (data_format == schema::Format_NHWC4) {
return std::array<std::string, 4>{full_path + "input_NHWC4.bin", full_path + "weight_OHWI.bin",
full_path + "bias_C4.bin", full_path + "expect_NHWC4.bin"};
} else {
return std::array<std::string, 4>{full_path + "input_NHWC.bin", full_path + "weight_OHWI.bin",
full_path + "bias_C.bin", full_path + "expect_NHWC.bin"};
}
} }
TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { // TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) {
auto param = new ConvParameter; // TEST_MAIN(
param->input_batch_ = 1, param->input_h_ = 224, param->input_w_ = 224, param->input_channel_ = 3; // schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
param->output_batch_ = 1, param->output_h_ = 112, param->output_w_ = 112, param->output_channel_ = 32; // "inputNHWC_1x1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_"
param->kernel_h_ = 3, param->kernel_w_ = 3; // "1x1");
param->stride_h_ = 2, param->stride_w_ = 2; //}
param->pad_u_ = 0, param->pad_d_ = 1, param->pad_l_ = 0, param->pad_r_ = 1;
TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) {
auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/"); TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
// std::cout << filenames[0] << std::endl; "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
// std::cout << filenames[1] << std::endl; "dilationHW_1x1");
// std::cout << filenames[2] << std::endl; }
// std::cout << filenames[3] << std::endl; TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96) {
TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]); TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
lite::opencl::OpenCLRuntime::DeleteInstance(); "inputNHWC_1x16x256x100_outputNHWC_1x16x256x96_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1");
} }
TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) { TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128) {
auto param = new ConvParameter; TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
param->input_batch_ = 1, param->input_h_ = 1, param->input_w_ = 64, param->input_channel_ = 512; "inputNHWC_1x480x480x128_outputNHWC_1x480x480x128_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_"
param->output_batch_ = 1, param->output_h_ = 1, param->output_w_ = 64, param->output_channel_ = 7358; "1x1x1x1_dilationHW_1x1");
param->kernel_h_ = 1, param->kernel_w_ = 1;
param->stride_h_ = 1, param->stride_w_ = 1;
param->pad_u_ = 0, param->pad_d_ = 0, param->pad_l_ = 0, param->pad_r_ = 0;
auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/02_fp32/");
// std::cout << filenames[0] << std::endl;
// std::cout << filenames[1] << std::endl;
// std::cout << filenames[2] << std::endl;
// std::cout << filenames[3] << std::endl;
TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]);
lite::opencl::OpenCLRuntime::DeleteInstance();
} }
} // namespace mindspore } // namespace mindspore
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册