From e0fe902cf7cb6cc8c683cf7b6f4baf441d504e01 Mon Sep 17 00:00:00 2001
From: Yuan Shuai
Date: Fri, 14 Feb 2020 23:31:02 -0600
Subject: [PATCH] [cherry-pick][LITE][OPENCL] OpenCL Image2D model passed (#2891)

* mobilenetv1/v2,mnasnet,yolonano,sr models passed with opencl image2d kernel. test=develop

* Fix gflag shape define. test=develop
---
 lite/api/mobilenetv1_test.cc        |  9 ++++++++-
 lite/api/mobilenetv2_test.cc        | 16 +++++++++++++---
 lite/kernels/opencl/conv_compute.cc | 10 +++++++---
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc
index 085f7f3ad7..bcc9644f81 100644
--- a/lite/api/mobilenetv1_test.cc
+++ b/lite/api/mobilenetv1_test.cc
@@ -23,6 +23,10 @@
 #include "lite/core/op_registry.h"
 
 DEFINE_string(optimized_model, "", "optimized_model");
+DEFINE_int32(N, 1, "input_batch");
+DEFINE_int32(C, 3, "input_channel");
+DEFINE_int32(H, 224, "input_height");
+DEFINE_int32(W, 224, "input_width");
 
 namespace paddle {
 namespace lite {
@@ -37,7 +41,8 @@ void TestModel(const std::vector<Place>& valid_places,
   predictor.Build(model_dir, "", "", valid_places);
 
   auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+  input_tensor->Resize(DDim(
+      std::vector<int64_t>({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W})));
   auto* data = input_tensor->mutable_data<float>();
   auto item_size = input_tensor->dims().production();
   for (int i = 0; i < item_size; i++) {
@@ -58,6 +63,8 @@ void TestModel(const std::vector<Place>& valid_places,
     predictor.SaveModel(FLAGS_optimized_model);
   }
 
+  LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " "
+            << FLAGS_H << " " << FLAGS_W;
   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
             << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc
index 84bd27e352..012d6d48d9 100644
--- a/lite/api/mobilenetv2_test.cc
+++ b/lite/api/mobilenetv2_test.cc
@@ -23,6 +23,10 @@
 #include "lite/core/op_registry.h"
 
 DEFINE_string(optimized_model, "", "optimized_model");
+DEFINE_int32(N, 1, "input_batch");
+DEFINE_int32(C, 3, "input_channel");
+DEFINE_int32(H, 224, "input_height");
+DEFINE_int32(W, 224, "input_width");
 
 namespace paddle {
 namespace lite {
@@ -38,7 +42,8 @@ void TestModel(const std::vector<Place>& valid_places,
   predictor.Build(model_dir, "", "", valid_places);
 
   auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+  input_tensor->Resize(DDim(
+      std::vector<int64_t>({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W})));
   auto* data = input_tensor->mutable_data<float>();
   auto item_size = input_tensor->dims().production();
   for (int i = 0; i < item_size; i++) {
@@ -59,6 +64,8 @@ void TestModel(const std::vector<Place>& valid_places,
     predictor.SaveModel(FLAGS_optimized_model);
   }
 
+  LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " "
+            << FLAGS_H << " " << FLAGS_W;
   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
             << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
@@ -123,8 +130,11 @@ TEST(MobileNetV2, test_arm) {
 #ifdef LITE_WITH_OPENCL
 TEST(MobileNetV2, test_opencl) {
   std::vector<Place> valid_places({
-      Place{TARGET(kOpenCL), PRECISION(kFloat)},
-      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
+      Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
+      TARGET(kARM),  // enable kARM CPU kernel when no opencl kernel
   });
 
   TestModel(valid_places);
diff --git a/lite/kernels/opencl/conv_compute.cc b/lite/kernels/opencl/conv_compute.cc
index a9cfb32aa5..d00101552d 100644
--- a/lite/kernels/opencl/conv_compute.cc
+++ b/lite/kernels/opencl/conv_compute.cc
@@ -362,6 +362,9 @@ void ConvImageCompute::PrepareForRun() {
         filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
 
     impl_ = &ConvImageCompute::Conv2d1x1;
+#if 1  // TODO(ysh329): enable general dwconv
+  } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1]) {
+#else  // TODO(ysh329): remove dwconv3x3s1 and dwconv3x3 temporarily, need fix
   } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
              kernel_h == 3 && kernel_w == 3 && groups > 1) {
     // depth_conv2d_3x3s1, depth_conv2d_3x3
@@ -374,7 +377,7 @@ void ConvImageCompute::PrepareForRun() {
     }
     kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl");
 
-    CLImageConverterDWBlock converter;
+    CLImageConverterNWBlock converter;
     const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
     std::vector<float> filter_image_v(filter_image_dims[0] *
                                       filter_image_dims[1] * 4);  // 4 : RGBA
@@ -382,12 +385,13 @@
     filter_gpu_image_.mutable_data<float, cl::Image2D>(
         filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
   } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
-             kernel_h != 3 && groups > 1) {
+             kernel_h != 3) {
+#endif
     // depth_conv2d
     kernel_func_names_.push_back("depth_conv2d");
     kernel_func_paths_.push_back("image/depthwise_conv2d_basic_kernel.cl");
 
-    CLImageConverterDWBlock converter;
+    CLImageConverterNWBlock converter;
     const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
     std::vector<float> filter_image_v(filter_image_dims[0] *
                                       filter_image_dims[1] * 4);  // 4 : RGBA
-- 
GitLab
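
Note: the new OpenCL Places above prefer the kImageDefault layout, in which an NCHW tensor is packed into an RGBA image of width W * ceil(C/4) and height N * H, with each texel holding four consecutive channels. The sketch below illustrates that packing on the host side; it is a minimal, hypothetical example (the helper name nchw_to_image_default and the use of plain float buffers are assumptions for illustration), not the converter used by the patch (CLImageConverterDefault / CLImageConverterNWBlock).

// Minimal sketch (assumption): pack an NCHW float tensor into an RGBA
// "ImageDefault"-style buffer with image_width = W * ceil(C/4) and
// image_height = N * H, four channel values per texel.
#include <cstddef>
#include <vector>

std::vector<float> nchw_to_image_default(const std::vector<float>& nchw,
                                         int N, int C, int H, int W) {
  const int c_blocks = (C + 3) / 4;  // channels grouped four per RGBA texel
  const std::size_t img_w = static_cast<std::size_t>(c_blocks) * W;
  const std::size_t img_h = static_cast<std::size_t>(N) * H;
  std::vector<float> image(img_w * img_h * 4, 0.f);  // 4 floats per texel
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
          const std::size_t src =
              ((static_cast<std::size_t>(n) * C + c) * H + h) * W + w;
          const std::size_t x = static_cast<std::size_t>(c / 4) * W + w;  // column
          const std::size_t y = static_cast<std::size_t>(n) * H + h;      // row
          image[(y * img_w + x) * 4 + (c % 4)] = nchw[src];
        }
      }
    }
  }
  return image;
}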