diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc
index 085f7f3ad7101a59b8035ac3a8ad8a1e602fb102..bcc9644f81542ab6fb8a0badf8ecaea89fc8dedb 100644
--- a/lite/api/mobilenetv1_test.cc
+++ b/lite/api/mobilenetv1_test.cc
@@ -23,6 +23,10 @@
 #include "lite/core/op_registry.h"
 
 DEFINE_string(optimized_model, "", "optimized_model");
+DEFINE_int32(N, 1, "input_batch");
+DEFINE_int32(C, 3, "input_channel");
+DEFINE_int32(H, 224, "input_height");
+DEFINE_int32(W, 224, "input_width");
 
 namespace paddle {
 namespace lite {
@@ -37,7 +41,8 @@ void TestModel(const std::vector<Place>& valid_places,
   predictor.Build(model_dir, "", "", valid_places);
 
   auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
+  input_tensor->Resize(DDim(
+      std::vector<DDim::value_type>({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W})));
   auto* data = input_tensor->mutable_data<float>();
   auto item_size = input_tensor->dims().production();
   for (int i = 0; i < item_size; i++) {
@@ -58,6 +63,8 @@ void TestModel(const std::vector<Place>& valid_places,
     predictor.SaveModel(FLAGS_optimized_model);
   }
 
+  LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " "
+            << FLAGS_H << " " << FLAGS_W;
   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
             << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc
index 84bd27e352f549d619cfa51f9127f973023e6d45..012d6d48d9e6d3747f83a7f1089944bbaf359f71 100644
--- a/lite/api/mobilenetv2_test.cc
+++ b/lite/api/mobilenetv2_test.cc
@@ -23,6 +23,10 @@
 #include "lite/core/op_registry.h"
 
 DEFINE_string(optimized_model, "", "optimized_model");
+DEFINE_int32(N, 1, "input_batch");
+DEFINE_int32(C, 3, "input_channel");
+DEFINE_int32(H, 224, "input_height");
+DEFINE_int32(W, 224, "input_width");
 
 namespace paddle {
 namespace lite {
@@ -38,7 +42,8 @@ void TestModel(const std::vector<Place>& valid_places,
   predictor.Build(model_dir, "", "", valid_places);
 
   auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
+  input_tensor->Resize(DDim(
+      std::vector<DDim::value_type>({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W})));
   auto* data = input_tensor->mutable_data<float>();
   auto item_size = input_tensor->dims().production();
   for (int i = 0; i < item_size; i++) {
@@ -59,6 +64,8 @@ void TestModel(const std::vector<Place>& valid_places,
     predictor.SaveModel(FLAGS_optimized_model);
   }
 
+  LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " "
+            << FLAGS_H << " " << FLAGS_W;
   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
             << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
@@ -123,8 +130,11 @@ TEST(MobileNetV2, test_arm) {
 #ifdef LITE_WITH_OPENCL
 TEST(MobileNetV2, test_opencl) {
   std::vector<Place> valid_places({
-      Place{TARGET(kOpenCL), PRECISION(kFloat)},
-      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
+      Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
+      TARGET(kARM),  // enable kARM CPU kernel when no opencl kernel
   });
 
   TestModel(valid_places);
diff --git a/lite/kernels/opencl/conv_compute.cc b/lite/kernels/opencl/conv_compute.cc
index a9cfb32aa5b141b4e3b3c7d28d2d3694524fa34c..d00101552d4376bc4ac2a176016c1a9a449c35a7 100644
--- a/lite/kernels/opencl/conv_compute.cc
+++ b/lite/kernels/opencl/conv_compute.cc
@@ -362,6 +362,9 @@ void ConvImageCompute::PrepareForRun() {
         filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
 
     impl_ = &ConvImageCompute::Conv2d1x1;
+#if 1  // TODO(ysh329): enable general dwconv
+  } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1]) {
+#else  // TODO(ysh329): remove dwconv3x3s1 and dwconv3x3 temporarily, need fix
   } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
              kernel_h == 3 && kernel_w == 3 && groups > 1) {
     // depth_conv2d_3x3s1, depth_conv2d_3x3
@@ -374,7 +377,7 @@ void ConvImageCompute::PrepareForRun() {
     }
 
     kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl");
-    CLImageConverterDWBlock converter;
+    CLImageConverterNWBlock converter;
     const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
     std::vector<half_t> filter_image_v(filter_image_dims[0] *
                                        filter_image_dims[1] * 4);  // 4 : RGBA
@@ -382,12 +385,13 @@ void ConvImageCompute::PrepareForRun() {
     filter_gpu_image_.mutable_data<half_t, cl::Image2D>(
         filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
   } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
-             kernel_h != 3 && groups > 1) {
+             kernel_h != 3) {
+#endif
     // depth_conv2d
     kernel_func_names_.push_back("depth_conv2d");
     kernel_func_paths_.push_back("image/depthwise_conv2d_basic_kernel.cl");
 
-    CLImageConverterDWBlock converter;
+    CLImageConverterNWBlock converter;
     const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
     std::vector<half_t> filter_image_v(filter_image_dims[0] *
                                        filter_image_dims[1] * 4);  // 4 : RGBA