未验证 提交 8b90a0c7 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] support fp16 for cl_image_converter, layout, activation all...

[LITE][OPENCL] support fp16 for cl_image_converter, layout, activation all OpenCL image kernel. test=develop (#2964)

* [LITE][OPENCL] support fp16 for cl_image_converter, layout, activation image kernel. test=develop

* add conv, depthwise and UT. test=develop

* add pool, conv, nearest_interp kernel. test=develop

* support fp16 for scale, reshape, concat, fc buffer opencl kernel. test=develop

* refactor for mul opencl buffer kernel. test=develop

* support fp16 for elementwise_mul opecl image kernel. test=develop

* support fp16 for elementwise_mul opencl image kernel. test=develop

* support fp16 for ele_add, fuse_ele_add_act opencl kernel. test=develop

* rename io_copy. test=develop

* mobilenetv1,v2 passed on 855. test=develop

* fix opt for opencl. test=develop
上级 6fcad721
...@@ -81,7 +81,16 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -81,7 +81,16 @@ void TestModel(const std::vector<Place>& valid_places,
auto* out = predictor.GetOutput(0); auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>(); const auto* pdata = out->data<float>();
int step = 50; int step = 50;
#ifdef LITE_WITH_NPU
// Get target and check result
VLOG(1) << "valid_places.size():" << valid_places.size();
for (int i = 0; i < valid_places.size(); ++i) {
auto p = valid_places[i];
VLOG(1) << "valid_places[" << i << "]:" << p.DebugString();
}
auto first_target = valid_places[0].target;
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000); ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1; double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < ref.size(); ++i) {
...@@ -92,7 +101,7 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -92,7 +101,7 @@ void TestModel(const std::vector<Place>& valid_places,
EXPECT_LT(diff, eps); EXPECT_LT(diff, eps);
} }
} }
#else } else {
ASSERT_EQ(out->dims().size(), 2); ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1); ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000); ASSERT_EQ(out->dims()[1], 1000);
...@@ -103,7 +112,34 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -103,7 +112,34 @@ void TestModel(const std::vector<Place>& valid_places,
EXPECT_NEAR(result, ref[i][j], eps); EXPECT_NEAR(result, ref[i][j], eps);
} }
} }
#endif }
// Get detailed result
auto* pred = &predictor;
size_t output_tensor_num = pred->GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor(
std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
// print result
for (int i = 0; i < out_dims.production(); ++i) {
VLOG(2) << "output_tensor->data<float>()[" << i
<< "]:" << output_tensor->data<float>()[i];
}
}
} }
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
...@@ -130,7 +166,7 @@ TEST(MobileNetV1, test_arm) { ...@@ -130,7 +166,7 @@ TEST(MobileNetV1, test_arm) {
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
TEST(MobileNetV1, test_opencl) { TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({ std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
......
...@@ -83,7 +83,16 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -83,7 +83,16 @@ void TestModel(const std::vector<Place>& valid_places,
auto* out = predictor.GetOutput(0); auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>(); const auto* pdata = out->data<float>();
int step = 50; int step = 50;
#ifdef LITE_WITH_NPU
// Get target and check result
VLOG(1) << "valid_places.size():" << valid_places.size();
for (int i = 0; i < valid_places.size(); ++i) {
auto p = valid_places[i];
VLOG(1) << "valid_places[" << i << "]:" << p.DebugString();
}
auto first_target = valid_places[0].target;
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000); ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1; double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < ref.size(); ++i) {
...@@ -94,16 +103,45 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -94,16 +103,45 @@ void TestModel(const std::vector<Place>& valid_places,
EXPECT_LT(diff, eps); EXPECT_LT(diff, eps);
} }
} }
#else } else {
ASSERT_EQ(out->dims().size(), 2); ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1); ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000); ASSERT_EQ(out->dims()[1], 1000);
double eps = 1e-6;
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) { for (int j = 0; j < ref[i].size(); ++j) {
EXPECT_NEAR(pdata[j * step + (out->dims()[1] * i)], ref[i][j], 1e-6); auto result = pdata[j * step + (out->dims()[1] * i)];
EXPECT_NEAR(result, ref[i][j], eps);
}
}
}
// Get detailed result
auto* pred = &predictor;
size_t output_tensor_num = pred->GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor(
std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
// print result
for (int i = 0; i < out_dims.production(); ++i) {
VLOG(2) << "output_tensor->data<float>()[" << i
<< "]:" << output_tensor->data<float>()[i];
} }
} }
#endif
} }
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
...@@ -130,7 +168,7 @@ TEST(MobileNetV2, test_arm) { ...@@ -130,7 +168,7 @@ TEST(MobileNetV2, test_arm) {
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
TEST(MobileNetV2, test_opencl) { TEST(MobileNetV2, test_opencl) {
std::vector<Place> valid_places({ std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
......
...@@ -91,7 +91,7 @@ std::vector<Place> ParserValidPlaces() { ...@@ -91,7 +91,7 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kARM)); valid_places.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") { } else if (target_repr == "opencl") {
valid_places.emplace_back( valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}); Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places.emplace_back( valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back( valid_places.emplace_back(
......
...@@ -6,7 +6,8 @@ lite_cc_library(cl_wrapper SRCS cl_wrapper.cc) ...@@ -6,7 +6,8 @@ lite_cc_library(cl_wrapper SRCS cl_wrapper.cc)
lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper) lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper)
lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility) lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility)
lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime) lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime)
lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor) lite_cc_library(cl_half SRCS cl_half.cc)
lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor cl_half)
lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime) lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime)
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
......
...@@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context, ...@@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context,
int width = cl_image.image_dims()[0]; int width = cl_image.image_dims()[0];
int height = cl_image.image_dims()[1]; int height = cl_image.image_dims()[1];
float* image_data = new float[height * width * 4]; uint16_t* image_data = new uint16_t[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = { cl::array<size_t, 3> region = {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/opencl/cl_half.h"
namespace paddle {
namespace lite {
// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
static const uint32_t mantissatable[2048] = {
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000,
0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000,
0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000,
0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000,
0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000,
0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000,
0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000,
0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000,
0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000,
0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000,
0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000,
0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000,
0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000,
0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000,
0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000,
0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000,
0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000,
0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000,
0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000,
0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000,
0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000,
0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000,
0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000,
0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000,
0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000,
0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000,
0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000,
0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000,
0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000,
0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000,
0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000,
0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000,
0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000,
0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000,
0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000,
0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000,
0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000,
0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000,
0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000,
0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000,
0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000,
0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000,
0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000,
0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000,
0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000,
0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000,
0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000,
0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000,
0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000,
0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000,
0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000,
0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000,
0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000,
0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000,
0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000,
0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000,
0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000,
0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000,
0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000,
0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000,
0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000,
0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000,
0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000,
0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000,
0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000,
0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000,
0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000,
0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000,
0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000,
0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000,
0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000,
0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000,
0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000,
0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000,
0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000,
0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000,
0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000,
0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000,
0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000,
0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000,
0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000,
0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000,
0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000,
0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000,
0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000,
0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000,
0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000,
0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000,
0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000,
0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000,
0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000,
0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000,
0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000,
0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000,
0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000,
0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000,
0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000,
0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000,
0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000,
0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000,
0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000,
0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000,
0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000,
0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000,
0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000,
0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000,
0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000,
0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000,
0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000,
0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000,
0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000,
0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000,
0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000,
0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000,
0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000,
0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000,
0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000,
0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000,
0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000,
0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000,
0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000,
0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000,
0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000,
0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000,
0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000,
0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000,
0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000,
0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000,
0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000,
0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000,
0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000,
0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000,
0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000,
0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000,
0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000,
0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000,
0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000,
0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000,
0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000,
0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000,
0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000,
0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000,
0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000,
0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000,
0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000,
0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000,
0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000,
0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000,
0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000,
0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000,
0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000,
0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000,
0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000,
0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000,
0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000,
0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000,
0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000,
0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000,
0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000,
0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000,
0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000,
0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000,
0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000,
0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000,
0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000,
0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000,
0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000,
0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000,
0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000,
0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000,
0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000,
0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000,
0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000,
0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000,
0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000,
0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000,
0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000,
0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000,
0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000,
0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000,
0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000,
0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000,
0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000,
0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000,
0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000,
0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000,
0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000,
0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000,
0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000,
0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000,
0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000,
0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000,
0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000,
0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000,
0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000,
0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000,
0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000,
0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000,
0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000,
0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000,
0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000,
0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000,
0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000,
0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000,
0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000,
0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000,
0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000,
0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000,
0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000,
0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000,
0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000,
0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000,
0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000,
0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000,
0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000,
0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000,
0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000,
0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000,
0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000,
0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000,
0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000,
0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000,
0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000,
0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000,
0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000,
0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000,
0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000,
0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000,
0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000,
0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000,
0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000,
0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000,
0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000,
0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000,
0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000,
0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000,
0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000,
0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000,
0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000,
0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000,
0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000,
0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000,
0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000,
0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000,
0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000,
0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000,
0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000,
0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000,
0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000,
0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000,
0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000,
0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000,
0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000,
0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000,
0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000,
0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000,
0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000,
0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000,
0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000,
0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000,
0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000,
0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000,
0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000,
0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000,
0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000,
0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000,
0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000,
0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000,
0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000,
0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000,
0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000,
0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000,
0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000,
0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000,
0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000,
0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000,
0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000,
0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000,
0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000,
0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000,
0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000,
0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000,
0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000,
0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000,
0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000,
0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000,
0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000,
0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000,
0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000,
0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000,
0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000,
0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000,
0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000,
0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000,
0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000,
0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000,
0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000,
0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000,
0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000,
0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000,
0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000,
0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000,
0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000,
0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000,
0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000,
0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000,
0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000,
0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000,
0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000,
0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000,
0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000,
0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000,
0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000,
0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000,
0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000,
0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000,
0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000,
0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000,
0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000,
0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000,
0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000,
0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000,
0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000,
0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000,
0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000,
0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000,
0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000,
0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000,
0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000,
0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000,
0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000,
0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000,
0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000,
0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000,
0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000,
0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000,
0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000,
0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000,
0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000,
0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000,
0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000,
0x387fc000, 0x387fe000};
// Maps the top 6 bits of a half value (sign + 5-bit exponent, i.e. `h >> 10`
// as used in Half2Float) to an offset added to the half's 10-bit mantissa
// before indexing mantissatable. Indices 0 and 32 -- a zero exponent field
// for either sign -- select the table's leading segment (offset 0), while all
// other exponents share the segment at offset 0x0400. Layout follows the
// classic table-based fp16->fp32 scheme (van der Zijp, "Fast Half Float
// Conversions") -- presumably the zero-exponent segment handles +/-0 and
// subnormals; confirm against the full mantissatable above.
static const uint16_t offsettable[64] = {
    0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
// Indexed by the half's sign+exponent field (`h >> 10` in Half2Float); the
// entry is added to the mantissatable value to rebuild the float's sign and
// exponent bits. The first 32 entries cover positive halves, the last 32
// negative ones (note the 0x80000000 sign bit from index 32 onward).
// Indices 31 and 63 (all-ones half exponent) map to 0x47800000/0xc7800000,
// which re-biases the Inf/NaN exponent into float form.
static const uint32_t exponenttable[64] = {
    0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000,
    0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000,
    0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000,
    0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000,
    0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000,
    0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000,
    0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000,
    0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
    0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000,
    0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000,
    0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000};
// Indexed by the float's sign + 8-bit exponent (`(v >> 23) & 0x1ff` in
// Float2Half); the entry supplies the half result's sign/exponent base, onto
// which the shifted mantissa is added. Tiny exponents collapse to 0x0000 /
// 0x8000 (signed zero / subnormal range), mid-range exponents step through
// the representable half exponents, and large exponents saturate at
// 0x7c00 / 0xfc00 -- the IEEE-754 half +/-Inf bit patterns.
static const uint16_t basetable[512] = {
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010,
    0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000,
    0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400,
    0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800,
    0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001,
    0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200,
    0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400,
    0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800,
    0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00,
    0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00};
// Companion to basetable, indexed the same way (`(v >> 23) & 0x1ff` in
// Float2Half): how far to shift the float's 23-bit mantissa right to fit the
// half's 10-bit mantissa. 0x0d (13) covers the normal range (23 - 10 bits);
// values 0x17..0x0e handle the subnormal transition; 0x18 (24) discards the
// mantissa entirely (underflow-to-zero and overflow-to-Inf cases). The lone
// 0x0d at index 255 (and 511) is the float Inf/NaN exponent -- presumably it
// keeps top mantissa bits so NaN payloads survive; confirm against the
// table-based conversion reference.
static const uint8_t shifttable[512] = {
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13,
    0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17,
    0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};
// Converts one fp32 value to its fp16 (half) bit pattern via the
// basetable/shifttable lookup scheme: the float's sign+exponent field picks
// the half's base bits and the shift needed to narrow the 23-bit mantissa.
// NOTE(review): dereferencing a reinterpret_cast'ed pointer technically
// violates strict aliasing; std::memcpy (or C++20 std::bit_cast) would be
// the conforming alternative -- confirm build flags before relying on this.
half_t Float2Half(float f) {
  const uint32_t bits = *reinterpret_cast<uint32_t *>(&f);
  const uint32_t exp_index = (bits >> 23) & 0x1ff;  // sign + 8 exponent bits
  const uint32_t mantissa = bits & 0x007fffff;
  return basetable[exp_index] + (mantissa >> shifttable[exp_index]);
}
// Converts one fp16 (half) bit pattern back to fp32: the half's
// sign+exponent field selects a mantissatable segment (via offsettable) and
// an exponent correction (exponenttable); their sum is the float's bits.
float Half2Float(half_t h) {
  const uint32_t exp_index = h >> 10;  // sign + 5 exponent bits
  const uint32_t mantissa_index = offsettable[exp_index] + (h & 0x3ff);
  uint32_t bits = mantissatable[mantissa_index] + exponenttable[exp_index];
  // NOTE(review): same strict-aliasing caveat as Float2Half.
  return *reinterpret_cast<float *>(&bits);
}
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) {
for (int i = 0; i < count; ++i) {
h_array[i] = Float2Half(f_array[i]);
}
}
// Converts `count` fp16 bit patterns from h_array into fp32 values in
// f_array, element by element. A non-positive count converts nothing.
// The two buffers are assumed not to overlap -- TODO confirm with callers.
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
  half_t *src = h_array;
  float *dst = f_array;
  for (int remaining = count; remaining > 0; --remaining) {
    *dst++ = Half2Float(*src++);
  }
}
} // namespace lite
} // namespace paddle
...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cl_common.h> #pragma once
#include <cstdint>
__kernel void relu(__read_only image2d_t input, namespace paddle {
__write_only image2d_t output) { namespace lite {
const int x = get_global_id(0); // image_width typedef uint16_t half_t;
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | half_t Float2Half(float f);
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); float Half2Float(half_t h);
in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
}
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace lite
} // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "lite/backends/opencl/cl_image.h" #include "lite/backends/opencl/cl_image.h"
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#include "lite/backends/opencl/cl_utility.h" #include "lite/backends/opencl/cl_utility.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -24,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { ...@@ -24,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
int width = cl_image.image_dims_[0]; int width = cl_image.image_dims_[0];
int height = cl_image.image_dims_[1]; int height = cl_image.image_dims_[1];
float* image_data = new float[height * width * 4]; uint16_t* image_data = new uint16_t[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
...@@ -123,7 +124,7 @@ void CLImage::InitCLImage(const cl::Context& context, ...@@ -123,7 +124,7 @@ void CLImage::InitCLImage(const cl::Context& context,
VLOG(3) << " begin init cl image "; VLOG(3) << " begin init cl image ";
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
float* image_data = new float[image_dims_.production() * 4]; uint16_t* image_data = new uint16_t[image_dims_.production() * 4];
VLOG(3) << " convert to image "; VLOG(3) << " convert to image ";
converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_); converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_);
......
...@@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterDefault::NCHWToImage(float *nchw, void CLImageConverterDefault::NCHWToImage(float *nchw,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) { for (size_t j = 0; j < tensor_dim.size(); ++j) {
...@@ -69,7 +69,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -69,7 +69,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
if (c < C) { if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4); // (c % 4);
image[i2] = *p; image[i2] = Float2Half(*p);
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
...@@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
} }
} }
void CLImageConverterDefault::ImageToNCHW(float *image, void CLImageConverterDefault::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image, ...@@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
*p = image[i2]; *p = Half2Float(image[i2]);
i2 += 4; i2 += 4;
p++; p++;
} }
...@@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterFolder::NCHWToImage(float *tensor, void CLImageConverterFolder::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!"; << " Tensor dim is not support!";
...@@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, ...@@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor,
for (size_t h = 0; h < tdim[0]; h++) { for (size_t h = 0; h < tdim[0]; h++) {
for (size_t w = 0; w < tdim[1]; w++) { for (size_t w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
} }
} }
} }
} }
void CLImageConverterFolder::ImageToNCHW(float *image, void CLImageConverterFolder::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image, ...@@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
} }
} }
} }
...@@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterNWBlock::NCHWToImage(float *tensor, void CLImageConverterNWBlock::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
auto image_dim = InitImageDimInfoWith(tensor_dim); auto image_dim = InitImageDimInfoWith(tensor_dim);
...@@ -257,7 +258,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -257,7 +258,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4; w * 4 + n % 4;
if (n < N) { if (n < N) {
image[index] = *p; image[index] = Float2Half(*p);
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = 0.0;
...@@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
VLOG(3) << " init done"; VLOG(3) << " init done";
} }
void CLImageConverterNWBlock::ImageToNCHW(float *image, void CLImageConverterNWBlock::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image, ...@@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image,
for (size_t w = 0; w < W; ++w) { for (size_t w = 0; w < W; ++w) {
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4; w * 4 + n % 4;
*p = image[index]; *p = Half2Float(image[index]);
p++; p++;
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
...@@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterDWBlock::NCHWToImage(float *tensor, void CLImageConverterDWBlock::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) { for (size_t j = 0; j < tensor_dim.size(); ++j) {
...@@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, ...@@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
if (c < C) { if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4); // (c % 4);
image[i2] = *p; image[i2] = Float2Half(*p);
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
...@@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, ...@@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
} }
} }
void CLImageConverterDWBlock::ImageToNCHW(float *image, void CLImageConverterDWBlock::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image, ...@@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
*p = image[i2]; *p = Half2Float(image[i2]);
i2 += 4; i2 += 4;
p++; p++;
} }
...@@ -418,7 +419,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -418,7 +419,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterNormal::NCHWToImage(float *tensor, void CLImageConverterNormal::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!"; << " Tensor dim is not support!";
...@@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, ...@@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor,
default_converter.NCHWToImage(tensor, image, tensor_dim); default_converter.NCHWToImage(tensor, image, tensor_dim);
} }
void CLImageConverterNormal::ImageToNCHW(float *image, void CLImageConverterNormal::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( ...@@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
} }
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) {} const DDim &tensor_dim) {}
void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) {} const DDim &tensor_dim) {}
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "lite/backends/opencl/cl_half.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
namespace paddle { namespace paddle {
...@@ -24,10 +25,10 @@ class CLImageConverterBase { ...@@ -24,10 +25,10 @@ class CLImageConverterBase {
virtual ~CLImageConverterBase() {} virtual ~CLImageConverterBase() {}
virtual void NCHWToImage(float *nchw, virtual void NCHWToImage(float *nchw,
float *image, half_t *image,
const DDim &tensor_dim) = 0; const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(float *image, virtual void ImageToNCHW(half_t *image,
float *nchw, float *nchw,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) = 0; const DDim &tensor_dim) = 0;
...@@ -37,8 +38,8 @@ class CLImageConverterBase { ...@@ -37,8 +38,8 @@ class CLImageConverterBase {
class CLImageConverterDefault : public CLImageConverterBase { class CLImageConverterDefault : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override; void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase { ...@@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase { ...@@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase { ...@@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase { ...@@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase { ...@@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
// #define DEBUG
// buffer -> image2d // buffer -> image2d
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
...@@ -27,6 +28,7 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -27,6 +28,7 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H; const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H; const int out_h = out_nh % out_H;
...@@ -47,20 +49,83 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -47,20 +49,83 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
output_pos.x = out_c * out_W + out_w; output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh; output_pos.y = out_nh;
CL_DTYPE4 output = (CL_DTYPE4)0.0f; CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f);
output.x = convert_float(in[input_pos0]); output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE);
if(out_C - 4 * out_c >= 2){
output.y = convert_float(in[input_pos1]); if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE);
} }
if(out_C - 4 * out_c >= 3){ if (out_C - 4 * out_c >= 3) {
output.z = convert_float(in[input_pos2]); output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE);
} }
if(out_C - 4 * out_c >= 4){ if (out_C - 4 * out_c >= 4) {
output.w = convert_float(in[input_pos3]); output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE);
}
#ifdef DEBUG
if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n",
out_w,
out_C - 4 * out_c,
(float)(in[input_pos0]),
(float)(in[input_pos1]),
(float)(in[input_pos2]),
(float)(in[input_pos3]));
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh,
output_pos.x, output_pos.y,
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w));
} }
write_imagef(output_image, output_pos, output); #endif
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
// image2d -> buffer
__kernel void image2d_to_buffer(__read_only image2d_t input,
__private const int in_width,
__private const int in_height,
__global CL_DTYPE* out,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__private const int C) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG
if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh,
pos_x, in_nh,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w));
}
#endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
}
if(C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
}
if(C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
}
}
#if 0
// buffer -> image2d_nw // buffer -> image2d_nw
__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
...@@ -97,55 +162,23 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -97,55 +162,23 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
output_pos.y = out_ch; output_pos.y = out_ch;
CL_DTYPE4 output = (CL_DTYPE4)0.0f; CL_DTYPE4 output = (CL_DTYPE4)0.0f;
output.x = convert_float(in[input_pos0]); output.x = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos0]);
if (out_N - 4 * out_n >= 2) { if (out_N - 4 * out_n >= 2) {
output.y = convert_float(in[input_pos1]); output.y = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos1]);
} }
if (out_N - 4 * out_n >= 3) { if (out_N - 4 * out_n >= 3) {
output.z = convert_float(in[input_pos2]); output.z = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos2]);
} }
if (out_N - 4 * out_n >= 4) { if (out_N - 4 * out_n >= 4) {
output.w = convert_float(in[input_pos3]); output.w = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos3]);
} }
write_imagef(output_image, output_pos, output);
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
// image2d -> buffer
__kernel void image2d_to_buffer(__read_only image2d_t input,
__private const int in_width,
__private const int in_height,
__global CL_DTYPE* out,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__private const int C) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
CL_DTYPE4 in = read_imagef(input, sampler, (int2)(pos_x, in_nh));
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_float(in.x);
if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_float(in.y);
}
if(C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_float(in.z);
}
if(C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_float(in.w);
}
} }
#endif
#if 0
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -157,11 +190,12 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -157,11 +190,12 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = read_imagef(input, sampler, (int2)(in_w, in_h)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(in_w, in_h));
const int index = (in_h * in_width + in_w) * 4; const int index = (in_h * in_width + in_w) * 4;
out[index] = convert_float(in.x); out[index] = CONVERT_TYPE_TO(CL_DTYPE, in.x);
out[index + 1] = convert_float(in.y); out[index + 1] = CONVERT_TYPE_TO(CL_DTYPE, in.y);
out[index + 2] = convert_float(in.z); out[index + 2] = CONVERT_TYPE_TO(CL_DTYPE, in.z);
out[index + 3] = convert_float(in.w); out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w);
} }
#endif
...@@ -29,11 +29,15 @@ limitations under the License. */ ...@@ -29,11 +29,15 @@ limitations under the License. */
#ifdef CL_DTYPE_float #ifdef CL_DTYPE_float
#define CL_DTYPE float #define CL_DTYPE float
#define CL_DTYPE_CHAR f #define CL_DTYPE_CHAR f
#define CL_COMPUTE_DTYPE half
#define CL_COMPUTE_DTYPE_CHAR h
#endif #endif
#ifdef CL_DTYPE_half #ifdef CL_DTYPE_half
#define CL_DTYPE half #define CL_DTYPE half
#define CL_DTYPE_CHAR h #define CL_DTYPE_CHAR h
#define CL_COMPUTE_DTYPE half
#define CL_COMPUTE_DTYPE_CHAR h
#endif #endif
///////////////////////////////// /////////////////////////////////
...@@ -43,6 +47,7 @@ limitations under the License. */ ...@@ -43,6 +47,7 @@ limitations under the License. */
#define GET_VEC_TYPE(type__, size__) type__##size__ #define GET_VEC_TYPE(type__, size__) type__##size__
#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__) #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4) #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
#define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4)
///////////////////////////////// /////////////////////////////////
// CONVERT_TYPE_TO // CONVERT_TYPE_TO
......
...@@ -14,6 +14,23 @@ limitations under the License. */ ...@@ -14,6 +14,23 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
// Element-wise ReLU over a 2D image: out(x, y) = max(0, in(x, y)).
// Each work-item processes one RGBA texel (4 packed channel values).
__kernel void relu(__read_only image2d_t input,
                   __write_only image2d_t output) {
  const int col = get_global_id(0);  // image_width
  const int row = get_global_id(1);  // image_height

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 pos = (int2)(col, row);
  CL_DTYPE4 value = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, pos);
  value = max((CL_DTYPE4)(0.0f), value);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, pos, value);
}
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold){ __private const float threshold){
...@@ -30,3 +47,19 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -30,3 +47,19 @@ __kernel void relu6(__read_only image2d_t input,
in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
// Element-wise sigmoid over a 2D image: out = 1 / (1 + exp(-in)).
// Each work-item processes one RGBA texel (4 packed channel values).
__kernel void sigmoid(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int col = get_global_id(0);  // image_width
  const int row = get_global_id(1);  // image_height

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 pos = (int2)(col, row);
  CL_DTYPE4 value = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, pos);
  CL_DTYPE4 result = 1 / (1 + exp(-value));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, pos, result);
}
...@@ -12,19 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #include <cl_common.h>
__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
__private const float scale_h, __private const float scale_w,
__private const int in_dims_h, __private const int out_dims_h, __kernel void nearest_interp(__read_only image2d_t input,
__private const int in_dims_w, __private const int out_dims_w) { __write_only image2d_t output,
__private const float scale_h,
__private const float scale_w,
__private const int in_dims_h,
__private const int out_dims_h,
__private const int in_dims_w,
__private const int out_dims_w) {
const int c = get_global_id(0); const int c = get_global_id(0);
const int w = get_global_id(1); const int w = get_global_id(1);
const int nh = get_global_id(2); const int nh = get_global_id(2);
int2 output_pos; int2 output_pos;
output_pos.x = c * out_dims_w + w; output_pos.x = c * out_dims_w + w;
output_pos.y = nh; output_pos.y = nh;
int out_n = nh / out_dims_h; int out_n = nh / out_dims_h;
int out_h = nh % out_dims_h; int out_h = nh % out_dims_h;
int2 input_pos; int2 input_pos;
input_pos.x = c * in_dims_w + w / scale_w; input_pos.x = c * in_dims_w + w / scale_w;
input_pos.y = out_n * in_dims_h + out_h / scale_h; input_pos.y = out_n * in_dims_h + out_h / scale_h;
...@@ -32,6 +42,7 @@ __kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t ...@@ -32,6 +42,7 @@ __kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST; CLK_FILTER_NEAREST;
half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y)); CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(input_pos.x, input_pos.y));
write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(output_pos.x , output_pos.y), input_data);
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// Sigmoid activation on an image2d tensor: out = 1 / (1 + exp(-in)).
// One work-item per texel; x indexes image width, y indexes image height.
__kernel void sigmoid(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int w_idx = get_global_id(0);  // image_width
  const int h_idx = get_global_id(1);  // image_height

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(w_idx, h_idx));
  CL_DTYPE4 out = 1 / (1 + exp(-in));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(w_idx, h_idx), out);
}
...@@ -81,8 +81,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width, ...@@ -81,8 +81,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
return cl_image; return cl_image;
} }
template <> // use int16_t represents half float template <> // use uint16_t represents half float
void *TargetWrapperCL::MallocImage<int16_t>(const size_t cl_image2d_width, void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
const size_t cl_image2d_height, const size_t cl_image2d_height,
void *host_ptr) { void *host_ptr) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16))); cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16)));
......
...@@ -178,5 +178,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector<Place>& valid_places) { ...@@ -178,5 +178,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector<Place>& valid_places) {
REGISTER_MIR_PASS(type_precision_cast_pass, REGISTER_MIR_PASS(type_precision_cast_pass,
paddle::lite::mir::PrecisionCastPass) paddle::lite::mir::PrecisionCastPass)
.BindTargets({TARGET(kAny)}) .BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kOpenCL)})
.BindKernel("calib_once") .BindKernel("calib_once")
.BindKernel("calib"); .BindKernel("calib");
...@@ -103,8 +103,8 @@ const cl::Image2D *TensorLite::data<float, cl::Image2D>() const { ...@@ -103,8 +103,8 @@ const cl::Image2D *TensorLite::data<float, cl::Image2D>() const {
return static_cast<const cl::Image2D *>(buffer_->data()); return static_cast<const cl::Image2D *>(buffer_->data());
} }
template <> // use int16_t represent half float template <> // use uint16_t represent half float
const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const { const cl::Image2D *TensorLite::data<uint16_t, cl::Image2D>() const {
if (nullptr == buffer_->data()) return nullptr; if (nullptr == buffer_->data()) return nullptr;
return static_cast<const cl::Image2D *>(buffer_->data()); return static_cast<const cl::Image2D *>(buffer_->data());
} }
......
...@@ -260,8 +260,8 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) { ...@@ -260,8 +260,8 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) {
template <> template <>
const cl::Image2D *TensorLite::data<float, cl::Image2D>() const; const cl::Image2D *TensorLite::data<float, cl::Image2D>() const;
template <> // use int16_t represent half float template <> // use uint16_t represent half float
const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const; const cl::Image2D *TensorLite::data<uint16_t, cl::Image2D>() const;
#endif #endif
} // namespace lite } // namespace lite
......
...@@ -4,91 +4,136 @@ endif() ...@@ -4,91 +4,136 @@ endif()
set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter) set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter)
add_kernel(fc_opencl OPENCL basic SRCS fc_compute.cc DEPS ${cl_kernel_deps}) #####################
add_kernel(mul_opencl OPENCL basic SRCS mul_compute.cc DEPS ${cl_kernel_deps}) # image kernel #
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps}) #####################
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_compute.cc DEPS ${cl_kernel_deps}) # basic
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl add_kernel(fusion_elementwise_add_activation_opencl
OPENCL basic SRCS fusion_elementwise_add_activation_compute.cc OPENCL basic SRCS fusion_elementwise_add_activation_image_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps}) DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps})
add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps}) add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(sigmoid_opencl OPENCL basic SRCS sigmoid_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_compute.cc DEPS ${cl_kernel_deps}) add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps}) add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_compute.cc DEPS ${cl_kernel_deps}) add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc # extra
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context # wait to add ...
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_elementwise_mul_opencl SRCS elementwise_mul_compute_test.cc
DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_pool_opencl SRCS pool_compute_test.cc
DEPS pool_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc
DEPS fc_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
# TODO(ysh329): comment for buffer-impl mul ######################
#lite_cc_test(test_mul_opencl SRCS mul_compute_test.cc # image kernel test #
# DEPS mul_opencl op_registry program context ######################
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) lite_cc_test(test_activation_image_opencl SRCS activation_image_compute_test.cc
DEPS activation_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc lite_cc_test(test_conv_image_opencl SRCS conv_image_compute_test.cc
DEPS io_copy_compute_opencl op_registry program context DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#TODO(ysh329): comment buffer-impl relu lite_cc_test(test_depthwise_conv2d_image_opencl SRCS depthwise_conv2d_image_compute_test.cc
lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc DEPS conv_opencl op_registry program context
DEPS relu_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_sigmoid_opencl SRCS sigmoid_compute_test.cc lite_cc_test(test_nearest_interp_image_opencl SRCS nearest_interp_image_compute_test.cc
DEPS sigmoid_opencl layout_opencl op_registry program context DEPS nearest_interp_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc lite_cc_test(test_pool_image_opencl SRCS pool_image_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context DEPS pool_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_image2d_opencl SRCS depthwise_conv2d_image2d_compute_test.cc lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc
DEPS conv_opencl op_registry program context DEPS scale_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc
DEPS reshape_opencl op_registry program context DEPS reshape_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
DEPS conv_opencl op_registry program context DEPS concat_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_compute_test.cc
DEPS conv_opencl op_registry program context cl_image_converter DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context cl_image_converter DEPS layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_concat_opencl SRCS concat_compute_test.cc lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter ######################
# buffer kernel #
######################
# basic
#add_kernel(activation_opencl OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl
# OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc
# DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
# extra
# wait to add ...
######################
# buffer kernel test #
######################
#lite_cc_test(test_activation_buffer_opencl SRCS activation_buffer_compute_test.cc
# DEPS activation_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
# DEPS conv_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
# DEPS depthwise_conv2d_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_pool_buffer_opencl SRCS pool_buffer_compute_test.cc
# DEPS pool_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_concat_buffer_opencl SRCS concat_buffer_compute_test.cc
# DEPS concat_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
DEPS fc_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_scale_opencl SRCS scale_compute_test.cc lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
DEPS scale_opencl op_registry program context DEPS mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_elementwise_add_buffer_opencl SRCS elementwise_add__buffer_compute_test.cc
# DEPS elementwise_add_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc
DEPS io_copy_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Buffer-based ReLU kernel for the OpenCL backend (kFloat, NCHW).
// Launches "relu" from buffer/relu_kernel.cl with one work-item per element.
class ReluCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override { return "Relu using cl::Buffer, kFloat"; }

  // Compile/register the OpenCL program once before the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& input_dims = param.X->dims();
    const size_t element_count = input_dims.production();

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto* input_buf = param.X->data<float, cl::Buffer>();
    auto* output_buf =
        param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    // Compiled kernels are cached under "<function name><build options>".
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());

    // Kernel signature: (input buffer, element count, output buffer).
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *input_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, (const int)element_count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *output_buf);
    CL_CHECK_FATAL(status);

    // One work-item per element; let the driver pick the local size.
    auto global_work_size = cl::NDRange{element_count};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Publish the completion event so consumers of the output can wait on it.
    context.cl_wait_list()->emplace(output_buf, event_);
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// Buffer-based sigmoid kernel for the OpenCL backend (kFloat, NCHW).
// Launches "sigmoid" from buffer/sigmoid_kernel.cl, one work-item per element.
class SigmoidCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Sigmoid using cl::Buffer, kFloat";
  }

  // Compile/register the OpenCL program once before the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& input_dims = param.X->dims();
    const size_t element_count = input_dims.production();

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto* input_buf = param.X->data<float, cl::Buffer>();
    auto* output_buf =
        param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    // Compiled kernels are cached under "<function name><build options>".
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());

    // Kernel signature: (input buffer, element count, output buffer).
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *input_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, (const int)element_count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *output_buf);
    CL_CHECK_FATAL(status);

    // One work-item per element; let the driver pick the local size.
    auto global_work_size = cl::NDRange{element_count};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Publish the completion event so consumers of the output can wait on it.
    context.cl_wait_list()->emplace(output_buf, event_);
  }

 private:
  std::string kernel_func_name_{"sigmoid"};
  std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Relu
// Registers the buffer-based ReLU kernel above for the OpenCL target
// (kFloat precision, NCHW layout); both X and Out are tensors living in
// kOpenCL (cl::Buffer) memory.
REGISTER_LITE_KERNEL(relu,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::opencl::ReluCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
// Sigmoid
// Registers the buffer-based sigmoid kernel above for the OpenCL target
// (kFloat precision, NCHW layout); both X and Out are tensors living in
// kOpenCL (cl::Buffer) memory.
REGISTER_LITE_KERNEL(sigmoid,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::opencl::SigmoidCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();  // fix: the original dropped the terminating semicolon
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>

#include <cmath>
#include <random>

#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Reference CPU implementation of ReLU / clipped ReLU used to validate the
// OpenCL kernel output.
//
//   threshold == 0 : plain ReLU,    out = max(x, 0)
//   threshold != 0 : clipped ReLU,  out = min(max(x, 0), threshold)
template <typename dtype>
void relu_compute_ref(const dtype *x_data,
                      const DDim &x_dim,
                      dtype *out_data,
                      float threshold = 0.f) {
  // fix: the unqualified abs() previously used here may bind to the C
  // integer overload (implementation-dependent), truncating the float
  // threshold; std::fabs keeps the comparison in floating point.
  if (std::fabs(threshold) < 1e-5) {
    // relu
    for (int i = 0; i < x_dim.production(); ++i) {
      out_data[i] = (x_data[i] > threshold) ? x_data[i] : threshold;
    }
  } else {
    // relu6 or relu with threshold
    for (int i = 0; i < x_dim.production(); ++i) {
      auto out_tmp = (x_data[i] > 0) ? x_data[i] : 0;
      out_data[i] = (out_tmp < threshold) ? out_tmp : threshold;
    }
  }
}
// Reference CPU sigmoid used to validate the OpenCL kernel output:
// out[i] = 1 / (1 + e^{-x[i]}), element-wise.
template <typename dtype>
void sigmoid_compute_ref(const dtype *x_data,
                         const DDim &x_dim,
                         dtype *out_data) {
  const int total = x_dim.production();
  for (int idx = 0; idx < total; ++idx) {
    out_data[idx] = 1 / (1 + expf(-x_data[idx]));
  }
}
// End-to-end check of the buffer-based OpenCL "relu" kernel: fill a mapped
// cl::Buffer with random floats, launch the kernel through the registry,
// wait on its completion event, and compare against relu_compute_ref.
TEST(opencl_relu_buffer, compute) {
  // prepare data
  const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
  lite::Tensor x, out;
  x.Resize(x_dim);
  out.Resize(x_dim);

  // Map the device buffer into host memory and fill with uniform [-10, 10).
  auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x = static_cast<float *>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ActivationParam param;
  param.X = &x;
  param.Out = &out;

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  // Look up the registered OpenCL kFloat/NCHW relu kernel.
  auto kernels = KernelRegistry::Global().Create(
      "relu", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());

  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> relu_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(relu_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(relu_context));

  kernel->Launch();

  // The kernel enqueues asynchronously; block on the event it registered
  // for the output buffer before reading results.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
  relu_compute_ref<float>(mapped_x, x_dim, out_ref.get());

  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x_data, mapped_x);
}
// End-to-end check of the buffer-based OpenCL "sigmoid" kernel: fill a
// mapped cl::Buffer with random floats, launch the kernel through the
// registry, wait on its completion event, and compare against
// sigmoid_compute_ref.
TEST(opencl_sigmoid_buffer, compute) {
  // prepare data
  const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
  lite::Tensor x, out;
  x.Resize(x_dim);
  out.Resize(x_dim);

  // Map the device buffer into host memory and fill with uniform [-10, 10).
  auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x = static_cast<float *>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ActivationParam param;
  param.X = &x;
  param.Out = &out;

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  // Look up the registered OpenCL kFloat/NCHW sigmoid kernel.
  auto kernels = KernelRegistry::Global().Create(
      "sigmoid", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());

  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> sigmoid_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(sigmoid_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(sigmoid_context));

  kernel->Launch();

  // The kernel enqueues asynchronously; block on the event it registered
  // for the output buffer before reading results.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
  sigmoid_compute_ref<float>(mapped_x, x_dim, out_ref.get());

  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x_data, mapped_x);
}
} // namespace lite
} // namespace paddle
// Force-link the buffer-layout OpenCL kernels tested above so their static
// registrars are not dropped by the linker from this test binary.
// sigmoid buffer
USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kNCHW, def);
// relu buffer
USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
...@@ -24,44 +24,55 @@ namespace lite { ...@@ -24,44 +24,55 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class SigmoidCompute class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> { PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ActivationParam; using param_t = operators::ActivationParam;
std::string doc() const override { std::string doc() const override {
return "Sigmoid using cl::Buffer, kFloat"; return "Relu using cl::Image2D(ImageDefault/RGBA), kFP16";
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
size_t count = x_dims.production(); auto* x_buf = param.X->data<uint16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
auto* x_buf = param.X->data<float, cl::Buffer>();
auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
int arg_idx = 0; int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf); cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)count);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{count}; VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
...@@ -70,40 +81,42 @@ class SigmoidCompute ...@@ -70,40 +81,42 @@ class SigmoidCompute
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_); // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
} }
private: private:
std::string kernel_func_name_{"sigmoid"}; std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"}; std::string build_options_{"-DCL_DTYPE_half -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class SigmoidComputeFloatImageDefault class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
: public KernelLite<TARGET(kOpenCL), PRECISION(kFP16),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ActivationParam; using param_t = operators::ActivationParam;
std::string doc() const override { std::string doc() const override {
return "Sigmoid using cl::Image2D(ImageDefault/RGBA), kFloat"; return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<float, cl::Image2D>(); auto* x_buf = param.X->data<uint16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<float, cl::Image2D>( auto* out_buf = param.Out->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only const auto& y_dims = param.Out->dims(); // useless: check dim only
auto threshold = param.Relu_clipped_coef;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
...@@ -116,6 +129,8 @@ class SigmoidComputeFloatImageDefault ...@@ -116,6 +129,8 @@ class SigmoidComputeFloatImageDefault
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << TargetToStr(param.Out->target());
...@@ -125,6 +140,7 @@ class SigmoidComputeFloatImageDefault ...@@ -125,6 +140,7 @@ class SigmoidComputeFloatImageDefault
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3]; << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "threshold:" << threshold;
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]), cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
...@@ -143,12 +159,12 @@ class SigmoidComputeFloatImageDefault ...@@ -143,12 +159,12 @@ class SigmoidComputeFloatImageDefault
} }
private: private:
std::string kernel_func_name_{"sigmoid"}; std::string kernel_func_name_{"relu6"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"}; std::string build_options_{"-DCL_DTYPE_half -DRELU6"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class SigmoidComputeFP16ImageDefault class SigmoidComputeImageDefault
: public KernelLite<TARGET(kOpenCL), : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
...@@ -162,18 +178,18 @@ class SigmoidComputeFP16ImageDefault ...@@ -162,18 +178,18 @@ class SigmoidComputeFP16ImageDefault
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = auto* x_buf =
param.X->data<int16_t, param.X->data<uint16_t,
cl::Image2D>(); // use int16_t represents half float cl::Image2D>(); // use uint16_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = auto* out_buf =
param.Out->mutable_data<int16_t, cl::Image2D>( // use int16_t param.Out->mutable_data<uint16_t, cl::Image2D>( // use uint16_t
// represents half float // represents half float
image_shape["width"], image_shape["width"],
image_shape["height"]); image_shape["height"]);
...@@ -227,39 +243,46 @@ class SigmoidComputeFP16ImageDefault ...@@ -227,39 +243,46 @@ class SigmoidComputeFP16ImageDefault
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(sigmoid, // Relu
// kOpenCL, REGISTER_LITE_KERNEL(relu,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::SigmoidCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(
sigmoid,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFloatImageDefault, paddle::lite::kernels::opencl::ReluComputeImageDefault,
ImageDefault) ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Relu6
REGISTER_LITE_KERNEL(relu6,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::Relu6ComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL( // Sigmoid
sigmoid, REGISTER_LITE_KERNEL(sigmoid,
kOpenCL, kOpenCL,
kFP16, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFP16ImageDefault, paddle::lite::kernels::opencl::SigmoidComputeImageDefault,
ImageDefault) ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
......
...@@ -41,224 +41,17 @@ void relu_compute_ref(const dtype *x_data, ...@@ -41,224 +41,17 @@ void relu_compute_ref(const dtype *x_data,
} }
} }
#if 0 // relu_buffer template <typename dtype>
TEST(opencl_relu_buffer, compute) { void sigmoid_compute_ref(const dtype *x_data,
// prepare data const DDim &x_dim,
const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10}); dtype *out_data) {
lite::Tensor x, out;
x.Resize(x_dim);
out.Resize(x_dim);
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-10, 10);
auto *mapped_x = static_cast<float *>(
TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
mapped_x[i] = dist(engine);
}
// set param and kernel, then run
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
auto kernels = KernelRegistry::Global().Create(
"relu", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
kernel->SetParam(param);
std::unique_ptr<KernelContext> relu_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(relu_context->As<OpenCLContext>()));
kernel->SetContext(std::move(relu_context));
kernel->Launch();
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = param.Out->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
// run compute ref and check
std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
relu_compute_ref<float>(mapped_x, x_dim, out_ref.get());
auto *out_data = out.mutable_data<float, cl::Buffer>();
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
}
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x_data, mapped_x);
}
#endif // relu_buffer
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(relu_image2d_fp32, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
#endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto relu_img_kernels =
KernelRegistry::Global().Create("relu",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(relu_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto relu_img_kernel = std::move(relu_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> relu_in
// relu(img): relu_in -> relu_out
// layout(img->buf): relu_out -> y
lite::Tensor x, y, relu_in, relu_out, y_ref;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &relu_in;
ImageToBufferParam.x = &relu_out;
ImageToBufferParam.y = &y;
operators::ActivationParam ReluParam;
ReluParam.X = &relu_in;
ReluParam.Out = &relu_out;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim);
y.Resize(x_dim);
relu_in.Resize(x_dim);
relu_out.Resize(x_dim);
y_ref.Resize(x_dim);
auto relu_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) { for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2; out_data[i] = 1 / (1 + expf(-x_data[i]));
mapped_y[i] = static_cast<int>(0);
}
auto *relu_in_data = relu_in.mutable_data<float, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
auto *relu_out_data = relu_out.mutable_data<float, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
relu_img_kernel->SetParam(ReluParam);
std::unique_ptr<KernelContext> relu_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(relu_img_context->As<OpenCLContext>()));
relu_img_kernel->SetContext(std::move(relu_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: relu_img_kernel";
relu_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
#endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx];
break;
}
} }
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
} }
// #define RELU_FP16_LOOP_TEST
// #define RELU_FP16_PRINT_RESULT
TEST(relu_image2d_fp16, compute) { TEST(relu_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> "
"layout(img2buf) " "layout(img2buf) "
...@@ -340,9 +133,9 @@ TEST(relu_image2d_fp16, compute) { ...@@ -340,9 +133,9 @@ TEST(relu_image2d_fp16, compute) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2; mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *relu_in_data = relu_in.mutable_data<int16_t, cl::Image2D>( auto *relu_in_data = relu_in.mutable_data<uint16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]); relu_image2d_shape["width"], relu_image2d_shape["height"]);
auto *relu_out_data = relu_out.mutable_data<int16_t, cl::Image2D>( auto *relu_out_data = relu_out.mutable_data<uint16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]); relu_image2d_shape["width"], relu_image2d_shape["height"]);
// set context and kernel args // set context and kernel args
...@@ -413,14 +206,14 @@ TEST(relu_image2d_fp16, compute) { ...@@ -413,14 +206,14 @@ TEST(relu_image2d_fp16, compute) {
#endif #endif
} }
// #define RELU6_FP32_LOOP_TEST // #define RELU6_FP16_LOOP_TEST
// #define RELU6_FP32_PRINT_RESULT // #define RELU6_FP16_PRINT_RESULT
TEST(relu6_image2d_fp32, compute) { TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> "
"layout(img2buf) " "layout(img2buf) "
"-> host"; "-> host";
#ifdef RELU6_FP32_LOOP_TEST #ifdef RELU6_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) { for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) { for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) { for (int h = 12; h <= 100; h += 13) {
...@@ -430,7 +223,7 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -430,7 +223,7 @@ TEST(relu6_image2d_fp32, compute) {
const int c = 2; const int c = 2;
const int h = 3; const int h = 3;
const int w = 4; const int w = 4;
#endif // RELU6_FP32_LOOP_TEST #endif // RELU6_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========"; << h << " " << w << " ========";
...@@ -445,7 +238,7 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -445,7 +238,7 @@ TEST(relu6_image2d_fp32, compute) {
auto relu_img_kernels = auto relu_img_kernels =
KernelRegistry::Global().Create("relu6", KernelRegistry::Global().Create("relu6",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
...@@ -497,9 +290,9 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -497,9 +290,9 @@ TEST(relu6_image2d_fp32, compute) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2; mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *relu_in_data = relu_in.mutable_data<float, cl::Image2D>( auto *relu_in_data = relu_in.mutable_data<uint16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]); relu_image2d_shape["width"], relu_image2d_shape["height"]);
auto *relu_out_data = relu_out.mutable_data<float, cl::Image2D>( auto *relu_out_data = relu_out.mutable_data<uint16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]); relu_image2d_shape["width"], relu_image2d_shape["height"]);
// set context and kernel args // set context and kernel args
...@@ -536,13 +329,13 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -536,13 +329,13 @@ TEST(relu6_image2d_fp32, compute) {
// compute ref cpu // compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f); relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f);
// result // result
#ifdef RELU6_FP32_PRINT_RESULT #ifdef RELU6_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl; << std::endl;
} }
#endif // RELU6_FP32_PRINT_RESULT #endif // RELU6_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) { for (int eidx = 0; eidx < x_dim.production(); eidx++) {
...@@ -560,7 +353,7 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -560,7 +353,7 @@ TEST(relu6_image2d_fp32, compute) {
LOG(INFO) << "free: unmap x, y"; LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x); TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y); TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef RELU6_FP32_LOOP_TEST #ifdef RELU6_FP16_LOOP_TEST
} // w } // w
} // h } // h
} // c } // c
...@@ -570,14 +363,14 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -570,14 +363,14 @@ TEST(relu6_image2d_fp32, compute) {
#endif #endif
} }
// #define RELU6_FP16_LOOP_TEST // #define SIGMOID_FP16_LOOP_TEST
// #define RELU6_FP16_PRINT_RESULT // #define SIGMOID_FP16_PRINT_RESULT
TEST(relu6_image2d_fp16, compute) { TEST(sigmoid_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> sigmoid(img) -> "
"layout(img2buf) " "layout(img2buf) "
"-> host"; "-> host";
#ifdef RELU6_FP16_LOOP_TEST #ifdef SIGMOID_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) { for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) { for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) { for (int h = 12; h <= 100; h += 13) {
...@@ -587,7 +380,7 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -587,7 +380,7 @@ TEST(relu6_image2d_fp16, compute) {
const int c = 2; const int c = 2;
const int h = 3; const int h = 3;
const int w = 4; const int w = 4;
#endif // RELU6_FP16_LOOP_TEST #endif // SIGMOID_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========"; << h << " " << w << " ========";
...@@ -599,46 +392,45 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -599,46 +392,45 @@ TEST(relu6_image2d_fp16, compute) {
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create( auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)); "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto relu_img_kernels = auto sigmoid_img_kernels =
KernelRegistry::Global().Create("relu6", KernelRegistry::Global().Create("sigmoid",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(relu_img_kernels.empty()); ASSERT_FALSE(sigmoid_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front()); auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front()); auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto relu_img_kernel = std::move(relu_img_kernels.front()); auto sigmoid_img_kernel = std::move(sigmoid_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc(); LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc(); LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc(); LOG(INFO) << "get 3rd kernel: " << sigmoid_img_kernel->doc();
// set tensors about op param // set tensors about op param
LOG(INFO) << "set tensors about op param"; LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> relu_in // layout(buf->img): x -> sigmoid_in
// relu(img): relu_in -> relu_out // sigmoid(img): sigmoid_in -> sigmoid_out
// layout(img->buf): relu_out -> y // layout(img->buf): sigmoid_out -> y
lite::Tensor x, y, relu_in, relu_out, y_ref; lite::Tensor x, y, sigmoid_in, sigmoid_out, y_ref;
operators::LayoutParam BufferToImageParam; operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam; operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x; BufferToImageParam.x = &x;
BufferToImageParam.y = &relu_in; BufferToImageParam.y = &sigmoid_in;
ImageToBufferParam.x = &relu_out; ImageToBufferParam.x = &sigmoid_out;
ImageToBufferParam.y = &y; ImageToBufferParam.y = &y;
operators::ActivationParam ReluParam; operators::ActivationParam SigmoidParam;
ReluParam.X = &relu_in; SigmoidParam.X = &sigmoid_in;
ReluParam.Out = &relu_out; SigmoidParam.Out = &sigmoid_out;
ReluParam.Relu_clipped_coef = 6.f;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w}); const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim); x.Resize(x_dim);
y.Resize(x_dim); y.Resize(x_dim);
relu_in.Resize(x_dim); sigmoid_in.Resize(x_dim);
relu_out.Resize(x_dim); sigmoid_out.Resize(x_dim);
y_ref.Resize(x_dim); y_ref.Resize(x_dim);
auto relu_image2d_shape = auto sigmoid_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim); paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors // initialize tensors
...@@ -650,14 +442,19 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -650,14 +442,19 @@ TEST(relu6_image2d_fp16, compute) {
x_data, 0, sizeof(float) * x_dim.production())); x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map( auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production())); y_data, 0, sizeof(float) * x_dim.production()));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-1, 1);
for (int i = 0; i < x_dim.production(); ++i) { for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2; mapped_x[i] = static_cast<float>(dist(engine));
mapped_y[i] = static_cast<int>(0);
} }
auto *relu_in_data = relu_in.mutable_data<int16_t, cl::Image2D>( auto *sigmoid_in_data =
relu_image2d_shape["width"], relu_image2d_shape["height"]); sigmoid_in.mutable_data<uint16_t, cl::Image2D>(
auto *relu_out_data = relu_out.mutable_data<int16_t, cl::Image2D>( sigmoid_image2d_shape["width"],
relu_image2d_shape["width"], relu_image2d_shape["height"]); sigmoid_image2d_shape["height"]);
auto *sigmoid_out_data =
sigmoid_out.mutable_data<uint16_t, cl::Image2D>(
sigmoid_image2d_shape["width"],
sigmoid_image2d_shape["height"]);
// set context and kernel args // set context and kernel args
LOG(INFO) << "set context and kernel args"; LOG(INFO) << "set context and kernel args";
...@@ -676,39 +473,40 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -676,39 +473,40 @@ TEST(relu6_image2d_fp16, compute) {
&(img_to_buf_context->As<OpenCLContext>())); &(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context)); img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
relu_img_kernel->SetParam(ReluParam); sigmoid_img_kernel->SetParam(SigmoidParam);
std::unique_ptr<KernelContext> relu_img_context(new KernelContext); std::unique_ptr<KernelContext> sigmoid_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo( context->As<OpenCLContext>().CopySharedTo(
&(relu_img_context->As<OpenCLContext>())); &(sigmoid_img_context->As<OpenCLContext>()));
relu_img_kernel->SetContext(std::move(relu_img_context)); sigmoid_img_kernel->SetContext(std::move(sigmoid_img_context));
// run kernels // run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel"; LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch(); buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: relu_img_kernel"; LOG(INFO) << "run kernel: sigmoid_img_kernel";
relu_img_kernel->Launch(); sigmoid_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel"; LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch(); img_to_buf_kernel->Launch();
// compute ref cpu // compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f); sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result // result
#ifdef RELU6_FP16_PRINT_RESULT #ifdef SIGMOID_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl; << std::endl;
} }
#endif // RELU6_FP16_PRINT_RESULT #endif // SIGMOID_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) { for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6); EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-3);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) { if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-3) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref[" << " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y[" << eidx << "]: " << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx]; << eidx << "]: " << mapped_y[eidx] << ", mapped_x["
<< eidx << "]: " << mapped_x[eidx];
break; break;
} }
} }
...@@ -717,7 +515,7 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -717,7 +515,7 @@ TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "free: unmap x, y"; LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x); TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y); TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef RELU6_FP16_LOOP_TEST #ifdef SIGMOID_FP16_LOOP_TEST
} // w } // w
} // h } // h
} // c } // c
...@@ -730,17 +528,15 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -730,17 +528,15 @@ TEST(relu6_image2d_fp16, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// relu buffer // layout
// USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
// relu image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(relu, kOpenCL, kFloat, kImageDefault, ImageDefault);
// relu image2d fp16 // relu image2d fp16
USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
// relu6 image2d fp32 // relu6 image2d fp16
USE_LITE_KERNEL(relu6, kOpenCL, kFloat, kImageDefault, ImageDefault);
USE_LITE_KERNEL(relu6, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(relu6, kOpenCL, kFP16, kImageDefault, ImageDefault);
// sigmoid image2d fp16
USE_LITE_KERNEL(sigmoid, kOpenCL, kFP16, kImageDefault, ImageDefault);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Concat kernel operating on fp32 cl::Buffer data (NCHW layout).
//
// NOTE(review): this class originally derived from
// KernelLite<TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)>
// even though every data access below is a float cl::Buffer, the build
// option is -DCL_DTYPE_float, doc() reports "kFloat" and the kernel is
// registered as (kFloat, kNCHW). The template arguments are corrected here
// to match the actual implementation and its registration.
class ConcatCompute : public KernelLite<TARGET(kOpenCL),
                                        PRECISION(kFloat),
                                        DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ConcatParam;

  // Compiles the proper CL function ("concat2" for exactly two inputs,
  // "concat_mul" otherwise) and caches the concat geometry:
  //   pre_size_  = product of extents before `axis`
  //   post_size_ = product of extents after `axis`
  //   axis_size_ = output extent along `axis`
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    concat_param_ = param_.get_mutable<param_t>();
    if (concat_param_->x.size() == 2) {
      kernel_func_name_ = "concat2";
    } else {
      kernel_func_name_ = "concat_mul";
    }
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/concat_kernel.cl", build_options_);

    auto axis = concat_param_->axis;
    auto inputs = concat_param_->x;
    auto out_dims = concat_param_->output->dims();
    auto* axis_tensor = concat_param_->axis_tensor;
    if (axis_tensor != nullptr) {
      // TODO(review): the axis carried by AxisTensor is not read yet; the
      // attribute value above is used instead.
      // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
      // axis = axis_tensor_data[0];
    }
    auto in_dims = inputs[0]->dims();
    axis_size_ = out_dims[axis];
    axis_ = axis;
    for (int i = 0; i < axis; i++) {
      pre_size_ *= in_dims[i];
    }
    for (int i = axis + 1; i < in_dims.size(); i++) {
      post_size_ *= in_dims[i];
    }
    // All inputs must share rank and every extent except along `axis`.
    for (int i = 1; i < inputs.size(); i++) {
      auto dims = inputs[i]->dims();
      if (in_dims.size() != dims.size()) {
        printf("input shape must be same \n");
        return;
      }
      for (int i = 0; i < dims.size(); i++) {
        if (i != axis) {
          if (in_dims[i] != dims[i]) {
            printf("input shape must be same \n");
            return;
          }
        }
      }
    }
  }

  // Launches the buffer concat kernel. With exactly two inputs a single
  // "concat2" launch consumes both; otherwise "concat_mul" is launched once
  // per input, each copying its slice starting at offset `start` along the
  // concat axis.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.output->dims();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf =
        param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
    const auto& y_dims = param.output->dims();  // useless: check dim only

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;

    auto inputs = param.x;
    int arg_idx = 0;
    auto global_work_size = cl::NDRange{axis_size_};
    int total = axis_size_ * post_size_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    if (inputs.size() == 2) {
      auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
      auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
      auto axis0 = inputs[0]->dims()[axis_];
      int total0 = axis0 * post_size_;
      int total1 = (axis_size_ - axis0) * post_size_;
      cl_int status = kernel.setArg(arg_idx, *x_buf0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *x_buf1);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, axis_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, pre_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, post_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total1);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      // Consumers synchronize on out_buf via the wait list.
      context.cl_wait_list()->emplace(out_buf, event_);
    } else {
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
        arg_idx = 0;
        int size = inputs[i]->dims()[axis_];
        auto* x_buf = inputs[i]->data<float, cl::Buffer>();
        global_work_size = cl::NDRange{static_cast<size_t>(size)};
        int total0 = size * post_size_;
        cl_int status = kernel.setArg(arg_idx, *x_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, static_cast<int>(size));
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, pre_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, post_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, start);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, total);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, total0);
        CL_CHECK_FATAL(status);
        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            global_work_size,
            cl::NullRange,
            nullptr,
            event_.get());
        CL_CHECK_FATAL(status);
        context.cl_wait_list()->emplace(out_buf, event_);
        start += size;  // next input writes after this one's slice
      }
    }
  }

  std::string doc() { return "Concat using cl::Buffer, kFloat"; }

  int axis_size_ = 1;   // output extent along the concat axis
  int post_size_ = 1;   // product of extents after the axis
  int pre_size_ = 1;    // product of extents before the axis
  int axis_ = 1;        // concat axis
  param_t* concat_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the buffer-based concat kernel: fp32 NCHW cl::Buffer tensors on
// the OpenCL target. "AxisTensor" is an optional int32 tensor input carrying
// the concat axis (its value is not read yet -- see PrepareForRun).
typedef paddle::lite::kernels::opencl::ConcatCompute Concat_buffer;

REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// //
// // Licensed under the Apache License, Version 2.0 (the "License");
// // you may not use this file except in compliance with the License.
// // You may obtain a copy of the License at
// //
// // http://www.apache.org/licenses/LICENSE-2.0
// //
// // Unless required by applicable law or agreed to in writing, software
// // distributed under the License is distributed on an "AS IS" BASIS,
// // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// // See the License for the specific language governing permissions and
// // limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Reference CPU implementation of concatenating exactly two tensors along
// `axis` into `out_data`. Output layout per pre-slice is: all of in0's
// extent along the axis, then all of in1's.
//
// BUGFIX: the original never copied from `in1` at all -- the `j >=
// in0_dim[axis]` positions of the output were left untouched and the `in1`
// parameter was unused.
template <typename dtype>
void concat2_compute_ref(const dtype *in0,
                         const dtype *in1,
                         const int axis,
                         const DDim in0_dim,
                         const DDim in1_dim,
                         const DDim out_dim,
                         dtype *out_data) {
  int pre_size = 1;
  int post_size = 1;
  for (int i = 0; i < axis; i++) {
    pre_size *= in0_dim[i];
  }
  for (int i = axis + 1; i < in0_dim.size(); i++) {
    post_size *= in0_dim[i];
  }
  int axis_size = out_dim[axis];
  for (int i = 0; i < pre_size; i++) {
    for (int j = 0; j < axis_size; j++) {
      if (j < in0_dim[axis]) {
        // First input fills the leading part of the axis.
        memcpy(out_data, in0, sizeof(dtype) * post_size);
        in0 += post_size;
      } else {
        // Second input fills the remainder.
        memcpy(out_data, in1, sizeof(dtype) * post_size);
        in1 += post_size;
      }
      out_data += post_size;
    }
  }
}
// Reference CPU implementation of concatenating N tensors along `axis`.
//
// BUGFIX: the input pointers were never advanced, so when pre_size > 1
// every pre-slice of the output re-copied the *first* slice of each input.
// `ins_data` is received by value, so the local pointers can be advanced
// safely. (NOTE(review): std::vector<const DDim> is kept to match callers,
// although a vector of const elements is non-conforming C++.)
template <typename dtype>
void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
                            std::vector<const DDim> ins_dim,
                            int axis,
                            const DDim out_dim,
                            dtype *out_data) {
  int pre_size = 1;
  int post_size = 1;
  for (int i = 0; i < axis; i++) {
    pre_size *= ins_dim[0][i];
  }
  for (int i = axis + 1; i < ins_dim[0].size(); i++) {
    post_size *= ins_dim[0][i];
  }
  for (int i = 0; i < pre_size; i++) {
    for (int j = 0; j < ins_data.size(); j++) {
      // Per pre-slice, input j contributes ins_dim[j][axis] * post_size
      // contiguous elements.
      int size = post_size * ins_dim[j][axis];
      memcpy(out_data, ins_data[j], sizeof(dtype) * size);
      ins_data[j] += size;  // advance to this input's next pre-slice
      out_data += size;
    }
  }
}
// End-to-end test of the buffer-based OpenCL concat kernel: three randomly
// filled 1x2x3x4 inputs concatenated along axis 1 into a 1x6x3x4 output,
// checked against concat_mul_compute_ref within 1e-6.
TEST(opencl_concat_buffer, compute) {
  // prepare data
  const DDim x0_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x1_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x2_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{1, 6, 3, 4});
  lite::Tensor x0, x1, x2, out, out_ref;
  x0.Resize(x0_dim);
  x1.Resize(x1_dim);
  x2.Resize(x2_dim);
  out.Resize(out_dim);
  out_ref.Resize(out_dim);

  // Allocate device buffers, then map them so the host can fill random
  // values in the range [-10, 10).
  auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x0 = static_cast<float *>(
      TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
  auto *mapped_x1 = static_cast<float *>(
      TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
  auto *mapped_x2 = static_cast<float *>(
      TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
  for (int i = 0; i < x0_dim.production(); i++) {
    mapped_x0[i] = dist(engine);
  }
  for (int i = 0; i < x1_dim.production(); i++) {
    mapped_x1[i] = dist(engine);
  }
  for (int i = 0; i < x2_dim.production(); i++) {
    mapped_x2[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ConcatParam param;
  std::vector<lite::Tensor *> ins;
  ins.push_back(&x0);
  ins.push_back(&x1);
  ins.push_back(&x2);
  auto axis = 1;
  param.x = ins;
  param.output = &out;
  param.axis = axis;

  // Mirror the inputs for the CPU reference computation below.
  std::vector<const float *> ins_data;
  std::vector<const DDim> ins_dim;
  ins_data.push_back(mapped_x0);
  ins_data.push_back(mapped_x1);
  ins_data.push_back(mapped_x2);
  ins_dim.push_back(x0_dim);
  ins_dim.push_back(x1_dim);
  ins_dim.push_back(x2_dim);

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  auto kernels = KernelRegistry::Global().Create(
      "concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> concat_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(concat_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(concat_context));
  kernel->Launch();

  // The kernel is asynchronous: block on the event the kernel registered
  // for its output buffer before reading results.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.output->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check
  auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
  concat_mul_compute_ref<float>(ins_data, ins_dim, axis, out_dim, out_ref_data);
  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
  for (int i = 0; i < out_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x0_data, mapped_x0);
  TargetWrapperCL::Unmap(x1_data, mapped_x1);
  TargetWrapperCL::Unmap(x2_data, mapped_x2);
}
} // namespace lite
} // namespace paddle
// concat buffer
// Pull in the fp32 NCHW buffer concat kernel so this test binary links it.
USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/concat_compute.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// fp32 image-layout specialization: compiles "concat2" (two inputs) or
// "concat_mul" (general case) from image/concat_kernel.cl and caches the
// concat geometry (pre/post/axis sizes) from the first input's dims.
template <>
void ConcatCompute<PRECISION(kFloat),
                   DATALAYOUT(kImageDefault)>::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  concat_param_ = param_.get_mutable<param_t>();
  if (concat_param_->x.size() == 2) {
    kernel_func_name_ = "concat2";
  } else {
    kernel_func_name_ = "concat_mul";
  }
  context.cl_context()->AddKernel(
      kernel_func_name_, "image/concat_kernel.cl", build_options_);

  // UpdateParams<kFloat, kImageDefault>();
  auto axis = concat_param_->axis;
  auto inputs = concat_param_->x;
  auto out_dims = concat_param_->output->dims();
  auto* axis_tensor = concat_param_->axis_tensor;
  if (axis_tensor != nullptr) {
    // NOTE(review): the axis carried by AxisTensor is not read yet; the
    // attribute value above is used instead.
    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
    // axis = axis_tensor_data[0];
  }
  auto in_dims = inputs[0]->dims();
  axis_size_ = out_dims[axis];
  axis_ = axis;
  // pre_size_ = product of extents before the axis,
  // post_size_ = product of extents after it.
  for (int i = 0; i < axis; i++) {
    pre_size_ *= in_dims[i];
  }
  for (int i = axis + 1; i < in_dims.size(); i++) {
    post_size_ *= in_dims[i];
  }
  // All inputs must share rank and every extent except along the axis.
  for (int i = 1; i < inputs.size(); i++) {
    auto dims = inputs[i]->dims();
    // auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
    if (in_dims.size() != dims.size()) {
      printf("input shape must be same \n");
      return;
    }
    for (int i = 0; i < dims.size(); i++) {
      if (i != axis) {
        if (in_dims[i] != dims[i]) {
          printf("input shape must be same \n");
          return;
        }
      }
    }
  }
}
// fp32 image-layout Run: launches "concat2" once for two inputs, or
// "concat_mul" once per input otherwise. `flag` selects the image mapping
// used by the CL kernel (1 for c/w axes, 0 for n/h axes) and `width` the
// matching extent.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::Run() {
  auto& param = *param_.get_mutable<param_t>();
  const auto& x_dims = param.output->dims();
  auto image_shape = InitImageDimInfoWith(x_dims);
  auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
      image_shape["width"], image_shape["height"]);
  const auto& y_dims = param.output->dims();  // useless: check dim only

  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;

  auto inputs = param.x;
  int arg_idx = 0;
  // BUGFIX: the original initialized `width` with dims()[-1], an
  // out-of-bounds index. All supported axes overwrite `width` in the switch
  // below, so defaulting to the last extent only makes the unsupported-axis
  // path well-defined.
  int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
  auto global_work_size =
      cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                  static_cast<cl::size_type>(image_shape["height"])};
  VLOG(4) << TargetToStr(param.output->target());
  VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
          << image_shape["height"];
  VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
          << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
  VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
          << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];

  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  int flag = 1;  // cxw
  switch (axis_) {
    case 0:
      width = x_dims[2];  // n
      flag = 0;
      break;
    case 1:
      width = x_dims[3];  // c
      break;
    case 2:
      width = x_dims[0];  // h
      flag = 0;
      break;
    case 3:
    case -1:
      width = x_dims[1];  // w
      break;
    default:
      printf("this axis: %d does not support \n", axis_);
  }
  if (inputs.size() == 2) {
    auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
    auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
    cl_int status = kernel.setArg(arg_idx, *x_buf0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *x_buf1);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status =
        kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, flag);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, width);
    CL_CHECK_FATAL(status);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Block until the launch completes (no wait-list entry on this path).
    context.cl_context()->GetCommandQueue().finish();
  } else {
    auto start = 0;
    for (int i = 0; i < inputs.size(); i++) {
      arg_idx = 0;
      auto* x_buf = inputs[i]->data<float, cl::Image2D>();
      cl_int status = kernel.setArg(arg_idx, *x_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, axis_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, start);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, flag);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, width);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_context()->GetCommandQueue().finish();
      start += inputs[i]->dims()[axis_];  // next input's offset on the axis
    }
  }
}
// Human-readable kernel description used for logging/debugging.
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::doc() {
  return "Concat using cl::Image, kFloat";
}
// fp32 buffer-layout specialization: compiles "concat2" (two inputs) or
// "concat_mul" (general case) from buffer/concat_kernel.cl and caches the
// concat geometry (pre/post/axis sizes) from the first input's dims.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  concat_param_ = param_.get_mutable<param_t>();
  if (concat_param_->x.size() == 2) {
    kernel_func_name_ = "concat2";
  } else {
    kernel_func_name_ = "concat_mul";
  }
  context.cl_context()->AddKernel(
      kernel_func_name_, "buffer/concat_kernel.cl", build_options_);

  // UpdateParams<kFloat, kImageDefault>();
  auto axis = concat_param_->axis;
  auto inputs = concat_param_->x;
  auto out_dims = concat_param_->output->dims();
  auto* axis_tensor = concat_param_->axis_tensor;
  if (axis_tensor != nullptr) {
    // NOTE(review): the axis carried by AxisTensor is not read yet; the
    // attribute value above is used instead.
    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
    // axis = axis_tensor_data[0];
  }
  auto in_dims = inputs[0]->dims();
  axis_size_ = out_dims[axis];
  axis_ = axis;
  // pre_size_ = product of extents before the axis,
  // post_size_ = product of extents after it.
  for (int i = 0; i < axis; i++) {
    pre_size_ *= in_dims[i];
  }
  for (int i = axis + 1; i < in_dims.size(); i++) {
    post_size_ *= in_dims[i];
  }
  // All inputs must share rank and every extent except along the axis.
  for (int i = 1; i < inputs.size(); i++) {
    auto dims = inputs[i]->dims();
    if (in_dims.size() != dims.size()) {
      printf("input shape must be same \n");
      return;
    }
    for (int i = 0; i < dims.size(); i++) {
      if (i != axis) {
        if (in_dims[i] != dims[i]) {
          printf("input shape must be same \n");
          return;
        }
      }
    }
  }
}
// fp32 buffer-layout Run: with two inputs a single "concat2" launch handles
// both; otherwise "concat_mul" is launched once per input, each copying its
// slice starting at offset `start` along the concat axis. Synchronization
// with consumers goes through the cl_wait_list keyed by out_buf.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::Run() {
  auto& param = *param_.get_mutable<param_t>();
  const auto& x_dims = param.output->dims();
  auto image_shape = InitImageDimInfoWith(x_dims);
  auto* out_buf =
      param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  const auto& y_dims = param.output->dims();  // useless: check dim only

  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;

  auto inputs = param.x;
  int arg_idx = 0;
  auto global_work_size = cl::NDRange{axis_size_};
  int total = axis_size_ * post_size_;
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  if (inputs.size() == 2) {
    auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
    auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
    auto axis0 = inputs[0]->dims()[axis_];
    // total0/total1: element counts contributed by each input per launch.
    int total0 = axis0 * post_size_;
    int total1 = (axis_size_ - axis0) * post_size_;
    cl_int status = kernel.setArg(arg_idx, *x_buf0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *x_buf1);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, axis_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, pre_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, post_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total1);
    CL_CHECK_FATAL(status);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    context.cl_wait_list()->emplace(out_buf, event_);
  } else {
    auto start = 0;
    for (int i = 0; i < inputs.size(); i++) {
      arg_idx = 0;
      int size = inputs[i]->dims()[axis_];
      auto* x_buf = inputs[i]->data<float, cl::Buffer>();
      global_work_size = cl::NDRange{static_cast<size_t>(size)};
      int total0 = size * post_size_;
      cl_int status = kernel.setArg(arg_idx, *x_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, static_cast<int>(size));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, pre_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, post_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, start);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total0);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_wait_list()->emplace(out_buf, event_);
      start += size;  // next input writes after this one's slice
    }
  }
}
// Human-readable kernel description used for logging/debugging.
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::doc() {
  return "Concat using cl::Buffer, kFloat";
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Aliases for the two fp32 concat specializations defined above.
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
                                                     DATALAYOUT(kNCHW)>
    Concat_buffer;

typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
                                                     DATALAYOUT(kImageDefault)>
    Concat_image;

// Register the image-layout fp32 kernel. "AxisTensor" is an optional int32
// tensor input (its value is not read yet -- see PrepareForRun).
REGISTER_LITE_KERNEL(
    concat, kOpenCL, kFloat, kImageDefault, Concat_image, ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Buffer-layout registration is intentionally kept disabled.
// REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
//     .BindInput("X",
//                {LiteType::GetTensorTy(TARGET(kOpenCL),
//                                       PRECISION(kFloat),
//                                       DATALAYOUT(kNCHW))})
//     .BindInput("AxisTensor",
//                {LiteType::GetTensorTy(TARGET(kOpenCL),
//                                       PRECISION(kInt32),
//                                       DATALAYOUT(kNCHW))})
//     .BindOutput("Out",
//                 {LiteType::GetTensorTy(TARGET(kOpenCL),
//                                        PRECISION(kFloat),
//                                        DATALAYOUT(kNCHW))})
//     .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Concat kernel operating on fp16 cl::Image2D data (half values handled as
// uint16_t on the host side; the CL program is built with -DCL_DTYPE_half).
class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
                                             PRECISION(kFP16),
                                             DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ConcatParam;

  // Compiles "concat2" (two inputs) or "concat_mul" (general case) from
  // image/concat_kernel.cl and caches the concat geometry:
  //   pre_size_  = product of extents before `axis`
  //   post_size_ = product of extents after `axis`
  //   axis_size_ = output extent along `axis`
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    concat_param_ = param_.get_mutable<param_t>();
    if (concat_param_->x.size() == 2) {
      kernel_func_name_ = "concat2";
    } else {
      kernel_func_name_ = "concat_mul";
    }
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/concat_kernel.cl", build_options_);

    auto axis = concat_param_->axis;
    auto inputs = concat_param_->x;
    auto out_dims = concat_param_->output->dims();
    auto* axis_tensor = concat_param_->axis_tensor;
    if (axis_tensor != nullptr) {
      // TODO(review): the axis carried by AxisTensor is not read yet; the
      // attribute value above is used instead.
      // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
      // axis = axis_tensor_data[0];
    }
    auto in_dims = inputs[0]->dims();
    axis_size_ = out_dims[axis];
    axis_ = axis;
    for (int i = 0; i < axis; i++) {
      pre_size_ *= in_dims[i];
    }
    for (int i = axis + 1; i < in_dims.size(); i++) {
      post_size_ *= in_dims[i];
    }
    // All inputs must share rank and every extent except along `axis`.
    for (int i = 1; i < inputs.size(); i++) {
      auto dims = inputs[i]->dims();
      // auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
      if (in_dims.size() != dims.size()) {
        printf("input shape must be same \n");
        return;
      }
      for (int i = 0; i < dims.size(); i++) {
        if (i != axis) {
          if (in_dims[i] != dims[i]) {
            printf("input shape must be same \n");
            return;
          }
        }
      }
    }
  }

  // Launches "concat2" once for two inputs, or "concat_mul" once per input
  // otherwise. `flag` selects the image mapping used by the CL kernel
  // (1 for c/w axes, 0 for n/h axes) and `width` the matching extent.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.output->dims();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf = param.output->mutable_data<uint16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.output->dims();  // useless: check dim only

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;

    auto inputs = param.x;
    int arg_idx = 0;
    // BUGFIX: the original initialized `width` with dims()[-1], an
    // out-of-bounds index. All supported axes overwrite `width` in the
    // switch below, so defaulting to the last extent only makes the
    // unsupported-axis path well-defined.
    int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    VLOG(4) << TargetToStr(param.output->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];

    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    int flag = 1;  // cxw
    switch (axis_) {
      case 0:
        width = x_dims[2];  // n
        flag = 0;
        break;
      case 1:
        width = x_dims[3];  // c
        break;
      case 2:
        width = x_dims[0];  // h
        flag = 0;
        break;
      case 3:
      case -1:
        width = x_dims[1];  // w
        break;
      default:
        printf("this axis: %d does not support \n", axis_);
    }
    if (inputs.size() == 2) {
      auto* x_buf0 = inputs[0]->data<uint16_t, cl::Image2D>();
      auto* x_buf1 = inputs[1]->data<uint16_t, cl::Image2D>();
      cl_int status = kernel.setArg(arg_idx, *x_buf0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *x_buf1);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status =
          kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, flag);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, width);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      // Block until the launch completes (no wait-list entry on this path).
      context.cl_context()->GetCommandQueue().finish();
    } else {
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
        arg_idx = 0;
        auto* x_buf = inputs[i]->data<uint16_t, cl::Image2D>();
        cl_int status = kernel.setArg(arg_idx, *x_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, axis_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, start);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, flag);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, width);
        CL_CHECK_FATAL(status);
        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            global_work_size,
            cl::NullRange,
            nullptr,
            event_.get());
        CL_CHECK_FATAL(status);
        context.cl_context()->GetCommandQueue().finish();
        start += inputs[i]->dims()[axis_];  // next input's axis offset
      }
    }
  }

  std::string doc() { return "Concat using cl::Image, kFP16"; }

  int axis_size_ = 1;   // output extent along the concat axis
  int post_size_ = 1;   // product of extents after the axis
  int pre_size_ = 1;    // product of extents before the axis
  int axis_ = 1;        // concat axis
  param_t* concat_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the image-layout fp16 concat kernel. "AxisTensor" is an optional
// int32 tensor input (its value is not read yet -- see PrepareForRun).
typedef paddle::lite::kernels::opencl::ConcatComputeImage Concat_image;

REGISTER_LITE_KERNEL(
    concat, kOpenCL, kFP16, kImageDefault, Concat_image, ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -73,106 +76,10 @@ void concat_mul_compute_ref(std::vector<const dtype *> ins_data, ...@@ -73,106 +76,10 @@ void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
} }
} }
} }
#if 0 // concat_buffer
// Disabled buffer-path concat test (the caller wraps this block in
// `#if 0`): concatenates three 1x2x3x4 float tensors along axis 1 using
// the OpenCL buffer concat kernel, then checks the mapped GPU result
// against a CPU reference with a 1e-6 tolerance.
TEST(opencl_concat_buffer, compute) {
  // prepare data
  const DDim x0_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x1_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x2_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{1, 6, 3, 4});
  lite::Tensor x0, x1, x2, out, out_ref;
  x0.Resize(x0_dim);
  x1.Resize(x1_dim);
  x2.Resize(x2_dim);
  out.Resize(out_dim);
  out_ref.Resize(out_dim);

  // Allocate device buffers, then map them so the host can fill them
  // with uniform random values in [-10, 10).
  auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x0 = static_cast<float *>(
      TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
  auto *mapped_x1 = static_cast<float *>(
      TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
  auto *mapped_x2 = static_cast<float *>(
      TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
  for (int i = 0; i < x0_dim.production(); i++) {
    mapped_x0[i] = dist(engine);
  }
  for (int i = 0; i < x1_dim.production(); i++) {
    mapped_x1[i] = dist(engine);
  }
  for (int i = 0; i < x2_dim.production(); i++) {
    mapped_x2[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ConcatParam param;
  std::vector<lite::Tensor *> ins;
  ins.push_back(&x0);
  ins.push_back(&x1);
  ins.push_back(&x2);
  auto axis = 1;
  param.x = ins;
  param.output = &out;
  param.axis = axis;

  // NOTE(review): std::vector<const DDim> is ill-formed per the C++
  // standard (allocators require a non-const value type) and newer
  // standard libraries reject it; consider std::vector<DDim>.
  std::vector<const float *> ins_data;
  std::vector<const DDim> ins_dim;
  ins_data.push_back(mapped_x0);
  ins_data.push_back(mapped_x1);
  ins_data.push_back(mapped_x2);
  ins_dim.push_back(x0_dim);
  ins_dim.push_back(x1_dim);
  ins_dim.push_back(x2_dim);

  // Create the OpenCL context and the buffer-layout concat kernel.
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  auto kernels = KernelRegistry::Global().Create(
      "concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> concat_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(concat_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(concat_context));
  kernel->Launch();

  // Block on the event registered for the output buffer so the kernel
  // has finished before the result is read back.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.output->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check
  auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
  concat_mul_compute_ref<float>(ins_data, ins_dim, axis, out_dim, out_ref_data);
  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
  for (int i = 0; i < out_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
  }

  // Unmap everything that was mapped above.
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x0_data, mapped_x0);
  TargetWrapperCL::Unmap(x1_data, mapped_x1);
  TargetWrapperCL::Unmap(x2_data, mapped_x2);
}
#endif // concat_buffer
// #define LOOP_TEST // #define LOOP_TEST
// #define PRINT_RESULT // #define PRINT_RESULT
TEST(concat_image2d_fp32, compute) { TEST(concat_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> "
"layout(img2buf) " "layout(img2buf) "
"-> host"; "-> host";
...@@ -209,7 +116,7 @@ TEST(concat_image2d_fp32, compute) { ...@@ -209,7 +116,7 @@ TEST(concat_image2d_fp32, compute) {
auto concat_img_kernels = auto concat_img_kernels =
KernelRegistry::Global().Create("concat", KernelRegistry::Global().Create("concat",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels1.empty()); ASSERT_FALSE(buf_to_img_kernels1.empty());
...@@ -284,14 +191,18 @@ TEST(concat_image2d_fp32, compute) { ...@@ -284,14 +191,18 @@ TEST(concat_image2d_fp32, compute) {
for (int i = 0; i < out_dim.production(); ++i) { for (int i = 0; i < out_dim.production(); ++i) {
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *concat_in_data0 = concat_in0.mutable_data<float, cl::Image2D>( auto *concat_in_data0 =
concat_in0.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape_in0["width"], concat_image2d_shape_in0["width"],
concat_image2d_shape_in0["height"]); concat_image2d_shape_in0["height"]);
auto *concat_in_data1 = concat_in1.mutable_data<float, cl::Image2D>( auto *concat_in_data1 =
concat_in1.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape_in1["width"], concat_image2d_shape_in1["width"],
concat_image2d_shape_in1["height"]); concat_image2d_shape_in1["height"]);
auto *concat_out_data = concat_out.mutable_data<float, cl::Image2D>( auto *concat_out_data =
concat_image2d_shape["width"], concat_image2d_shape["height"]); concat_out.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape["width"],
concat_image2d_shape["height"]);
// set context and kernel args // set context and kernel args
LOG(INFO) << "set context and kernel args"; LOG(INFO) << "set context and kernel args";
...@@ -347,22 +258,35 @@ TEST(concat_image2d_fp32, compute) { ...@@ -347,22 +258,35 @@ TEST(concat_image2d_fp32, compute) {
#ifdef PRINT_RESULT #ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < out_dim.production(); ++eidx) { for (int eidx = 0; eidx < out_dim.production(); ++eidx) {
std::cout << mapped_x0[eidx] << ", " << mapped_x1[eidx] << " -> " std::cout << "x0[" << eidx << "]:" << mapped_x0[eidx] << ",\t x1["
<< mapped_y[eidx] << std::endl; << eidx << "]:" << mapped_x1[eidx] << " -> y[" << eidx
<< "]:" << mapped_y[eidx] << "\t, y_ref[" << eidx
<< "]:" << y_data_ref[eidx] << ",\t IS_DIFF_PASSED:"
<< IS_DIFF_PASSED(
y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF)
<< std::endl;
} }
#endif // PRINT_RESULT #endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < out_dim.production(); eidx++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6); auto abs_diff = abs(y_data_ref[i] - mapped_y[i]);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) { auto relative_diff =
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]);
<< " / " << x0_dim.production() << ", y_data_ref[" EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y[" (abs_diff <= FP16_MAX_DIFF),
<< eidx << "]:" << mapped_y[eidx]; true);
if ((relative_diff > FP16_MAX_DIFF) &&
(abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " mapped_y[" << i
<< "]:" << mapped_y[i] << " y_data_ref[" << i
<< "]:" << y_data_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
break; break;
} }
} }
// free // free
LOG(INFO) << "free: unmap x, y"; LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data0, mapped_x0); TargetWrapperCL::Unmap(x_data0, mapped_x0);
...@@ -382,9 +306,9 @@ TEST(concat_image2d_fp32, compute) { ...@@ -382,9 +306,9 @@ TEST(concat_image2d_fp32, compute) {
} // namespace paddle } // namespace paddle
// concat buffer // concat buffer
// USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def); // USE_LITE_KERNEL(concat, kOpenCL, kFP16, kNCHW, def);
// concat image2d fp32 // concat image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(concat, kOpenCL, kFloat, kImageDefault, ImageDefault); USE_LITE_KERNEL(concat, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/opencl/conv_compute.h" #include "lite/kernels/opencl/conv_buffer_compute.h"
#include <sstream> #include <sstream>
...@@ -1431,50 +1431,14 @@ void ConvImageCompute::Run() { (this->*impl_)(); } ...@@ -1431,50 +1431,14 @@ void ConvImageCompute::Run() { (this->*impl_)(); }
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(conv2d,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::ConvCompute,
// def)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(conv2d, REGISTER_LITE_KERNEL(conv2d,
kOpenCL, kOpenCL,
kFloat, kFloat,
kImageDefault, kNCHW,
paddle::lite::kernels::opencl::ConvImageCompute, paddle::lite::kernels::opencl::ConvCompute,
image2d) def)
.BindInput("Input", .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
{LiteType::GetTensorTy(TARGET(kOpenCL), .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
PRECISION(kFloat), .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
DATALAYOUT(kImageDefault))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ConvImageCompute,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -58,34 +58,6 @@ class ConvCompute ...@@ -58,34 +58,6 @@ class ConvCompute
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override;
void Run() override;
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3();
void DepthwiseConv2d();
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
};
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -167,7 +167,6 @@ void PrintData(std::string name, ...@@ -167,7 +167,6 @@ void PrintData(std::string name,
} }
// buffer // buffer
#if 0
// #define PRINT_RESULT // #define PRINT_RESULT
#define LOOP_TEST #define LOOP_TEST
TEST(conv2d, compute_conv2d_1x1) { TEST(conv2d, compute_conv2d_1x1) {
...@@ -625,9 +624,8 @@ TEST(conv2d, compute_conv2d_gemm) { ...@@ -625,9 +624,8 @@ TEST(conv2d, compute_conv2d_gemm) {
} // batch_size } // batch_size
#endif #endif
} }
#endif
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def); USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/conv_image_compute.h"
#include <sstream>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
/* image kernel*/
// Selects the conv image2d kernel variant (1x1 / depthwise / 3x3 / 5x5 /
// 7x7) from the filter geometry, converts the CPU filter (and optional
// bias) buffers into FP16 (half, stored as uint16_t) cl::Image2D data,
// assembles the activation/bias compile-time defines and compiles the
// selected OpenCL kernel. Must run before the first Run().
void ConvImageCompute::PrepareForRun() {
  const auto& param = this->Param<param_t>();
  auto x_dims = param.x->dims();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();

  float* filter_cpu = param.filter->mutable_data<float>();
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);

  int kernel_h = filter_dims[2];  // filter layout is OIHW
  int kernel_w = filter_dims[3];
  auto paddings = *param.paddings;
  auto dilations = *param.dilations;
  int stride_h = param.strides[0];
  int stride_w = param.strides[1];
  int pad_h = paddings[0];
  int pad_w = paddings[2];
  int groups = param.groups;
  bool relu_fused = param.fuse_relu;

  bool pad_equal =
      ((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) &&
       (paddings[2] == paddings[3]));
  bool stride_equal = stride_h == stride_w;
  bool dilation_equal = dilations[0] == dilations[1];
  // Only symmetric padding, square stride and square dilation are
  // supported by the image kernels below.
  CHECK(pad_equal && stride_equal && dilation_equal);

  VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No");
  // Fixed: the last field used to log kernel_h twice instead of kernel_w.
  VLOG(3) << "groups:" << groups << " stride_h:" << stride_h
          << " stride_w:" << stride_w << " pad_h:" << pad_h
          << " pad_w:" << pad_w << " kernel_h:" << kernel_h
          << " kernel_w:" << kernel_w;
  VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
          << " " << x_dims[3];
  VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " "
          << output_dims[2] << " " << output_dims[3];
  VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " "
          << filter_dims[2] << " " << filter_dims[3];

  if (kernel_h == 1 && kernel_w == 1) {
    // conv2d_1x1: the "simple" variant requires input channels % 4 == 0.
    if (param.x->dims()[1] % 4 == 0) {
      kernel_func_names_.push_back("conv2d_1x1_simple");
    } else {
      kernel_func_names_.push_back("conv2d_1x1");
    }
    kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl");

    // Convert the CPU filter buffer to an FP16 image.
    CLImageConverterNWBlock converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::Conv2d1x1;
#if 1  // TODO(ysh329): enable general dwconv
  } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1]) {
#else   // TODO(ysh329): remove dwconv3x3s1 and dwconv3x3 temporarily, need fix
  } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
             kernel_h == 3 && kernel_w == 3 && groups > 1) {
    // depth_conv2d_3x3s1, depth_conv2d_3x3
    if (stride_h == 1 && dilations[0] == 1) {
      kernel_func_names_.push_back("depth_conv2d_3x3s1");
      impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1;
    } else {
      kernel_func_names_.push_back("depth_conv2d_3x3");
      impl_ = &ConvImageCompute::DepthwiseConv2d3x3;
    }
    kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl");

    CLImageConverterNWBlock converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

  } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
             kernel_h != 3) {
#endif
    // depth_conv2d (generic depthwise)
    kernel_func_names_.push_back("depth_conv2d");
    kernel_func_paths_.push_back("image/depthwise_conv2d_basic_kernel.cl");

    CLImageConverterNWBlock converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::DepthwiseConv2d;
  } else if (kernel_h == 3 && kernel_w == 3) {
    // conv2d_3x3. Fixed: the condition used to test `kernel_h == 3` twice,
    // so a 3xN (N != 3) filter would wrongly select the 3x3 kernel.
    kernel_func_names_.push_back("conv2d_3x3");
    kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");

    CLImageConverterFolder converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::Conv2d3x3;
  } else if (kernel_h == 5 && kernel_w == 5) {
    // conv2d_5x5
    kernel_func_names_.push_back("conv2d_5x5");
    kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl");

    CLImageConverterFolder converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::Conv2d5x5;
  } else if (kernel_h == 7 && kernel_w == 7) {
    // conv2d_7x7
    kernel_func_names_.push_back("conv2d_7x7");
    kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl");

    CLImageConverterFolder converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::Conv2d7x7;
  } else {
    LOG(FATAL) << "conv image compute not support this condition yet! ";
  }
  VLOG(1) << "kernel_func_names_[0]:" << kernel_func_names_[0]
          << " kernel_func_paths_[0]:" << kernel_func_paths_[0];

  // All image kernels are compiled for half precision.
  std::string build_options_single(" -DCL_DTYPE_half");
  // relu options
  if (relu_fused) {
    build_options_single += " -DRELU";
  } else if (param.activation_param.active_type ==
             lite_api::ActivationType::kRelu6) {
    build_options_single += " -DRELU6";
  } else {
    // do nothing, may add more activation fuse
  }
  // bias options: element-wise when bias dims equal output dims,
  // otherwise per-channel.
  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  if (has_bias) {
    build_options_single +=
        is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";

    // convert cpu buffer bias --> gpu image
    CLImageConverterFolder bias_converter;
    const DDim& bias_image_dims =
        bias_converter.InitImageDimInfoWith(param.bias->dims());
    std::vector<uint16_t> bias_image_v(bias_image_dims[0] * bias_image_dims[1] *
                                       4);
    float* bias_cpu_data = param.bias->mutable_data<float>();
    bias_converter.NCHWToImage(
        bias_cpu_data, bias_image_v.data(), param.bias->dims());
    bias_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        bias_image_dims[0], bias_image_dims[1], bias_image_v.data());
    // convert cpu buffer bias --> gpu image --- end ----
  }

  build_options_.push_back(build_options_single);
  for (size_t i = 0; i < kernel_func_names_.size(); i++) {
    context.cl_context()->AddKernel(
        kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
  }
}
// Runs the 1x1 convolution image kernel prepared in PrepareForRun():
// sets all kernel arguments (input/filter/optional bias images, output
// image, geometry scalars), enqueues the NDRange and registers the
// completion event on the wait list for the output image.
// Fixed: removed the dead `const cl::Buffer* bias_buf` local -- the bias
// is always passed as a cl::Image2D.
void ConvImageCompute::Conv2d1x1() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();

  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);

  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);

  // calc input_c_block
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;

  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];

  VLOG(4) << "============ conv2d_1x1 params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";

  // The 1x1 kernel only supports square stride/dilation/padding.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);

  // Bias (if any) was uploaded as an image in PrepareForRun().
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }

  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  std::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());

  // The kernel processes 4 output columns per work item.
  int maped_w = maptofactor(w, 4);
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "maped_w: " << maped_w;
  VLOG(4) << "hasbias: " << has_bias;

  cl_int status;
  int arg_idx = 0;
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, maped_w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  // The bias argument exists only when the kernel was built with a
  // -DBIASE_* define; the argument order below must match the .cl file.
  if (has_bias) {
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);

  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(maped_w),
                  static_cast<size_t>(default_work_size.data()[2])};

  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";

  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so downstream consumers of the output image can
  // synchronize on kernel completion.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Runs the 3x3 convolution image kernel prepared in PrepareForRun():
// recomputes an effective group count, sets all kernel arguments
// (input/filter/optional bias images, output image, geometry scalars),
// enqueues the NDRange and registers the completion event for the
// output image on the wait list.
void ConvImageCompute::Conv2d3x3() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();

  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int input_channel = input_dims[1];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  int output_channel = output_dims[1];
  int filter_width = filter_dims[3];
  int filter_height = filter_dims[2];
  int filter_channel = filter_dims[1];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);

  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);

  // calc input_c_block
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;

  // re-calc group: 1 for an ordinary (non-grouped) conv, otherwise
  // derived from channel ratio; depthwise (filter_channel == 1 with
  // matching in/out channels) keeps param.groups.
  int new_groups{param.groups};
  if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) {
    new_groups = 1;
  } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
    new_groups = input_channel / filter_channel;
  }
  /* TODO(ysh329): mobile has no case below
     else {
      LOG(FATAL) << "Not support conv3x3 case with"
                 << " input_dims:" << input_dims << " output_dims:" <<
     output_dims
                 << " filter_dims:" << filter_dims;
    }
  */

  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];

  VLOG(4) << "============ conv2d params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "input_dims: " << input_dims;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "param.groups(groups):" << param.groups;
  VLOG(4) << "new_groups:" << new_groups;
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";

  // The kernel only supports square stride/dilation/padding.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);

  // Bias (if any) was uploaded as an image in PrepareForRun().
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }

  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "w: " << w;

  // Argument order below must match image/conv2d_3x3_kernel.cl; the bias
  // argument is present only when a -DBIASE_* define was used.
  cl_int status;
  int arg_idx = 0;
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  if (has_bias) {
    VLOG(4) << "set bias_image: ";
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_channel);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_channel);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, new_groups);
  CL_CHECK_FATAL(status);

  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(default_work_size.data()[1]),
                  static_cast<size_t>(default_work_size.data()[2])};

  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";

  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so consumers of the output image can synchronize.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Enqueues the 5x5 convolution OpenCL image kernel.
//
// Binds {c_block, w, nh}, the input/filter(/bias) images, the output image
// and the scalar shape parameters in the exact order expected by the
// conv2d_5x5 CL kernel, then launches it over the default 3-D work size.
// FP16 image data is addressed as uint16_t (half) on the host side.
// (Removed dead locals filter_width/filter_height: the 5x5 kernel does not
// take filter-size arguments.)
void ConvImageCompute::Conv2d5x5() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);
  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  // Distance from the filter center to its edge, shifted by the (symmetric)
  // padding; passed to the kernel as the sampling offset.
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);
  // calc input_c_block: channel blocks of 4 folded into the image width
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;
  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
  VLOG(4) << "============ conv2d params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "input_dims: " << input_dims;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";
  // The kernel only receives one stride/padding/dilation value, so the
  // two spatial dimensions must agree.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "w: " << w;
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly; the bias
  // image is only bound when the kernel was built with a bias define.
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  if (has_bias) {
    VLOG(4) << "set bias_image: ";
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(default_work_size.data()[1]),
                  static_cast<size_t>(default_work_size.data()[2])};
  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of out_image can wait for completion.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Enqueues the 7x7 convolution OpenCL image kernel.
//
// Mirrors Conv2d5x5: binds {c_block, w, nh}, the input/filter(/bias) images,
// the output image and the scalar shape parameters in the exact order the
// conv2d_7x7 CL kernel expects, then launches over the default 3-D work size.
// FP16 image data is addressed as uint16_t (half) on the host side.
// (Removed dead locals filter_width/filter_height: the 7x7 kernel does not
// take filter-size arguments.)
void ConvImageCompute::Conv2d7x7() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);
  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  // Distance from the filter center to its edge, shifted by the (symmetric)
  // padding; passed to the kernel as the sampling offset.
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);
  // calc input_c_block: channel blocks of 4 folded into the image width
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;
  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
  VLOG(4) << "============ conv2d params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "input_dims: " << input_dims;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";
  // The kernel only receives one stride/padding/dilation value, so the
  // two spatial dimensions must agree.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "w: " << w;
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly; the bias
  // image is only bound when the kernel was built with a bias define.
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  if (has_bias) {
    VLOG(4) << "set bias_image: ";
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(default_work_size.data()[1]),
                  static_cast<size_t>(default_work_size.data()[2])};
  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of out_image can wait for completion.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Enqueues the stride-1 3x3 depthwise convolution image kernel.
// Global work size is {output-channel blocks of 4, output width in blocks
// of 2, N * output_height}; image data is FP16, addressed as uint16_t.
void ConvImageCompute::DepthwiseConv2d3x3s1() {
  const auto& param = *param_.get_mutable<param_t>();
  auto x_dims = param.x->dims();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto dilations = *param.dilations;
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  auto* input_img = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_img = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  const cl::Image2D* bias_img = nullptr;
  if (param.bias) {
    // NOTE(review): bias_img is fetched here but never bound with setArg
    // below -- confirm whether the depthwise 3x3s1 CL kernel takes a bias
    // argument (missing setArg?) or whether this lookup is dead code.
    bias_img = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto image_shape = InitImageDimInfoWith(output_dims);
  auto* output_img = param.output->mutable_data<uint16_t, cl::Image2D>(
      image_shape["width"], image_shape["height"]);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  int c_block = (output_dims[1] + 3) / 4;  // channels packed 4-per-pixel
  int w = output_dims[3];
  int nh = output_dims[0] * output_dims[2];
  // Each work item computes a block of 2 output columns.
  int w_blk_size = 2;
  int w_blk = (w + w_blk_size - 1) / w_blk_size;
  auto global_work_size = cl::NDRange(c_block, w_blk, nh);
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly.
  status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(w_blk));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *output_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[1]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
  CL_CHECK_FATAL(status);
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of output_img can wait for completion.
  context.cl_wait_list()->emplace(output_img, event_);
}
// Enqueues the general (any-stride) 3x3 depthwise convolution image kernel.
// Global work size is {output-channel blocks of 4, output width,
// N * output_height}; image data is FP16, addressed as uint16_t.
void ConvImageCompute::DepthwiseConv2d3x3() {
  const auto& param = *param_.get_mutable<param_t>();
  auto x_dims = param.x->dims();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto dilations = *param.dilations;
  // Sampling offset: distance from the filter center minus the padding.
  int offset = filter_dims[2] / 2 - paddings[0];
  int input_c_block = (x_dims[1] + 3) / 4;  // input channels packed 4-per-pixel
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  auto* input_img = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_img = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  const cl::Image2D* bias_img = nullptr;
  if (param.bias) {
    // NOTE(review): bias_img is fetched here but never bound with setArg
    // below -- confirm whether the depthwise 3x3 CL kernel takes a bias
    // argument (missing setArg?) or whether this lookup is dead code.
    bias_img = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto image_shape = InitImageDimInfoWith(output_dims);
  auto* output_img = param.output->mutable_data<uint16_t, cl::Image2D>(
      image_shape["width"], image_shape["height"]);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  int c_block = (output_dims[1] + 3) / 4;  // output channels packed 4-per-pixel
  int w = output_dims[3];
  int nh = output_dims[0] * output_dims[2];
  auto global_work_size = cl::NDRange(c_block, w, nh);
  VLOG(4) << "setArg";
  VLOG(4) << "c_block = " << c_block;
  VLOG(4) << "w = " << w;
  VLOG(4) << "nh = " << nh;
  VLOG(4) << "strides = " << strides[0];
  VLOG(4) << "offset = " << offset;
  VLOG(4) << "dilations = " << dilations[0];
  VLOG(4) << "input_c_block = " << input_c_block;
  VLOG(4) << "x_dims[3] = " << x_dims[3];
  VLOG(4) << "x_dims[2] = " << x_dims[2];
  VLOG(4) << "output_dims[3] = " << output_dims[3];
  VLOG(4) << "output_dims[2] = " << output_dims[2];
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly.
  status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(w));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *output_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(offset));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(input_c_block));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
  CL_CHECK_FATAL(status);
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of output_img can wait for completion.
  context.cl_wait_list()->emplace(output_img, event_);
}
// Enqueues the general depthwise convolution OpenCL image kernel (any
// filter size).
//
// Binds {c_block, w, nh}, the input/filter(/bias) images, the output image
// and the scalar shape parameters (including filter width/height) in the
// exact order expected by the CL kernel, then launches it over the default
// 3-D work size. FP16 image data is addressed as uint16_t (half).
// (Removed dead local bias_buf: bias is always supplied as an image here.)
void ConvImageCompute::DepthwiseConv2d() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  int filter_width = filter_dims[3];
  int filter_height = filter_dims[2];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);
  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  // Distance from the filter center to its edge, shifted by the (symmetric)
  // padding; passed to the kernel as the sampling offset.
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);
  // calc input_c_block: channel blocks of 4 folded into the image width
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;
  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
  VLOG(4) << "============ depthwise conv2d params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";
  // The kernel only receives one stride/padding/dilation value, so the
  // two spatial dimensions must agree.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "w: " << w;
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly; the bias
  // image is only bound when the kernel was built with a bias define.
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  if (has_bias) {
    VLOG(4) << "set bias_image: ";
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_height);
  CL_CHECK_FATAL(status);
  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(default_work_size.data()[1]),
                  static_cast<size_t>(default_work_size.data()[2])};
  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of out_image can wait for completion.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Dispatch to the concrete conv implementation (member-function pointer)
// selected in PrepareForRun().
void ConvImageCompute::Run() {
  (this->*impl_)();
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the image2d-based conv2d kernel: OpenCL target, FP16 precision,
// ImageDefault layout. Filter and Bias remain host (ARM) tensors; they are
// converted to device images by the kernel itself.
REGISTER_LITE_KERNEL(conv2d,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::ConvImageCompute,
                     image2d)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Output",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
// Same compute class also serves depthwise_conv2d; the dispatch between
// dense and depthwise implementations happens inside ConvImageCompute.
REGISTER_LITE_KERNEL(depthwise_conv2d,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::ConvImageCompute,
                     image2d)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Output",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -11,41 +11,50 @@ ...@@ -11,41 +11,50 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
template <PrecisionType Ptype, DataLayoutType layout> class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
class ConcatCompute : public KernelLite<TARGET(kOpenCL), Ptype, layout> { PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ConcatParam; using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override; void PrepareForRun() override;
void Run() override; void Run() override;
std::string doc(); // override; private:
void Conv2d1x1();
// protected: void Conv2d3x3();
// void UpdateParams(); void Conv2d5x5();
void Conv2d7x7();
int axis_size_ = 1; void DepthwiseConv2d3x3s1();
int post_size_ = 1; void DepthwiseConv2d3x3();
int pre_size_ = 1; void DepthwiseConv2d();
int axis_ = 1;
param_t* concat_param_{nullptr}; kernel_t impl_;
std::string kernel_func_name_{}; std::vector<std::string> kernel_func_names_{};
std::string build_options_{"-DCL_DTYPE_float"}; std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
}; };
} // namespace opencl } // namespace opencl
......
...@@ -15,16 +15,17 @@ ...@@ -15,16 +15,17 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <random> #include <random>
#include "lite/backends/opencl/cl_image_converter.h" #include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/target_wrapper.h" #include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
// #define SHADOW_LOG LOG(INFO) // #define SHADOW_LOG LOG(INFO)
#define SHADOW_LOG VLOG(4) #define SHADOW_LOG VLOG(4)
#define FP16_MAX_DIFF (1e0)
template <typename Dtype1, typename Dtype2> template <typename Dtype1, typename Dtype2>
static void conv_basic(const Dtype1* din, static void conv_basic(const Dtype1* din,
...@@ -162,7 +163,7 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -162,7 +163,7 @@ TEST(conv2d, compute_image2d_1x1) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("conv2d", KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -283,13 +284,13 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -283,13 +284,13 @@ TEST(conv2d, compute_image2d_1x1) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ..."; SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v( std::vector<uint16_t> x_image_v(
input_image_width * input_image_height * 4); // 4 : RGBA input_image_width * input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 :RGBA filter_image_width * filter_image_height * 4); // 4 :RGBA
std::vector<float> bias_image_v( std::vector<uint16_t> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v( std::vector<uint16_t> out_image_v(
out_image_width * out_image_height * 4); // 4 : RGBA out_image_width * out_image_height * 4); // 4 : RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
...@@ -300,13 +301,13 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -300,13 +301,13 @@ TEST(conv2d, compute_image2d_1x1) {
nw_convertor.NCHWToImage( nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim); filter_v.data(), filter_image_v.data(), filter_dim);
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm // assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(), filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim); filter_dim);
// auto* filter_image2d = // auto* filter_image2d =
// filter.mutable_data<float, cl::Image2D>( // filter.mutable_data<uint16_t, cl::Image2D>(
// filter_image_width, // filter_image_width,
// filter_image_height, // filter_image_height,
// filter_image_v.data()); // filter_image_v.data());
...@@ -356,11 +357,12 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -356,11 +357,12 @@ TEST(conv2d, compute_image2d_1x1) {
SHADOW_LOG << "kernel launch ..."; SHADOW_LOG << "kernel launch ...";
kernel->Launch(); kernel->Launch();
SHADOW_LOG << "mutable output ..."; SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d =
output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -373,8 +375,9 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -373,8 +375,9 @@ TEST(conv2d, compute_image2d_1x1) {
"cl tensor."; "cl tensor.";
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(
output.data<float, cl::Image2D>(), out_image_v.data(),
output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -425,12 +428,16 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -425,12 +428,16 @@ TEST(conv2d, compute_image2d_1x1) {
static_cast<int64_t>(out_image_height)})}; static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); auto relative_diff =
if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
LOG(FATAL) << "error idx:" << i; EXPECT_LT(relative_diff, FP16_MAX_DIFF);
if (relative_diff > FP16_MAX_DIFF) {
LOG(FATAL) << "error idx:" << i << "output_v[" << i
<< "]:" << output_v[i] << " "
"out_ref_data["
<< i << "]:" << out_ref_data[i];
} }
} }
#ifdef LOOP_TEST #ifdef LOOP_TEST
} }
} }
...@@ -479,12 +486,12 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -479,12 +486,12 @@ TEST(conv2d, compute_image2d_3x3) {
const int oc = 2; const int oc = 2;
#else // big scale with group #else // big scale with group
const int stride = 1; const int stride = 1;
const int group = 32; const int group = 32 / 1;
const int batch_size = 1; const int batch_size = 1;
const int ic = 32; const int ic = 32 / 1;
const int ih = 112; const int ih = 112 / 1;
const int iw = 112; const int iw = 112 / 1;
const int oc = 32; const int oc = 32 / 1;
#endif #endif
const bool bias_flag = false; const bool bias_flag = false;
...@@ -503,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -503,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("conv2d", KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1"; CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1";
...@@ -599,10 +606,10 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -599,10 +606,10 @@ TEST(conv2d, compute_image2d_3x3) {
SHADOW_LOG << "gen input and filter ..."; SHADOW_LOG << "gen input and filter ...";
for (int i = 0; i < input_v.size(); ++i) { for (int i = 0; i < input_v.size(); ++i) {
input_v[i] = i; // gen(engine); input_v[i] = i * 0.001; // gen(engine);
} }
for (int i = 0; i < filter_v.size(); ++i) { for (int i = 0; i < filter_v.size(); ++i) {
filter_v[i] = 1; // gen(engine); filter_v[i] = 1 * 0.001; // gen(engine);
} }
SHADOW_LOG << "after gen input and filter ..."; SHADOW_LOG << "after gen input and filter ...";
...@@ -634,14 +641,14 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -634,14 +641,14 @@ TEST(conv2d, compute_image2d_3x3) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ..."; SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * std::vector<uint16_t> x_image_v(
input_image_height * 4); // 4 :RGBA input_image_width * input_image_height * 4); // 4 :RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v( std::vector<uint16_t> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * std::vector<uint16_t> out_image_v(
out_image_height * 4); // 4 :RGBA out_image_width * out_image_height * 4); // 4 :RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim); input_v.data(), x_image_v.data(), input_dim);
...@@ -666,7 +673,7 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -666,7 +673,7 @@ TEST(conv2d, compute_image2d_3x3) {
for (int i = 0; i < filter_image_v.size(); i++) { for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i]; SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
} }
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm // assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(), filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
...@@ -707,11 +714,11 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -707,11 +714,11 @@ TEST(conv2d, compute_image2d_3x3) {
SHADOW_LOG << "kernel launch ..."; SHADOW_LOG << "kernel launch ...";
kernel->Launch(); kernel->Launch();
SHADOW_LOG << "mutable output ..."; SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -725,7 +732,7 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -725,7 +732,7 @@ TEST(conv2d, compute_image2d_3x3) {
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -793,9 +800,14 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -793,9 +800,14 @@ TEST(conv2d, compute_image2d_3x3) {
#endif #endif
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); auto relative_diff =
if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
LOG(FATAL) << "error idx:" << i; EXPECT_LT(relative_diff, FP16_MAX_DIFF);
if (relative_diff > FP16_MAX_DIFF) {
LOG(FATAL) << "error idx:" << i << "output_v[" << i
<< "]:" << output_v[i] << " "
"out_ref_data["
<< i << "]:" << out_ref_data[i];
} }
} }
...@@ -850,7 +862,7 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -850,7 +862,7 @@ TEST(conv2d, compute_image2d_5x5) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("conv2d", KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -944,10 +956,10 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -944,10 +956,10 @@ TEST(conv2d, compute_image2d_5x5) {
SHADOW_LOG << "gen input and filter ..."; SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) { for (auto& i : input_v) {
i = gen(engine); i = 0.01 * gen(engine);
} }
for (auto& f : filter_v) { for (auto& f : filter_v) {
f = gen(engine); f = 0.01 * gen(engine);
} }
SHADOW_LOG << "after gen input and filter ..."; SHADOW_LOG << "after gen input and filter ...";
...@@ -975,14 +987,14 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -975,14 +987,14 @@ TEST(conv2d, compute_image2d_5x5) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ..."; SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * std::vector<uint16_t> x_image_v(
input_image_height * 4); // 4 :RGBA input_image_width * input_image_height * 4); // 4 :RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v( std::vector<uint16_t> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * std::vector<uint16_t> out_image_v(
out_image_height * 4); // 4 :RGBA out_image_width * out_image_height * 4); // 4 :RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim); input_v.data(), x_image_v.data(), input_dim);
...@@ -1007,7 +1019,7 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1007,7 +1019,7 @@ TEST(conv2d, compute_image2d_5x5) {
for (int i = 0; i < filter_image_v.size(); i++) { for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i]; SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
} }
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm // assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(), filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
...@@ -1048,11 +1060,11 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1048,11 +1060,11 @@ TEST(conv2d, compute_image2d_5x5) {
SHADOW_LOG << "kernel launch ..."; SHADOW_LOG << "kernel launch ...";
kernel->Launch(); kernel->Launch();
SHADOW_LOG << "mutable output ..."; SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -1066,7 +1078,7 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1066,7 +1078,7 @@ TEST(conv2d, compute_image2d_5x5) {
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -1127,9 +1139,14 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1127,9 +1139,14 @@ TEST(conv2d, compute_image2d_5x5) {
static_cast<int64_t>(out_image_height)})}; static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); auto relative_diff =
if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
LOG(FATAL) << "error idx:" << i; EXPECT_LT(relative_diff, FP16_MAX_DIFF);
if (relative_diff > FP16_MAX_DIFF) {
LOG(FATAL) << "error idx:" << i << "output_v[" << i
<< "]:" << output_v[i] << " "
"out_ref_data["
<< i << "]:" << out_ref_data[i];
} }
} }
...@@ -1183,7 +1200,7 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1183,7 +1200,7 @@ TEST(conv2d, compute_image2d_7x7) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("conv2d", KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -1308,14 +1325,14 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1308,14 +1325,14 @@ TEST(conv2d, compute_image2d_7x7) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ..."; SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * std::vector<uint16_t> x_image_v(
input_image_height * 4); // 4 : RGBA input_image_width * input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v( std::vector<uint16_t> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * std::vector<uint16_t> out_image_v(
out_image_height * 4); // 4 : RGBA out_image_width * out_image_height * 4); // 4 : RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim); input_v.data(), x_image_v.data(), input_dim);
...@@ -1340,7 +1357,7 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1340,7 +1357,7 @@ TEST(conv2d, compute_image2d_7x7) {
for (int i = 0; i < filter_image_v.size(); i++) { for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i]; SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
} }
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm // assign filter as target arm
...@@ -1382,11 +1399,11 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1382,11 +1399,11 @@ TEST(conv2d, compute_image2d_7x7) {
SHADOW_LOG << "kernel launch ..."; SHADOW_LOG << "kernel launch ...";
kernel->Launch(); kernel->Launch();
SHADOW_LOG << "mutable output ..."; SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -1400,7 +1417,7 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1400,7 +1417,7 @@ TEST(conv2d, compute_image2d_7x7) {
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -1461,9 +1478,14 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1461,9 +1478,14 @@ TEST(conv2d, compute_image2d_7x7) {
static_cast<int64_t>(out_image_height)})}; static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); auto relative_diff =
if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
LOG(FATAL) << "error idx:" << i; EXPECT_LT(relative_diff, FP16_MAX_DIFF);
if (relative_diff > FP16_MAX_DIFF) {
LOG(FATAL) << "error idx:" << i << "output_v[" << i
<< "]:" << output_v[i] << " "
"out_ref_data["
<< i << "]:" << out_ref_data[i];
} }
} }
...@@ -1485,4 +1507,4 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1485,4 +1507,4 @@ TEST(conv2d, compute_image2d_7x7) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(conv2d, kOpenCL, kFP16, kImageDefault, image2d);
...@@ -105,6 +105,7 @@ int ConvOutputSize(int input_size, ...@@ -105,6 +105,7 @@ int ConvOutputSize(int input_size,
return output_size; return output_size;
} }
// #define LOOP_TEST
TEST(depthwise_conv2d_basic, compute) { TEST(depthwise_conv2d_basic, compute) {
// conv infos // conv infos
// const int ksize = 1; // const int ksize = 1;
...@@ -144,7 +145,7 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -144,7 +145,7 @@ TEST(depthwise_conv2d_basic, compute) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("depthwise_conv2d", KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -252,13 +253,13 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -252,13 +253,13 @@ TEST(depthwise_conv2d_basic, compute) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
VLOG(4) << "set mapped input ..."; VLOG(4) << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * input_image_height * std::vector<uint16_t> x_image_v(input_image_width *
4); // 4 : RGBA input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(bias_image_width * bias_image_height * std::vector<uint16_t> bias_image_v(
4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * out_image_height * std::vector<uint16_t> out_image_v(out_image_width * out_image_height *
4); // 4 : RGBA 4); // 4 : RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
...@@ -269,9 +270,9 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -269,9 +270,9 @@ TEST(depthwise_conv2d_basic, compute) {
nw_convertor.NCHWToImage( nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim); filter_v.data(), filter_image_v.data(), filter_dim);
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
auto* filter_image2d = filter.mutable_data<float, cl::Image2D>( auto* filter_image2d = filter.mutable_data<uint16_t, cl::Image2D>(
filter_image_width, filter_image_height, filter_image_v.data()); filter_image_width, filter_image_height, filter_image_v.data());
if (bias_flag) { if (bias_flag) {
...@@ -284,7 +285,7 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -284,7 +285,7 @@ TEST(depthwise_conv2d_basic, compute) {
CLImageConverterFolder folder_convertor; CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage( folder_convertor.NCHWToImage(
bias_v.data(), bias_image_v.data(), bias_dim); bias_v.data(), bias_image_v.data(), bias_dim);
auto* bias_data = bias.mutable_data<float, cl::Image2D>( auto* bias_data = bias.mutable_data<uint16_t, cl::Image2D>(
bias_image_width, bias_image_height, bias_image_v.data()); bias_image_width, bias_image_height, bias_image_v.data());
} }
...@@ -300,11 +301,11 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -300,11 +301,11 @@ TEST(depthwise_conv2d_basic, compute) {
VLOG(4) << "kernel launch ..."; VLOG(4) << "kernel launch ...";
kernel->Launch(); kernel->Launch();
VLOG(4) << "mutable output ..."; VLOG(4) << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -318,7 +319,7 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -318,7 +319,7 @@ TEST(depthwise_conv2d_basic, compute) {
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -387,7 +388,7 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -387,7 +388,7 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create("depthwise_conv2d", auto kernels = KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -433,11 +434,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -433,11 +434,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
default_converter->InitImageDimInfoWith(input.dims()); default_converter->InitImageDimInfoWith(input.dims());
LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " " LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " "
<< input_image_shape[1]; << input_image_shape[1];
std::vector<float> input_image_data(input_image_shape.production() * std::vector<uint16_t> input_image_data(input_image_shape.production() *
4); // 4 : RGBA 4); // 4 : RGBA
default_converter->NCHWToImage( default_converter->NCHWToImage(
input_v.data(), input_image_data.data(), input.dims()); input_v.data(), input_image_data.data(), input.dims());
auto* input_image = input.mutable_data<int16_t, cl::Image2D>( auto* input_image = input.mutable_data<uint16_t, cl::Image2D>(
input_image_shape[0], input_image_shape[1], input_image_data.data()); input_image_shape[0], input_image_shape[1], input_image_data.data());
LOG(INFO) << "prepare kernel"; LOG(INFO) << "prepare kernel";
...@@ -446,11 +447,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -446,11 +447,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims()); DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims());
LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " " LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " "
<< filter_image_shape[1]; << filter_image_shape[1];
std::vector<float> filter_image_data(filter_image_shape.production() * std::vector<uint16_t> filter_image_data(filter_image_shape.production() *
4); // 4 : RGBA 4); // 4 : RGBA
nw_converter->NCHWToImage( nw_converter->NCHWToImage(
filter_v.data(), filter_image_data.data(), filter.dims()); filter_v.data(), filter_image_data.data(), filter.dims());
auto* filter_image = filter.mutable_data<int16_t, cl::Image2D>( auto* filter_image = filter.mutable_data<uint16_t, cl::Image2D>(
filter_image_shape[0], filter_image_shape[1], filter_image_data.data()); filter_image_shape[0], filter_image_shape[1], filter_image_data.data());
LOG(INFO) << "launch"; LOG(INFO) << "launch";
...@@ -459,13 +460,13 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -459,13 +460,13 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
default_converter->InitImageDimInfoWith(output.dims()); default_converter->InitImageDimInfoWith(output.dims());
LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " " LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " "
<< output_image_shape[1]; << output_image_shape[1];
auto* output_image = output.mutable_data<int16_t, cl::Image2D>( auto* output_image = output.mutable_data<uint16_t, cl::Image2D>(
output_image_shape[0], output_image_shape[1]); output_image_shape[0], output_image_shape[1]);
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<int16_t, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
...@@ -490,7 +491,8 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -490,7 +491,8 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
float* output_image_data = new float[output_image_shape.production() * 4]; uint16_t* output_image_data =
new uint16_t[output_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(output_image_data, TargetWrapperCL::ImgcpySync(output_image_data,
output_image, output_image,
output_image_shape[0], output_image_shape[0],
...@@ -512,4 +514,4 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -512,4 +514,4 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFP16, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/elementwise_add_buffer_compute.h"
#include <memory>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
void ElementwiseAddCompute::PrepareForRun() {
  // Resolve the op parameters first so UpdateParams() can derive the
  // batch/channels/num factors, then build (or fetch from cache) the
  // OpenCL buffer kernel used by Run().
  ele_param_ = param_.get_mutable<param_t>();
  UpdateParams();
  auto& ocl_ctx = ctx_->As<OpenCLContext>();
  ocl_ctx.cl_context()->AddKernel(
      kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
}
// Enqueues the elementwise-add buffer kernel: Out = X + Y (with Y broadcast
// according to batch_/channels_/num_ computed in UpdateParams()).
// NOTE: the setArg order below must match the kernel's parameter list in
// buffer/elementwise_add_kernel.cl exactly -- do not reorder.
void ElementwiseAddCompute::Run() {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  // Device-side buffers; Out is (re)allocated on the OpenCL target.
  auto* x_buf = ele_param_->X->template data<float, cl::Buffer>();
  auto* y_buf = ele_param_->Y->template data<float, cl::Buffer>();
  auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
      TARGET(kOpenCL));
  // Kernel cache key: function name + build options (set in PrepareForRun).
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << TargetToStr(ele_param_->X->target());
  VLOG(4) << TargetToStr(ele_param_->Y->target());
  VLOG(4) << TargetToStr(ele_param_->Out->target());
  // Bind arguments in kernel-signature order: x, y, out, batch, channels, num.
  int arg_idx = 0;
  cl_int status = kernel.setArg(arg_idx, *x_buf);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *y_buf);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *out_buf);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, (const int)batch_);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, (const int)channels_);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, (const int)num_);
  CL_CHECK_FATAL(status);
  // One work-item per (channel, batch) pair; the kernel iterates over the
  // innermost `num_` elements itself.
  auto global_work_size = cl::NDRange{channels_, batch_};
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the completion event so downstream readers of out_buf can sync.
  context.cl_wait_list()->emplace(out_buf, event_);
}
void ElementwiseAddCompute::UpdateParams() {
auto axis = ele_param_->axis;
const auto& x_dims = ele_param_->X->dims();
const auto& y_dims = ele_param_->Y->dims();
const auto& out_dims = ele_param_->Out->dims();
if (axis < 0) {
axis = static_cast<int>(x_dims.size() - y_dims.size());
}
for (int i = 0; i < axis; ++i) {
batch_ *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels_ *= y_dims[i];
}
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// Register the buffer-based fp32 elementwise_add kernel for the OpenCL target
// (NCHW layout); X, Y, and Out all live in OpenCL device memory.
REGISTER_LITE_KERNEL(
    elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
...@@ -49,28 +49,6 @@ class ElementwiseAddCompute ...@@ -49,28 +49,6 @@ class ElementwiseAddCompute
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ElementwiseParam;
void PrepareForRun() override;
void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFloat";
}
protected:
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{" -DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/opencl/elementwise_add_compute.h" #include "lite/kernels/opencl/elementwise_add_image_compute.h"
#include <memory> #include <memory>
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
...@@ -23,80 +23,6 @@ namespace lite { ...@@ -23,80 +23,6 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
/* Buffer */
#if 0
void ElementwiseAddCompute::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
ele_param_ = param_.get_mutable<param_t>();
UpdateParams();
}
void ElementwiseAddCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x_buf = ele_param_->X->template data<float, cl::Buffer>();
auto* y_buf = ele_param_->Y->template data<float, cl::Buffer>();
auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(ele_param_->X->target());
VLOG(4) << TargetToStr(ele_param_->Y->target());
VLOG(4) << TargetToStr(ele_param_->Out->target());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)batch_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)channels_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)num_);
CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{channels_, batch_};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
}
void ElementwiseAddCompute::UpdateParams() {
auto axis = ele_param_->axis;
const auto& x_dims = ele_param_->X->dims();
const auto& y_dims = ele_param_->Y->dims();
const auto& out_dims = ele_param_->Out->dims();
if (axis < 0) {
axis = static_cast<int>(x_dims.size() - y_dims.size());
}
for (int i = 0; i < axis; ++i) {
batch_ *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels_ *= y_dims[i];
}
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
}
#endif
/* Image2D */
void ElementwiseAddImageCompute::PrepareForRun() { void ElementwiseAddImageCompute::PrepareForRun() {
ele_param_ = param_.get_mutable<param_t>(); ele_param_ = param_.get_mutable<param_t>();
auto* x = ele_param_->X; auto* x = ele_param_->X;
...@@ -152,10 +78,10 @@ void ElementwiseAddImageCompute::Run() { ...@@ -152,10 +78,10 @@ void ElementwiseAddImageCompute::Run() {
default_convertor.InitImageDimInfoWith(out->dims()); // w, h default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims()); auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<float, cl::Image2D>(); auto* x_img = x->data<uint16_t, cl::Image2D>();
auto* y_img = y->data<float, cl::Image2D>(); auto* y_img = y->data<uint16_t, cl::Image2D>();
auto* out_img = auto* out_img = out->mutable_data<uint16_t, cl::Image2D>(out_img_shape[0],
out->mutable_data<float, cl::Image2D>(out_img_shape[0], out_img_shape[1]); out_img_shape[1]);
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
...@@ -220,14 +146,7 @@ void ElementwiseAddImageCompute::Run() { ...@@ -220,14 +146,7 @@ void ElementwiseAddImageCompute::Run() {
namespace ocl = paddle::lite::kernels::opencl; namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL( // TODO(ysh329): May need fix.
// elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
// TODO(ysh329): Not fix.
// "Y" may from constant value like conv bias (kARM, need do cl_image_converter // "Y" may from constant value like conv bias (kARM, need do cl_image_converter
// on CPU); // on CPU);
// may from anther branch like "X" (kOpenCL, nothing to do). // may from anther branch like "X" (kOpenCL, nothing to do).
...@@ -235,20 +154,20 @@ namespace ocl = paddle::lite::kernels::opencl; ...@@ -235,20 +154,20 @@ namespace ocl = paddle::lite::kernels::opencl;
// set target of "Y" as kOpenCL temporarily. // set target of "Y" as kOpenCL temporarily.
REGISTER_LITE_KERNEL(elementwise_add, REGISTER_LITE_KERNEL(elementwise_add,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
ocl::ElementwiseAddImageCompute, ocl::ElementwiseAddImageCompute,
def) def)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindInput("Y", .BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -25,25 +24,25 @@ namespace lite { ...@@ -25,25 +24,25 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class ElementwiseMulFloatImageCompute class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL), : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ElementwiseParam; using param_t = operators::ElementwiseParam;
std::string doc() const override {
return "ElementwiseMul using cl::Image2D(ImageDefault/RGBA), kFP32";
}
void PrepareForRun() override; void PrepareForRun() override;
void Run() override; void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFP16";
}
protected: protected:
param_t* ele_param_{nullptr}; param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_mul"}; std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <cmath>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
// Fill x[0..length): with ascending indices when set_value is -1 (the
// default), otherwise with the constant set_value.
template <typename dtype>
void fill_data(dtype *x, const int length, int set_value = -1) {
  // Use a signed index to match `length` and avoid the signed/unsigned
  // comparison the previous size_t loop variable produced.
  if (set_value == -1) {
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = idx;
    }
  } else {
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = set_value;
    }
  }
}
template <typename dtype>
void elementwise_compute_ref(const dtype *x_data,
const dtype *y_data,
dtype *out_data,
const DDim &x_dims,
const DDim &y_dims,
int axis,
const std::string elt_type,
bool use_relu = false) {
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
int batch = 1;
int channels = 1;
int num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
num *= x_dims[i];
}
VLOG(4) << "axis:" << axis;
VLOG(4) << "batch:" << batch;
VLOG(4) << "cahnnels:" << channels;
VLOG(4) << "num:" << num;
// do elementwise add/sub/max/...
if (elt_type == "add" && axis == 1 && y_dims.size() == 1) {
for (int i = 0; i < x_dims.production(); ++i) {
auto w = i % y_dims.production();
out_data[i] = x_data[i] + y_data[w];
}
} else if (elt_type == "add") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype *din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype *dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr + diny_data;
if (use_relu) {
*dout_ptr = std::max(*dout_ptr, static_cast<dtype>(0));
}
dout_ptr++;
din_ptr++;
}
}
}
} else {
LOG(FATAL) << "unsupported Elementwise type: " << elt_type << std::endl;
}
}
// #define PRINT_RESULT
// image
// End-to-end check of the FP16 image2d elementwise_add kernel family:
// build fp16 RGBA images on the host, launch the kernel, download and
// compare against a float CPU reference.
TEST(elementwise_add_image, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img on cpu) -> "
               "elementwise_add(img) -> "
               "layout(img2buf on cpu) "
               "-> host";

  // elementwise_add's 3 kernels selection routing strategy:
  // --------------------------------------------------------
  // 1. elementwise_add: Need y_dim.size() == 4
  // 2. elementwise_add (used by fuse_elementwise_activation op):
  //    Need y_dim.size() == 4 && act_type == "relu"
  // 3. width_add: Need y_dim.size() == 1 && x_dim.size() == 4 && axis == 3
  // 4. channel_add: Need y_dim.size() == 1 && x_dim.size() == 4 && axis == 1

  // dims
  const int n = 1;
  const int c = 3;
  const int h = 2;
  const int w = 2;

  const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
  auto out_dim = x_dim;

  // y_dim / axis / relu_flag: one entry per routing case above.
  std::vector<DDim> y_dim_v{DDim(std::vector<DDim::value_type>{n, c, h, w}),
                            DDim(std::vector<DDim::value_type>{n, c, h, w}),
                            DDim(std::vector<DDim::value_type>{w}),
                            DDim(std::vector<DDim::value_type>{w})};
  std::vector<int> axis_v{-1, -1, 3, 1};
  std::vector<bool> relu_flag_v{false, true, false, false};
  CHECK(y_dim_v.size() == axis_v.size() && axis_v.size() == relu_flag_v.size())
      << "y_dim_v.size() == axis_v.size() == relu_flag_v.size() should be "
         "same, and be corresponding "
         "one by one";

  // start loop
  for (size_t case_idx = 0; case_idx < y_dim_v.size(); ++case_idx) {
    auto y_dim = y_dim_v[case_idx];
    auto axis = axis_v[case_idx];
    auto relu_flag = relu_flag_v[case_idx];
    LOG(INFO) << "================== elementwise_add, case_idx:" << case_idx + 1
              << "/" << y_dim_v.size() << " ===================";
    LOG(INFO) << "x_dim:" << x_dim;
    LOG(INFO) << "y_dim:" << y_dim;
    LOG(INFO) << "out_dim:" << out_dim;
    LOG(INFO) << "axis:" << axis;
    LOG(INFO) << "relu_flag:" << relu_flag;

    // tensor
    VLOG(4) << "set tensors about op param";
    lite::Tensor eleadd_x, eleadd_y, eleadd_out;
    eleadd_x.Resize(x_dim);
    eleadd_y.Resize(y_dim);
    eleadd_out.Resize(out_dim);

    // initialize tensors
    VLOG(4) << "initialize tensors";
    paddle::lite::CLImageConverterDefault default_convertor;

    // x: ascending values converted to an fp16 (uint16_t) RGBA image.
    std::vector<float> x_v(x_dim.production());
    fill_data<float>(x_v.data(), x_v.size());  // fill with index value
    auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim);  // w, h
    auto x_img_w = x_img_shape[0];
    auto x_img_h = x_img_shape[1];
    std::vector<uint16_t> x_img_v(x_img_w * x_img_h * 4);  // 4: RGBA
    default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
    eleadd_x.mutable_data<uint16_t, cl::Image2D>(
        x_img_w, x_img_h, x_img_v.data());

    // y
    std::vector<float> y_v(y_dim.production());
    fill_data<float>(y_v.data(), y_v.size());  // fill with index value
    auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim);  // w, h
    auto y_img_w = y_img_shape[0];
    auto y_img_h = y_img_shape[1];
    std::vector<uint16_t> y_img_v(y_img_shape[0] * y_img_shape[1] *
                                  4);  // 4: RGBA
    default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
    eleadd_y.mutable_data<uint16_t, cl::Image2D>(
        y_img_w, y_img_h, y_img_v.data());

    // out: zero-filled fp16 image plus a float buffer for the NCHW result.
    auto out_img_shape =
        default_convertor.InitImageDimInfoWith(out_dim);  // w, h
    auto out_img_w = out_img_shape[0];
    auto out_img_h = out_img_shape[1];
    eleadd_out.mutable_data<uint16_t, cl::Image2D>(out_img_w, out_img_h);
    std::vector<uint16_t> out_img_v(out_img_w * out_img_h * 4);
    fill_data<uint16_t>(
        out_img_v.data(), out_img_v.size(), 0);  // fill with zero value
    std::vector<float> out_v(out_dim.production());

    // operator param
    // NOTE(review): the ternary below converts
    // FusionElementwiseActivationParam down to its ElementwiseParam base
    // (slicing off act_type) -- presumably the image kernel applies relu
    // via build options rather than the param; confirm.
    operators::FusionElementwiseActivationParam
        fuseEleaddParam;  // enabled if relu_flag is true
    fuseEleaddParam.X = &eleadd_x;
    fuseEleaddParam.Y = &eleadd_y;
    fuseEleaddParam.Out = &eleadd_out;
    fuseEleaddParam.axis = axis;
    fuseEleaddParam.act_type = relu_flag ? "relu" : "";
    operators::ElementwiseParam eleaddParam;
    eleaddParam.X = &eleadd_x;
    eleaddParam.Y = &eleadd_y;
    eleaddParam.Out = &eleadd_out;
    eleaddParam.axis = axis;

    auto op_param = relu_flag ? fuseEleaddParam : eleaddParam;

    // set kernel
    auto eleadd_img_kernels =
        KernelRegistry::Global().Create("elementwise_add",
                                        TARGET(kOpenCL),
                                        PRECISION(kFP16),
                                        DATALAYOUT(kImageDefault));
    ASSERT_FALSE(eleadd_img_kernels.empty());
    auto eleadd_img_kernel = std::move(eleadd_img_kernels.front());
    VLOG(4) << "get eleadd kernel: " << eleadd_img_kernel->doc();

    // set context and kernel args
    VLOG(4) << "set context and kernel args";
    std::unique_ptr<KernelContext> context(new KernelContext);
    context->As<OpenCLContext>().InitOnce();
    eleadd_img_kernel->SetParam(op_param);
    std::unique_ptr<KernelContext> eleadd_img_context(new KernelContext);
    context->As<OpenCLContext>().CopySharedTo(
        &(eleadd_img_context->As<OpenCLContext>()));
    eleadd_img_kernel->SetContext(std::move(eleadd_img_context));

    // run kernel
    VLOG(4) << "run kernel";
    eleadd_img_kernel->Launch();

    // download gpu result to cpu
    const size_t cl_image2d_row_pitch{0};
    const size_t cl_image2d_slice_pitch{0};
    TargetWrapperCL::ImgcpySync(out_img_v.data(),
                                eleadd_out.data<uint16_t, cl::Image2D>(),
                                out_img_w,
                                out_img_h,
                                cl_image2d_row_pitch,
                                cl_image2d_slice_pitch,
                                IoDirection::DtoH);
    default_convertor.ImageToNCHW(
        out_img_v.data(), out_v.data(), out_img_shape, out_dim);

    // compute cpu reference
    std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
    elementwise_compute_ref<float>(x_v.data(),
                                   y_v.data(),
                                   out_ref.get(),
                                   x_dim,
                                   y_dim,
                                   op_param.axis,
                                   "add",
                                   relu_flag);

#ifdef PRINT_RESULT  // enable to check value of x and y
    for (int eidx = 0; eidx < out_dim.production(); eidx++) {
      auto value = out_v[eidx];
      auto ref_value = out_ref.get()[eidx];
      LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
                << out_dim.production() << ", x_v[" << eidx << "]:" << x_v[eidx]
                << ", value[" << eidx << "]:" << value << ", ref_value[" << eidx
                << "]:" << ref_value;
    }
    for (int i = 0; i < y_v.size(); i++) {
      LOG(INFO) << "y_v[" << i << "]:" << y_v[i];
    }
#endif

    // Compare. std::abs keeps the diff in float arithmetic: the previous
    // unqualified abs() could bind to the int overload and truncate any
    // sub-1.0 mismatch to zero, silencing the diagnostic below.
    for (int eidx = 0; eidx < out_dim.production(); eidx++) {
      auto value = out_v[eidx];
      auto ref_value = out_ref.get()[eidx];
      EXPECT_NEAR(value, ref_value, 1e-6);
      if (std::abs(value - ref_value) > 1e-6) {
        LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
                  << out_dim.production() << ", value[" << eidx << "]:" << value
                  << ", ref_value[" << eidx << "]:" << ref_value;
        break;
      }
    }
  }
}
} // namespace lite
} // namespace paddle
// Pull in the FP16 image kernels exercised by this test.
USE_LITE_KERNEL(elementwise_add, kOpenCL, kFP16, kImageDefault, def);
USE_LITE_KERNEL(
    fusion_elementwise_add_activation, kOpenCL, kFP16, kImageDefault, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// FP16 elementwise multiply on cl::Image2D (ImageDefault/RGBA layout).
// PrepareForRun() routes to a specialized OpenCL kernel according to how
// Y broadcasts against X; Run() binds the images and launches one
// work-item per pixel of the X-sized image.
class ElementwiseMulImageCompute
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ElementwiseParam;

  std::string doc() const override {
    // Fixed: the kernel is built with -DCL_DTYPE_half / PRECISION(kFP16);
    // the old string wrongly claimed kFP32.
    return "ElementwiseMul using cl::Image2D(ImageDefault/RGBA), kFP16";
  }

  void PrepareForRun() override {
    ele_param_ = param_.get_mutable<param_t>();
    auto* y = ele_param_->Y;
    auto* x = ele_param_->X;
    auto y_dims = y->dims();
    auto x_dims = x->dims();
    // Kernel routing by Y's shape:
    //   same shape as X          -> elementwise_mul
    //   1-D Y                    -> channel_mul_d1
    //   2-D Y matching X's N,C   -> channel_mul_d2_nc
    //   2-D Y otherwise (H,W)    -> channel_mul_d2_hw
    //   4-D Y                    -> channel_mul_d4
    if (y_dims == x_dims) {
      kernel_func_name_ = "elementwise_mul";
    } else if (y_dims.size() == 1) {
      kernel_func_name_ = "channel_mul_d1";
    } else if (y_dims.size() == 2) {
      if (x_dims[0] == y_dims[0] && x_dims[1] == y_dims[1]) {
        kernel_func_name_ = "channel_mul_d2_nc";
      } else {
        kernel_func_name_ = "channel_mul_d2_hw";
      }
    } else if (y_dims.size() == 4) {
      kernel_func_name_ = "channel_mul_d4";
    } else {
      LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
                 << y_dims.size()
                 << ", x_dims.size():" << ele_param_->X->dims().size();
    }
    VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
    VLOG(4) << "y_dims:" << y_dims;
    VLOG(4) << "y_dims.size():" << y_dims.size();

    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/elementwise_mul_kernel.cl", build_options_);
  }

  void Run() override {
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto* x = ele_param_->X;
    auto* y = ele_param_->Y;
    auto* out = ele_param_->Out;

    VLOG(4) << "x->target():" << TargetToStr(x->target());
    VLOG(4) << "y->target():" << TargetToStr(y->target());
    VLOG(4) << "out->target():" << TargetToStr(out->target());
    VLOG(4) << "x->dims():" << x->dims();
    VLOG(4) << "y->dims():" << y->dims();
    VLOG(4) << "out->dims():" << out->dims();

    paddle::lite::CLImageConverterDefault default_convertor;
    auto x_img_shape =
        default_convertor.InitImageDimInfoWith(x->dims());  // w, h
    auto x_img_width = x_img_shape[0];
    auto x_img_height = x_img_shape[1];
    auto out_img_shape =
        default_convertor.InitImageDimInfoWith(out->dims());  // w, h
    auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());

    // FP16 kernels store image texels as half; host side views them as
    // uint16_t.
    auto* x_img = x->data<uint16_t, cl::Image2D>();
    auto* y_img = y->data<uint16_t, cl::Image2D>();
    auto* out_img = out->mutable_data<uint16_t, cl::Image2D>(out_img_shape[0],
                                                             out_img_shape[1]);

    VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
    VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
    VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
            << out_img_shape[1];

    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    int arg_idx = 0;
    auto y_dims = y->dims();
    auto x_dims = x->dims();
    if (y_dims == x_dims) {
      // kernel: elementwise_mul(channel_mul_d4)
      cl_int status = kernel.setArg(arg_idx, *x_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *y_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_img);
      CL_CHECK_FATAL(status);
    } else if (y_dims.size() == 1 || y_dims.size() == 4) {
      auto tensor_w = x_dims[x_dims.size() - 1];
      VLOG(4) << "tensor_w:" << tensor_w;
      // kernel: channel_mul_d1 / channel_mul_d4
      cl_int status = kernel.setArg(arg_idx, *x_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *y_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
      CL_CHECK_FATAL(status);
    } else if (y_dims.size() == 2) {
      if (x_dims[0] == y_dims[0] && x_dims[1] == y_dims[1]) {
        auto tensor_w = x_dims[x_dims.size() - 1];
        VLOG(4) << "tensor_w:" << tensor_w;
        // kernel: channel_mul_d2_nc
        cl_int status = kernel.setArg(arg_idx, *x_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *y_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
        CL_CHECK_FATAL(status);
      } else {
        auto y_tensor_h = y->dims()[0];
        auto y_tensor_w = y->dims()[1];
        VLOG(4) << "y_tensor_w:" << y_tensor_w << " y_tensor_h:" << y_tensor_h;
        // kernel: channel_mul_d2_hw
        cl_int status = kernel.setArg(arg_idx, *x_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *y_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_w));
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_h));
        CL_CHECK_FATAL(status);
      }
    } else {
      LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
                 << y_dims.size();
    }

    // One work-item per pixel of the X-sized image.
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(x_img_width),
                    static_cast<cl::size_type>(x_img_height)};
    auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Asynchronous launch: the event is resolved when out_img is consumed.
    context.cl_wait_list()->emplace(out_img, event_);
    VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
  }

 protected:
  param_t* ele_param_{nullptr};
  // Overwritten in PrepareForRun() with the routed kernel name.
  std::string kernel_func_name_{"elementwise_mul"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// Register the FP16 image kernel for elementwise_mul: X, Y and Out all
// live on OpenCL as kFP16 ImageDefault (RGBA) images.
REGISTER_LITE_KERNEL(elementwise_mul,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     ocl::ElementwiseMulImageCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("Y",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -111,7 +111,7 @@ void elementwise_compute_ref(const dtype *x_data, ...@@ -111,7 +111,7 @@ void elementwise_compute_ref(const dtype *x_data,
} }
// #define PRINT_RESULT // #define PRINT_RESULT
TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { TEST(elementwise_mul_image, compute) {
LOG(INFO) LOG(INFO)
<< "main steps of test: host -> layout(buf2img on cpu) -> elemul(img) -> " << "main steps of test: host -> layout(buf2img on cpu) -> elemul(img) -> "
"layout(img2buf on cpu) " "layout(img2buf on cpu) "
...@@ -151,9 +151,10 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -151,9 +151,10 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h
auto x_img_w = x_img_shape[0]; auto x_img_w = x_img_shape[0];
auto x_img_h = x_img_shape[1]; auto x_img_h = x_img_shape[1];
std::vector<float> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA std::vector<uint16_t> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA
default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim); default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
elemul_x.mutable_data<float, cl::Image2D>(x_img_w, x_img_h, x_img_v.data()); elemul_x.mutable_data<uint16_t, cl::Image2D>(
x_img_w, x_img_h, x_img_v.data());
// y // y
std::vector<float> y_v(y_dim.production()); std::vector<float> y_v(y_dim.production());
...@@ -161,19 +162,21 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -161,19 +162,21 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h
auto y_img_w = y_img_shape[0]; auto y_img_w = y_img_shape[0];
auto y_img_h = y_img_shape[1]; auto y_img_h = y_img_shape[1];
std::vector<float> y_img_v(y_img_shape[0] * y_img_shape[1] * 4); // 4: RGBA std::vector<uint16_t> y_img_v(y_img_shape[0] * y_img_shape[1] *
4); // 4: RGBA
default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim); default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
elemul_y.mutable_data<float, cl::Image2D>(y_img_w, y_img_h, y_img_v.data()); elemul_y.mutable_data<uint16_t, cl::Image2D>(
y_img_w, y_img_h, y_img_v.data());
// out // out
auto out_img_shape = auto out_img_shape =
default_convertor.InitImageDimInfoWith(out_dim); // w, h default_convertor.InitImageDimInfoWith(out_dim); // w, h
auto out_img_w = out_img_shape[0]; auto out_img_w = out_img_shape[0];
auto out_img_h = out_img_shape[1]; auto out_img_h = out_img_shape[1];
elemul_out.mutable_data<float, cl::Image2D>(out_img_w, out_img_h); elemul_out.mutable_data<uint16_t, cl::Image2D>(out_img_w, out_img_h);
std::vector<float> out_img_v(out_img_w * out_img_h * 4); std::vector<uint16_t> out_img_v(out_img_w * out_img_h * 4);
fill_data<float>( fill_data<uint16_t>(
out_img_v.data(), out_img_v.size(), 0); // fill with zero value out_img_v.data(), out_img_v.size(), 0); // fill with zero value
std::vector<float> out_v(out_dim.production()); std::vector<float> out_v(out_dim.production());
...@@ -189,7 +192,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -189,7 +192,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto elemul_img_kernels = auto elemul_img_kernels =
KernelRegistry::Global().Create("elementwise_mul", KernelRegistry::Global().Create("elementwise_mul",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(elemul_img_kernels.empty()); ASSERT_FALSE(elemul_img_kernels.empty());
...@@ -215,7 +218,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -215,7 +218,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(out_img_v.data(), TargetWrapperCL::ImgcpySync(out_img_v.data(),
elemul_out.data<float, cl::Image2D>(), elemul_out.data<uint16_t, cl::Image2D>(),
out_img_w, out_img_w,
out_img_h, out_img_h,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -266,4 +269,4 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -266,4 +269,4 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFloat, kImageDefault, def); USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFP16, kImageDefault, def);
...@@ -66,8 +66,6 @@ void PrintData(std::string name, float* a, const int rows, const int cols) { ...@@ -66,8 +66,6 @@ void PrintData(std::string name, float* a, const int rows, const int cols) {
} }
} }
// buffer
#if 0 // fc_buffer
// #define PRINT_RESULT // #define PRINT_RESULT
#define LOOP_TEST #define LOOP_TEST
TEST(fc, compute) { TEST(fc, compute) {
...@@ -195,9 +193,8 @@ TEST(fc, compute) { ...@@ -195,9 +193,8 @@ TEST(fc, compute) {
} // m } // m
#endif #endif
} }
#endif // fc_buffer
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def); USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/elementwise_add_buffer_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Buffer-based fused elementwise_add + activation. Reuses the plain
// ElementwiseAddCompute and only switches the kernel build options so the
// OpenCL kernel applies relu in the same pass.
class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
 public:
  using param_t = operators::FusionElementwiseActivationParam;

  // Validates the activation type, then compiles the buffer kernel with
  // -DRELU and derives the broadcast shape via UpdateParams().
  void PrepareForRun() override {
    ele_param_ = param_.get_mutable<param_t>();
    auto act_t = static_cast<param_t*>(ele_param_)->act_type;
    VLOG(4) << "act: " << act_t;
    // Fail fast *before* compiling the kernel (the original checked only
    // after AddKernel, wasting a kernel build on an unsupported fusion).
    if (act_t != "relu") {
      LOG(FATAL) << "Unsupported Activation type: " << act_t;
    }
    build_options_ += " -DRELU";
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
    UpdateParams();
  }
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// Register the buffer (kFloat/kNCHW) fused elementwise_add+activation
// kernel; X, Y and Out are plain OpenCL buffer tensors.
REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     ocl::FusionElementwiseAddActivationCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
...@@ -14,35 +14,13 @@ ...@@ -14,35 +14,13 @@
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/opencl/elementwise_add_compute.h" #include "lite/kernels/opencl/elementwise_add_image_compute.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
/* Buffer */
#if 0
class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
public:
using param_t = operators::FusionElementwiseActivationParam;
void PrepareForRun() override {
build_options_ += " -DRELU";
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
ele_param_ = param_.get_mutable<param_t>();
UpdateParams();
auto act_t = static_cast<param_t*>(ele_param_)->act_type;
VLOG(4) << "act: " << act_t;
if (act_t != "relu") {
LOG(FATAL) << "Unsupported Activation type: " << act_t;
}
}
};
#endif
class FusionElementwiseAddActivationImageCompute class FusionElementwiseAddActivationImageCompute
: public ElementwiseAddImageCompute { : public ElementwiseAddImageCompute {
public: public:
...@@ -68,33 +46,23 @@ class FusionElementwiseAddActivationImageCompute ...@@ -68,33 +46,23 @@ class FusionElementwiseAddActivationImageCompute
} // namespace paddle } // namespace paddle
namespace ocl = paddle::lite::kernels::opencl; namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
// kOpenCL,
// kFloat,
// kNCHW,
// ocl::FusionElementwiseAddActivationCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
ocl::FusionElementwiseAddActivationImageCompute, ocl::FusionElementwiseAddActivationImageCompute,
def) def)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindInput("Y", .BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -47,7 +47,7 @@ class LayoutComputeBufferChwToImageDefault ...@@ -47,7 +47,7 @@ class LayoutComputeBufferChwToImageDefault
auto* x_data = param.x->data<float, cl::Buffer>(); auto* x_data = param.x->data<float, cl::Buffer>();
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* y_data = param.y->mutable_data<float, cl::Image2D>( auto* y_data = param.y->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
auto y_dims = param.y->dims(); auto y_dims = param.y->dims();
...@@ -63,6 +63,8 @@ class LayoutComputeBufferChwToImageDefault ...@@ -63,6 +63,8 @@ class LayoutComputeBufferChwToImageDefault
const int Stride1 = out_H * out_W; const int Stride1 = out_H * out_W;
const int Stride0 = out_W; const int Stride0 = out_W;
VLOG(4) << "y image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
...@@ -121,12 +123,12 @@ class LayoutComputeBufferChwToImageDefault ...@@ -121,12 +123,12 @@ class LayoutComputeBufferChwToImageDefault
std::string doc() const override { std::string doc() const override {
return "Trans Layout from cl::Buffer(NCHW) to " return "Trans Layout from cl::Buffer(NCHW) to "
"cl::Image2D(ImageDefault/RGBA)"; "cl::Image2D(ImageDefault/RGBA), Float ---> FP16";
} }
private: private:
std::string kernel_func_name_{"buffer_to_image2d"}; std::string kernel_func_name_{"buffer_to_image2d"};
std::string build_options_{"-DCL_DTYPE_float "}; std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -144,16 +146,19 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -144,16 +146,19 @@ class LayoutComputeImageDefaultToBufferChw
void Run() override { void Run() override {
auto& param = Param<param_t>(); auto& param = Param<param_t>();
auto* x_data = param.x->data<uint16_t, cl::Image2D>();
auto x_dims = param.x->dims();
auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto y_dims = param.y->dims(); auto y_dims = param.y->dims();
auto* x_data = param.x->data<float, cl::Image2D>(); auto x_image_shape = InitImageDimInfoWith(x_dims);
auto x_dims = param.x->dims();
std::vector<size_t> new_dims = {1, 1, 1, 1}; std::vector<size_t> new_dims = {1, 1, 1, 1};
for (int j = 0; j < x_dims.size(); ++j) { for (int j = 0; j < x_dims.size(); ++j) {
new_dims[4 - x_dims.size() + j] = x_dims[j]; new_dims[4 - x_dims.size() + j] = x_dims[j];
} }
VLOG(4) << "x_image_shape(w,h):" << x_image_shape["width"] << " "
<< x_image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
...@@ -212,7 +217,7 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -212,7 +217,7 @@ class LayoutComputeImageDefaultToBufferChw
std::string doc() const override { std::string doc() const override {
return "Trans Layout from cl::Image2D(ImageDefault/RGBA) to " return "Trans Layout from cl::Image2D(ImageDefault/RGBA) to "
"cl::Buffer(NCHW)"; "cl::Buffer(NCHW), FP16 ---> Float";
} }
private: private:
...@@ -340,23 +345,6 @@ REGISTER_LITE_KERNEL( ...@@ -340,23 +345,6 @@ REGISTER_LITE_KERNEL(
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kAny,
kImageDefault,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageDefault,
NCHW_to_ImageDefault)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault))})
.Finalize();
// [ImageDefault] -> [NCHW] // [ImageDefault] -> [NCHW]
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
layout, layout,
...@@ -374,38 +362,3 @@ REGISTER_LITE_KERNEL( ...@@ -374,38 +362,3 @@ REGISTER_LITE_KERNEL(
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kNCHW))}) DATALAYOUT(kNCHW))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kAny,
kNCHW,
paddle::lite::kernels::opencl::LayoutComputeImageDefaultToBufferChw,
ImageDefault_to_NCHW)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.Finalize();
// [NCHW] -> [ImageNW]
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kFloat,
kImageNW,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DNw,
NCHW_to_ImageNW)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageNW))})
.Finalize();
...@@ -29,15 +29,15 @@ TEST(layout_ImageDefault, compute) { ...@@ -29,15 +29,15 @@ TEST(layout_ImageDefault, compute) {
"-> device"; "-> device";
#ifdef LOOP_TEST #ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 21) { for (int n = 1; n <= 2; n += 1) {
for (auto c : {1, 3}) { for (auto c : {1, 3}) {
for (int h = 1; h <= 100; h += 13) { for (int h = 1; h <= 10; h += 1) {
for (int w = 1; w <= 100; w += 17) { for (int w = 1; w <= 10; w += 1) {
#else #else
const int n = 2; const int n = 1;
const int c = 9; const int c = 2;
const int h = 20; const int h = 3;
const int w = 5; const int w = 4;
#endif // LOOP_TEST #endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
...@@ -79,14 +79,14 @@ TEST(layout_ImageDefault, compute) { ...@@ -79,14 +79,14 @@ TEST(layout_ImageDefault, compute) {
auto* y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); auto* y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto image_shape = auto image_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim); paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
auto* y_image_data = y_image.mutable_data<float, cl::Image2D>( auto* y_image_data = y_image.mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
auto* mapped_x = static_cast<float*>(TargetWrapperCL::Map( auto* mapped_x = static_cast<float*>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production())); x_data, 0, sizeof(float) * x_dim.production()));
auto* mapped_y = static_cast<float*>(TargetWrapperCL::Map( auto* mapped_y = static_cast<float*>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production())); y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) { for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<float>(i); mapped_x[i] = static_cast<float>(i) * 2;
} }
// set context and kernel args // set context and kernel args
...@@ -116,15 +116,16 @@ TEST(layout_ImageDefault, compute) { ...@@ -116,15 +116,16 @@ TEST(layout_ImageDefault, compute) {
#ifdef PRINT_RESULT #ifdef PRINT_RESULT
LOG(INFO) << "---- print result ----"; LOG(INFO) << "---- print result ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] std::cout << mapped_x[eidx] << " -> "
<< std::endl; << static_cast<float>(mapped_y[eidx]) << std::endl;
} }
#endif // PRINT_RESULT #endif // PRINT_RESULT
// check result: compare input and output // check result: compare input and output
float MAX_PASS_DIFF = 1e-4;
for (int eidx = 0; eidx < x_dim.production(); eidx++) { for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], 1e-6); EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], MAX_PASS_DIFF);
if (abs(mapped_x[eidx] - mapped_y[eidx]) > 1e-6) { if (abs(mapped_x[eidx] - mapped_y[eidx]) > MAX_PASS_DIFF) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", mapped_x[" << eidx << " / " << x_dim.production() << ", mapped_x[" << eidx
<< "]:" << mapped_x[eidx] << ", mapped_y[" << eidx << "]:" << mapped_x[eidx] << ", mapped_y[" << eidx
...@@ -147,6 +148,7 @@ TEST(layout_ImageDefault, compute) { ...@@ -147,6 +148,7 @@ TEST(layout_ImageDefault, compute) {
#endif #endif
} }
#if 0
TEST(layout_ImageNW, compute) { TEST(layout_ImageNW, compute) {
#ifdef LOOP_TEST #ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 21) { for (int n = 1; n <= 100; n += 21) {
...@@ -282,9 +284,11 @@ TEST(layout_ImageNW, compute) { ...@@ -282,9 +284,11 @@ TEST(layout_ImageNW, compute) {
// nothing to do. // nothing to do.
#endif #endif
} }
#endif
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(layout_once, kOpenCL, kFloat, kImageNW, NCHW_to_ImageNW); // USE_LITE_KERNEL(layout_once, kOpenCL, kFloat, kImageNW, NCHW_to_ImageNW);
...@@ -102,7 +102,7 @@ class MulCompute ...@@ -102,7 +102,7 @@ class MulCompute
private: private:
int m_, n_, k_; int m_, n_, k_;
std::string kernel_func_name_{"mat_mul"}; std::string kernel_func_name_{"mat_mul"};
std::string build_options_{"-DCL_DTYPE=float"}; std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -24,90 +24,7 @@ namespace lite { ...@@ -24,90 +24,7 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class NearestInterpComputeFloatImageDefault class NearestInterpComputeImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::InterpolateParam;
std::string doc() const override {
return "NearestInterp using cl::Image2D(ImageDefault/RGBA), kFloat";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<float, cl::Image2D>();
auto* out_buf =
param.Out->mutable_data<float, cl::Image2D>(param.out_w, param.out_h);
const auto& y_dims = param.Out->dims(); // useless: check dim only
float scale_h = y_dims[2] / x_dims[2];
float scale_w = y_dims[3] / x_dims[3];
int in_dims_h = x_dims[2];
int out_dims_h = y_dims[2];
int in_dims_w = x_dims[3];
int out_dims_w = y_dims[3];
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
CL_CHECK_FATAL(status);
paddle::lite::CLImageConverterDefault default_convertor;
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dims); // w, h
auto y_img_width = y_img_shape[0];
LOG(INFO) << "y_img_width:" << y_img_width;
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(y_img_width / y_dims[3]),
static_cast<cl::size_type>(y_dims[3]),
static_cast<cl::size_type>(y_dims[0] * y_dims[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
}
private:
std::string kernel_func_name_{"nearest_interp"};
std::string build_options_{"-DCL_DTYPE_float "};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class NearestInterpComputeFP16ImageDefault
: public KernelLite<TARGET(kOpenCL), : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
...@@ -128,11 +45,11 @@ class NearestInterpComputeFP16ImageDefault ...@@ -128,11 +45,11 @@ class NearestInterpComputeFP16ImageDefault
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = auto* x_buf =
param.X->data<int16_t, param.X->data<uint16_t,
cl::Image2D>(); // use int16_t represents half float cl::Image2D>(); // use uint16_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = auto* out_buf =
param.Out->mutable_data<int16_t, cl::Image2D>( // use int16_t param.Out->mutable_data<uint16_t, cl::Image2D>( // use uint16_t
// represents half float // represents half float
image_shape["width"], image_shape["width"],
image_shape["height"]); image_shape["height"]);
...@@ -204,29 +121,12 @@ class NearestInterpComputeFP16ImageDefault ...@@ -204,29 +121,12 @@ class NearestInterpComputeFP16ImageDefault
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_LITE_KERNEL(
nearest_interp,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::NearestInterpComputeFloatImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
nearest_interp, nearest_interp,
kOpenCL, kOpenCL,
kFP16, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::NearestInterpComputeFP16ImageDefault, paddle::lite::kernels::opencl::NearestInterpComputeImageDefault,
ImageDefault) ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
......
...@@ -60,7 +60,7 @@ void nearest_interp_compute_ref(const dtype *src, ...@@ -60,7 +60,7 @@ void nearest_interp_compute_ref(const dtype *src,
} }
// #define LOOP_TEST // #define LOOP_TEST
// #define PRINT_RESULT // #define PRINT_RESULT
TEST(nearest_interp_image2d_fp32, compute) { TEST(nearest_interp_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> "
"nearest_interp(img) -> " "nearest_interp(img) -> "
"layout(img2buf) " "layout(img2buf) "
...@@ -105,7 +105,7 @@ TEST(nearest_interp_image2d_fp32, compute) { ...@@ -105,7 +105,7 @@ TEST(nearest_interp_image2d_fp32, compute) {
auto nearest_interp_img_kernels = auto nearest_interp_img_kernels =
KernelRegistry::Global().Create("nearest_interp", KernelRegistry::Global().Create("nearest_interp",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
...@@ -166,12 +166,12 @@ TEST(nearest_interp_image2d_fp32, compute) { ...@@ -166,12 +166,12 @@ TEST(nearest_interp_image2d_fp32, compute) {
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *nearest_interp_in_data = auto *nearest_interp_in_data =
nearest_interp_in.mutable_data<float, cl::Image2D>( nearest_interp_in.mutable_data<uint16_t, cl::Image2D>(
nearest_interp_image2d_shape["width"], nearest_interp_image2d_shape["width"],
nearest_interp_image2d_shape["height"]); nearest_interp_image2d_shape["height"]);
auto *nearest_interp_out_data = auto *nearest_interp_out_data =
nearest_interp_out.mutable_data<float, cl::Image2D>(y_dim[3], nearest_interp_out.mutable_data<uint16_t, cl::Image2D>(
y_dim[2]); y_dim[3], y_dim[2]);
// set context and kernel args // set context and kernel args
LOG(INFO) << "set context and kernel args"; LOG(INFO) << "set context and kernel args";
...@@ -273,13 +273,9 @@ TEST(nearest_interp_image2d_fp32, compute) { ...@@ -273,13 +273,9 @@ TEST(nearest_interp_image2d_fp32, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// nearest_interp buffer
// USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kNCHW, def);
// nearest_interp image2d fp32 // nearest_interp image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kImageDefault, ImageDefault);
// nearest_interp image2d fp16 // nearest_interp image2d fp16
USE_LITE_KERNEL(nearest_interp, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(nearest_interp, kOpenCL, kFP16, kImageDefault, ImageDefault);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Pooling (max/avg) OpenCL kernel operating on cl::Buffer tensors laid out
// as NCHW float32. The GPU kernel function is selected at PrepareForRun time
// by appending the pooling type ("max"/"avg") to the "pool_" prefix.
class PoolCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::PoolParam;

  std::string doc() const override { return "Pool using cl::Buffer, kFloat"; }

  // Compiles/loads the OpenCL program once. kernel_func_name_ becomes e.g.
  // "pool_avg" or "pool_max" depending on the op attribute.
  void PrepareForRun() override {
    const auto& param = *param_.get_mutable<param_t>();
    kernel_func_name_ += param.pooling_type;
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/pool_kernel.cl", build_options_);
  }

  // Sets kernel arguments from PoolParam and enqueues one work-item per
  // output element. Completion is tracked via event_ in the cl_wait_list.
  void Run() override {
    const auto& param = *param_.get_mutable<param_t>();
    const auto& in_dims = param.x->dims();
    const auto& out_dims = param.output->dims();
    const std::string pooling_type = param.pooling_type;
    const bool global_pooling = param.global_pooling;
    std::vector<int> paddings = *param.paddings;
    std::vector<int> strides = param.strides;
    std::vector<int> ksize = param.ksize;
    // Global pooling covers the whole spatial extent: zero the paddings and
    // widen the kernel window to the full input H/W (dims 2 and 3 of NCHW).
    if (global_pooling) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[2 * i] = 0;
        paddings[2 * i + 1] = 0;
        ksize[i] = static_cast<int>(in_dims[i + 2]);
      }
    }
    // The .cl kernel only receives one pad value per axis, so asymmetric
    // padding is unsupported here.
    bool pads_equal =
        (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);
    if (!pads_equal) {
      LOG(FATAL)
          << "padding requires pad_left == pad_right, pad_top == pad_bottom";
    }
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    auto* input_buf = param.x->data<float, cl::Buffer>();
    auto* output_buf =
        param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    cl_int status;
    auto numel = out_dims.production();
    // NOTE: the setArg order below must match the parameter order of the
    // pool_* function in buffer/pool_kernel.cl exactly — do not reorder.
    int arg_idx = 0;
    status = kernel.setArg(arg_idx, static_cast<const int>(numel));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *input_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[1]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[2]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[3]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[2]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[3]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[0]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[1]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(strides[1]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[2]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *output_buf);
    CL_CHECK_FATAL(status);
    // One work-item per output element; local size left to the runtime.
    auto global_work_size = cl::NDRange(static_cast<size_t>(numel));
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Consumers of output_buf look the event up here and wait on it.
    context.cl_wait_list()->emplace(output_buf, event_);
  }

 private:
  std::string kernel_func_name_{"pool_"};  // completed in PrepareForRun
  std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the buffer-based float32 pool2d kernel under the "def" alias so
// the framework can pick it for kOpenCL/kFloat/kNCHW tensors.
REGISTER_LITE_KERNEL(pool2d,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::opencl::PoolCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
// CPU reference implementation of average pooling over NCHW float data,
// used to validate the OpenCL kernel. Window cells that fall outside the
// input are excluded from the average (exclusive counting); an empty
// window produces 0.
void pool_avg(const int padding_height,
              const int padding_width,
              const int stride_height,
              const int stride_width,
              const int ksize_height,
              const int ksize_width,
              const float* input_data,
              const DDim& in_dim,
              float* output_data,
              const DDim& out_dim) {
  const int num = in_dim[0];
  const int in_h = in_dim[2];
  const int in_w = in_dim[3];
  const int out_c = out_dim[1];
  const int out_h = out_dim[2];
  const int out_w = out_dim[3];

  const size_t in_plane = in_h * in_w;
  const size_t out_plane = out_h * out_w;

  for (int n = 0; n < num; ++n) {
    for (int oc = 0; oc < out_c; ++oc) {
      // Flattened (batch, channel) index selects one spatial plane.
      const int plane_idx = n * out_c + oc;
      const float* src = input_data + plane_idx * in_plane;
      float* dst = output_data + plane_idx * out_plane;
      for (int oh = 0; oh < out_h; ++oh) {
        const int h_begin_raw = oh * stride_height - padding_height;
        const int h_end = std::min(h_begin_raw + ksize_height, in_h);
        const int h_begin = std::max(h_begin_raw, 0);
        for (int ow = 0; ow < out_w; ++ow) {
          const int w_begin_raw = ow * stride_width - padding_width;
          const int w_end = std::min(w_begin_raw + ksize_width, in_w);
          const int w_begin = std::max(w_begin_raw, 0);
          float sum = 0.f;
          int cnt = 0;
          for (int ih = h_begin; ih < h_end; ++ih) {
            for (int iw = w_begin; iw < w_end; ++iw) {
              sum += src[ih * in_w + iw];
              ++cnt;
            }
          }
          // Multiply by the reciprocal (matches the GPU kernel's math).
          dst[oh * out_w + ow] = (cnt > 0) ? sum * (1.f / cnt) : 0.f;
        }
      }
    }
  }
}
// End-to-end check of the buffer-based pool2d OpenCL kernel: runs a global
// 7x7 average pool on random data and compares every output element against
// the CPU reference pool_avg().
TEST(pool2d_buffer_fp32, compute) {
  LOG(INFO) << "to get kernel ...";
  auto kernels = KernelRegistry::Global().Create(
      "pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());

  auto kernel = std::move(kernels.front());
  LOG(INFO) << "get kernel:" << kernel->doc();

  // Global average pooling: ksize 7x7 matches the 7x7 spatial input below.
  lite::Tensor x, out;
  operators::PoolParam param;
  param.x = &x;
  param.output = &out;
  param.global_pooling = true;
  param.pooling_type = "avg";
  std::vector<int> paddings = {0, 0, 0, 0};
  param.strides = std::vector<int>{1, 1};
  param.ksize = std::vector<int>{7, 7};
  param.paddings = std::make_shared<std::vector<int>>(paddings);

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  kernel->SetParam(param);
  // The kernel gets its own context sharing the same OpenCL resources.
  std::unique_ptr<KernelContext> pool_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(pool_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(pool_context));

  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 1024, 7, 7});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 1024, 1, 1});
  x.Resize(in_dim);
  out.Resize(out_dim);

  // Fill the device input buffer with uniform random values via Map/Unmap.
  auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-5, 5);
  auto* mapped_x = static_cast<float*>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * in_dim.production()));
  for (int i = 0; i < in_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }

  kernel->Launch();

  // Block until the kernel's completion event (registered under the output
  // buffer pointer in the wait list) has fired.
  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto* out_ptr = param.output->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto& event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // CPU reference with the same effective attributes (no padding, stride 1).
  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
  pool_avg(0, 0, 1, 1, 7, 7, mapped_x, in_dim, out_ref.get(), out_dim);
  TargetWrapperCL::Unmap(x_data, mapped_x);
  auto* out_data = out.mutable_data<float, cl::Buffer>();
  auto* mapped_out = static_cast<float*>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
  for (int i = 0; i < out_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
}
} // namespace lite
} // namespace paddle
// Pull in the registered pool2d buffer kernel so the linker keeps it.
USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def);
...@@ -26,107 +26,13 @@ namespace lite { ...@@ -26,107 +26,13 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class PoolCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::PoolParam;
std::string doc() const override { return "Pool using cl::Buffer, kFloat"; }
void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>();
kernel_func_name_ += param.pooling_type;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/pool_kernel.cl", build_options_);
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims();
const auto& out_dims = param.output->dims();
const std::string pooling_type = param.pooling_type;
const bool global_pooling = param.global_pooling;
std::vector<int> paddings = *param.paddings;
std::vector<int> strides = param.strides;
std::vector<int> ksize = param.ksize;
if (global_pooling) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[2 * i] = 0;
paddings[2 * i + 1] = 0;
ksize[i] = static_cast<int>(in_dims[i + 2]);
}
}
bool pads_equal =
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);
if (!pads_equal) {
LOG(FATAL)
<< "padding requires pad_left == pad_right, pad_top == pad_bottom";
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_buf = param.x->data<float, cl::Buffer>();
auto* output_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status;
auto numel = out_dims.production();
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(numel));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[1]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[1]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[1]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *output_buf);
CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange(static_cast<size_t>(numel));
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_buf, event_);
}
private:
std::string kernel_func_name_{"pool_"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::PoolParam; using param_t = operators::PoolParam;
std::string doc() const override { return "Pool using cl::Image2D, kFloat"; } std::string doc() const override { return "Pool using cl::Image2D, kFP16"; }
void PrepareForRun() override { void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -161,13 +67,13 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -161,13 +67,13 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
auto* x_img = param.x->data<float, cl::Image2D>(); auto* x_img = param.x->data<uint16_t, cl::Image2D>();
LOG(INFO) << "x_image" << x_img; LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(out_dims); auto out_image_shape = InitImageDimInfoWith(out_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " " LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
auto* out_img = param.output->mutable_data<float, cl::Image2D>( auto* out_img = param.output->mutable_data<uint16_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
LOG(INFO) << "out_image" << out_img; LOG(INFO) << "out_image" << out_img;
...@@ -220,7 +126,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -220,7 +126,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"pool_"}; std::string kernel_func_name_{"pool_"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -229,28 +135,18 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -229,28 +135,18 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(pool2d,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::PoolCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(pool2d, REGISTER_LITE_KERNEL(pool2d,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::PoolComputeImage2D, paddle::lite::kernels::opencl::PoolComputeImage2D,
image2d) image2d)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "lite/backends/opencl/target_wrapper.h" #include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -73,82 +76,10 @@ void pool_avg(const int padding_height, ...@@ -73,82 +76,10 @@ void pool_avg(const int padding_height,
} }
} }
// buffer TEST(pool2d_image2d, compute) {
#if 0 // pool_buffer
TEST(pool2d_buffer_fp32, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create( auto kernels = KernelRegistry::Global().Create(
"pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); "pool2d", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel:" << kernel->doc();
lite::Tensor x, out;
operators::PoolParam param;
param.x = &x;
param.output = &out;
param.global_pooling = true;
param.pooling_type = "avg";
std::vector<int> paddings = {0, 0, 0, 0};
param.strides = std::vector<int>{1, 1};
param.ksize = std::vector<int>{7, 7};
param.paddings = std::make_shared<std::vector<int>>(paddings);
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> pool_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(pool_context->As<OpenCLContext>()));
kernel->SetContext(std::move(pool_context));
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 1024, 7, 7});
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 1024, 1, 1});
x.Resize(in_dim);
out.Resize(out_dim);
auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
auto* mapped_x = static_cast<float*>(
TargetWrapperCL::Map(x_data, 0, sizeof(float) * in_dim.production()));
for (int i = 0; i < in_dim.production(); i++) {
mapped_x[i] = dist(engine);
}
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
pool_avg(0, 0, 1, 1, 7, 7, mapped_x, in_dim, out_ref.get(), out_dim);
TargetWrapperCL::Unmap(x_data, mapped_x);
auto* out_data = out.mutable_data<float, cl::Buffer>();
auto* mapped_out = static_cast<float*>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
}
TargetWrapperCL::Unmap(out_data, mapped_out);
}
#endif // pool_buffer
TEST(pool2d_image2d_fp32, compute) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create(
"pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front()); auto kernel = std::move(kernels.front());
...@@ -192,22 +123,23 @@ TEST(pool2d_image2d_fp32, compute) { ...@@ -192,22 +123,23 @@ TEST(pool2d_image2d_fp32, compute) {
DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim); DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " " LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
<< x_image_shape[1]; << x_image_shape[1];
std::vector<float> x_image_data(x_image_shape.production() * 4); // 4 : RGBA std::vector<uint16_t> x_image_data(x_image_shape.production() *
4); // 4 : RGBA
default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim); default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<float, cl::Image2D>( auto* x_image = x.mutable_data<uint16_t, cl::Image2D>(
x_image_shape[0], x_image_shape[1], x_image_data.data()); x_image_shape[0], x_image_shape[1], x_image_data.data());
LOG(INFO) << "x_image:" << x_image; LOG(INFO) << "x_image:" << x_image;
DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim); DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1]; << out_image_shape[1];
auto* out_image = out.mutable_data<float, cl::Image2D>(out_image_shape[0], auto* out_image = out.mutable_data<uint16_t, cl::Image2D>(out_image_shape[0],
out_image_shape[1]); out_image_shape[1]);
LOG(INFO) << "out_image:" << out_image; LOG(INFO) << "out_image:" << out_image;
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
...@@ -222,7 +154,7 @@ TEST(pool2d_image2d_fp32, compute) { ...@@ -222,7 +154,7 @@ TEST(pool2d_image2d_fp32, compute) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
float* out_image_data = new float[out_image_shape.production() * 4]; uint16_t* out_image_data = new uint16_t[out_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data, TargetWrapperCL::ImgcpySync(out_image_data,
out_image, out_image,
out_image_shape[0], out_image_shape[0],
...@@ -235,12 +167,22 @@ TEST(pool2d_image2d_fp32, compute) { ...@@ -235,12 +167,22 @@ TEST(pool2d_image2d_fp32, compute) {
out_image_data, out_data, out_image_shape, out_dim); out_image_data, out_data, out_image_shape, out_dim);
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(out_data[i], out_ref[i], 1e-6); auto abs_diff = abs(out_data[i] - out_ref[i]);
auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " out_data[" << i
<< "]:" << out_data[i] << " "
"out_ref["
<< i << "]:" << out_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
}
} }
} }
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def); USE_LITE_KERNEL(pool2d, kOpenCL, kFP16, kImageDefault, image2d);
USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// ReLU over a cl::Buffer holding fp32 data in NCHW layout.
class ReluCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override { return "Relu using cl::Buffer, kFloat"; }

  // Compile the buffer-based relu kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    size_t count = x_dims.production();  // total element count

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto* in_buf = param.X->data<float, cl::Buffer>();
    auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    // Look up the kernel compiled in PrepareForRun() by name + build options.
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());

    // Argument order must match the .cl kernel signature: input, count, output.
    cl_int status = kernel.setArg(0, *in_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, (const int)count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(2, *out_buf);
    CL_CHECK_FATAL(status);

    // One work-item per element.
    auto global_work_size = cl::NDRange{count};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Publish the completion event so consumers of out_buf can wait on it.
    context.cl_wait_list()->emplace(out_buf, event_);
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// ReLU over a cl::Image2D (ImageDefault/RGBA) holding fp32 data.
class ReluComputeFloatImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFloat),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu using cl::Image2D(ImageDefault/RGBA), kFloat";
  }

  // Compile the image-based relu kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    auto* x_img = param.X->data<float, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_img = param.Out->mutable_data<float, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    // Argument order must match the .cl kernel signature: input, output.
    cl_int status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];

    // One work-item per image texel (width x height).
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`,
    // so block on the queue here instead of publishing the event:
    // context.cl_wait_list()->emplace(out_img, event_);
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// ReLU over a cl::Image2D (ImageDefault/RGBA) holding fp16 data.
class ReluComputeFP16ImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu using cl::Image2D(ImageDefault/RGBA), kFP16";
  }

  // Compile the image-based relu kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    // int16_t is a 16-bit storage type standing in for half float.
    // NOTE(review): sibling fp16 kernels use uint16_t for the same payload;
    // presumably bit-wise interchangeable — confirm and unify.
    auto* x_img = param.X->data<int16_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_img = param.Out->mutable_data<int16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    // Argument order must match the .cl kernel signature: input, output.
    cl_int status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];

    // One work-item per image texel (width x height).
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`,
    // so block on the queue here instead of publishing the event:
    // context.cl_wait_list()->emplace(out_img, event_);
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_half -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// ReLU6 over a cl::Image2D (ImageDefault/RGBA) holding fp32 data.
// Output is clipped to [0, param.Relu_clipped_coef].
class Relu6ComputeFloatImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFloat),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFloat";
  }

  // Compile the image-based relu6 kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu6_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    auto* x_img = param.X->data<float, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_img = param.Out->mutable_data<float, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only
    auto threshold = param.Relu_clipped_coef;  // upper clip bound

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    // Argument order must match the .cl kernel signature:
    // input, output, threshold.
    cl_int status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(2, threshold);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
    VLOG(4) << "threshold:" << threshold;

    // One work-item per image texel (width x height).
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`,
    // so block on the queue here instead of publishing the event:
    // context.cl_wait_list()->emplace(out_img, event_);
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu6"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU6"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// ReLU6 over a cl::Image2D (ImageDefault/RGBA) holding fp16 data.
// Output is clipped to [0, param.Relu_clipped_coef].
class Relu6ComputeFP16ImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
  }

  // Compile the image-based relu6 kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu6_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    // int16_t is a 16-bit storage type standing in for half float.
    // NOTE(review): sibling fp16 kernels use uint16_t for the same payload;
    // presumably bit-wise interchangeable — confirm and unify.
    auto* x_img = param.X->data<int16_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_img = param.Out->mutable_data<int16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only
    auto threshold = param.Relu_clipped_coef;  // upper clip bound

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    // Argument order must match the .cl kernel signature:
    // input, output, threshold.
    cl_int status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(2, threshold);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
    VLOG(4) << "threshold:" << threshold;

    // One work-item per image texel (width x height).
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`,
    // so block on the queue here instead of publishing the event:
    // context.cl_wait_list()->emplace(out_img, event_);
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu6"};
  std::string build_options_{"-DCL_DTYPE_half -DRELU6"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// NOTE(review): the buffer-based (kNCHW) relu registration is intentionally
// disabled; only the image2d kernels below are registered.
// REGISTER_LITE_KERNEL(relu,
//                      kOpenCL,
//                      kFloat,
//                      kNCHW,
//                      paddle::lite::kernels::opencl::ReluCompute,
//                      def)
//   .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//   .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//   .Finalize();

// Relu: fp32 input/output stored as an RGBA image2d.
REGISTER_LITE_KERNEL(
    relu,
    kOpenCL,
    kFloat,
    kImageDefault,
    paddle::lite::kernels::opencl::ReluComputeFloatImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Relu: fp16 (half) input/output stored as an RGBA image2d.
REGISTER_LITE_KERNEL(relu,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::ReluComputeFP16ImageDefault,
                     ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Relu6: fp32 variant, image2d layout.
REGISTER_LITE_KERNEL(
    relu6,
    kOpenCL,
    kFloat,
    kImageDefault,
    paddle::lite::kernels::opencl::Relu6ComputeFloatImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Relu6: fp16 (half) variant, image2d layout.
REGISTER_LITE_KERNEL(
    relu6,
    kOpenCL,
    kFP16,
    kImageDefault,
    paddle::lite::kernels::opencl::Relu6ComputeFP16ImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -27,7 +27,7 @@ namespace opencl { ...@@ -27,7 +27,7 @@ namespace opencl {
// reshape operator // reshape operator
class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ReshapeParam; using param_t = operators::ReshapeParam;
...@@ -51,7 +51,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -51,7 +51,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
const int64_t& input_image_width = input_image_shape.at("width"); const int64_t& input_image_width = input_image_shape.at("width");
const int64_t& input_image_height = input_image_shape.at("height"); const int64_t& input_image_height = input_image_shape.at("height");
const cl::Image2D* const x_image = x->data<float, cl::Image2D>(); const cl::Image2D* const x_image = x->data<uint16_t, cl::Image2D>();
const std::vector<int>& shape_vct = param.shape_vct; const std::vector<int>& shape_vct = param.shape_vct;
Tensor* const output = param.output; Tensor* const output = param.output;
...@@ -60,7 +60,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -60,7 +60,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
const std::map<std::string, size_t>& out_image_shape = const std::map<std::string, size_t>& out_image_shape =
InitImageDimInfoWith(out_dims); InitImageDimInfoWith(out_dims);
cl::Image2D* const out_image = output->mutable_data<float, cl::Image2D>( cl::Image2D* const out_image = output->mutable_data<uint16_t, cl::Image2D>(
out_image_shape.at("width"), out_image_shape.at("height")); out_image_shape.at("width"), out_image_shape.at("height"));
LOG(INFO) << "out_dims= " << out_dims; LOG(INFO) << "out_dims= " << out_dims;
...@@ -159,7 +159,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -159,7 +159,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"reshape"}; std::string kernel_func_name_{"reshape"};
std::string build_options_{"-DCL_DTYPE_float "}; std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -170,37 +170,37 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -170,37 +170,37 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
REGISTER_LITE_KERNEL(reshape, REGISTER_LITE_KERNEL(reshape,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::ReshapeComputeFloatImage, paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
image2d) image2d)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(reshape2, REGISTER_LITE_KERNEL(reshape2,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::ReshapeComputeFloatImage, paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
image2d) image2d)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -17,9 +17,12 @@ ...@@ -17,9 +17,12 @@
#include "lite/backends/opencl/target_wrapper.h" #include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#include "lite/operators/reshape_op.h" #include "lite/operators/reshape_op.h"
#include "lite/utils/logging.h" #include "lite/utils/logging.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
...@@ -81,7 +84,7 @@ static DDim ValidateShape(const std::vector<int>& shape, ...@@ -81,7 +84,7 @@ static DDim ValidateShape(const std::vector<int>& shape,
TEST(reshape_opencl, compute) { TEST(reshape_opencl, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create( auto kernels = KernelRegistry::Global().Create(
"reshape", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)); "reshape", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front()); auto kernel = std::move(kernels.front());
...@@ -149,13 +152,13 @@ TEST(reshape_opencl, compute) { ...@@ -149,13 +152,13 @@ TEST(reshape_opencl, compute) {
} }
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
std::vector<float> x_image_data(input_image_width * input_image_height * std::vector<uint16_t> x_image_data(input_image_width * input_image_height *
4); // 4 : RGBA 4); // 4 : RGBA
LOG(INFO) << "set mapped input ..."; LOG(INFO) << "set mapped input ...";
default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim); default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim);
auto* input_image = input.mutable_data<float, cl::Image2D>( auto* input_image = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_data.data()); input_image_width, input_image_height, x_image_data.data());
LOG(INFO) << "prepare kernel ready"; LOG(INFO) << "prepare kernel ready";
...@@ -165,8 +168,8 @@ TEST(reshape_opencl, compute) { ...@@ -165,8 +168,8 @@ TEST(reshape_opencl, compute) {
DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim); DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1]; << out_image_shape[1];
auto* out_image = output.mutable_data<float, cl::Image2D>(out_image_shape[0], auto* out_image = output.mutable_data<uint16_t, cl::Image2D>(
out_image_shape[1]); out_image_shape[0], out_image_shape[1]);
VLOG(4) << "out_dims= " << output_dim; VLOG(4) << "out_dims= " << output_dim;
LOG(INFO) << "kernel context ..."; LOG(INFO) << "kernel context ...";
...@@ -182,7 +185,7 @@ TEST(reshape_opencl, compute) { ...@@ -182,7 +185,7 @@ TEST(reshape_opencl, compute) {
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_image); auto it = wait_list->find(out_image);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -193,9 +196,9 @@ TEST(reshape_opencl, compute) { ...@@ -193,9 +196,9 @@ TEST(reshape_opencl, compute) {
LOG(FATAL) << "Could not find the sync event for the target cl tensor."; LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
} }
float* out_image_data = new float[out_image_shape.production() * 4]; uint16_t* out_image_data = new uint16_t[out_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data, TargetWrapperCL::ImgcpySync(out_image_data,
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_shape[0], out_image_shape[0],
out_image_shape[1], out_image_shape[1],
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -211,9 +214,17 @@ TEST(reshape_opencl, compute) { ...@@ -211,9 +214,17 @@ TEST(reshape_opencl, compute) {
// check output data // check output data
for (int i = 0; i < output.numel(); i++) { for (int i = 0; i < output.numel(); i++) {
EXPECT_NEAR(out_data[i], input_v_data[i], 1e-3); auto abs_diff = abs(out_data[i] - input_v_data[i]);
if (abs(out_data[i] - input_v_data[i]) > 1e-3) { auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], input_v_data[i]);
LOG(INFO) << "error idx:" << i; EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " out_data[" << i
<< "]:" << out_data[i] << " "
"input_v_data["
<< i << "]:" << input_v_data[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
} }
} }
} }
...@@ -223,5 +234,5 @@ TEST(reshape_opencl, compute) { ...@@ -223,5 +234,5 @@ TEST(reshape_opencl, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(reshape, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(reshape, kOpenCL, kFP16, kImageDefault, image2d);
USE_LITE_KERNEL(reshape2, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(reshape2, kOpenCL, kFP16, kImageDefault, image2d);
...@@ -27,12 +27,12 @@ namespace kernels { ...@@ -27,12 +27,12 @@ namespace kernels {
namespace opencl { namespace opencl {
class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ScaleParam; using param_t = operators::ScaleParam;
std::string doc() const override { return "Scale using cl::Image2D, kFloat"; } std::string doc() const override { return "Scale using cl::Image2D, kFP16"; }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
...@@ -43,7 +43,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -43,7 +43,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
void Run() override { void Run() override {
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims(); const auto& in_dims = param.x->dims();
auto* x_img = param.x->data<float, cl::Image2D>(); auto* x_img = param.x->data<uint16_t, cl::Image2D>();
const float scale = param.scale; const float scale = param.scale;
const float bias = param.bias; const float bias = param.bias;
...@@ -51,7 +51,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -51,7 +51,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto out_image_shape = InitImageDimInfoWith(in_dims); auto out_image_shape = InitImageDimInfoWith(in_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " " LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
auto* out_img = param.output->mutable_data<float, cl::Image2D>( auto* out_img = param.output->mutable_data<uint16_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
LOG(INFO) << "out_image" << out_img; LOG(INFO) << "out_image" << out_img;
...@@ -89,7 +89,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -89,7 +89,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"scale"}; std::string kernel_func_name_{"scale"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -100,16 +100,16 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -100,16 +100,16 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
REGISTER_LITE_KERNEL(scale, REGISTER_LITE_KERNEL(scale,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::ScaleComputeImage2D, paddle::lite::kernels::opencl::ScaleComputeImage2D,
image2d) image2d)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "lite/backends/opencl/target_wrapper.h" #include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -35,7 +38,7 @@ void scale(const float* input_data, ...@@ -35,7 +38,7 @@ void scale(const float* input_data,
TEST(scale_image2d_fp32, compute) { TEST(scale_image2d_fp32, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create( auto kernels = KernelRegistry::Global().Create(
"scale", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)); "scale", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front()); auto kernel = std::move(kernels.front());
...@@ -74,19 +77,19 @@ TEST(scale_image2d_fp32, compute) { ...@@ -74,19 +77,19 @@ TEST(scale_image2d_fp32, compute) {
CLImageConverterDefault* default_converter = new CLImageConverterDefault(); CLImageConverterDefault* default_converter = new CLImageConverterDefault();
DDim image_shape = default_converter->InitImageDimInfoWith(in_dim); DDim image_shape = default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "image_shape = " << image_shape[0] << " " << image_shape[1]; LOG(INFO) << "image_shape = " << image_shape[0] << " " << image_shape[1];
std::vector<float> x_image_data(image_shape.production() * 4); // 4 : RGBA std::vector<uint16_t> x_image_data(image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim); default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<float, cl::Image2D>( auto* x_image = x.mutable_data<uint16_t, cl::Image2D>(
image_shape[0], image_shape[1], x_image_data.data()); image_shape[0], image_shape[1], x_image_data.data());
LOG(INFO) << "x_image:" << x_image; LOG(INFO) << "x_image:" << x_image;
auto* out_image = auto* out_image =
out.mutable_data<float, cl::Image2D>(image_shape[0], image_shape[1]); out.mutable_data<uint16_t, cl::Image2D>(image_shape[0], image_shape[1]);
LOG(INFO) << "out_image:" << out_image; LOG(INFO) << "out_image:" << out_image;
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
...@@ -101,7 +104,7 @@ TEST(scale_image2d_fp32, compute) { ...@@ -101,7 +104,7 @@ TEST(scale_image2d_fp32, compute) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
float* out_image_data = new float[image_shape.production() * 4]; uint16_t* out_image_data = new uint16_t[image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data, TargetWrapperCL::ImgcpySync(out_image_data,
out_image, out_image,
image_shape[0], image_shape[0],
...@@ -114,11 +117,22 @@ TEST(scale_image2d_fp32, compute) { ...@@ -114,11 +117,22 @@ TEST(scale_image2d_fp32, compute) {
out_image_data, out_data, image_shape, out_dim); out_image_data, out_data, image_shape, out_dim);
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(out_data[i], out_ref[i], 1e-6); auto abs_diff = abs(out_data[i] - out_ref[i]);
auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " out_data[" << i
<< "]:" << out_data[i] << " "
"out_ref["
<< i << "]:" << out_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
}
} }
} }
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(scale, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(scale, kOpenCL, kFP16, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>

#include <math.h>

#include <cstdint>
#include <memory>
#include <random>

#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Reference CPU sigmoid: out[i] = 1 / (1 + e^(-x[i])) for every element of x.
// Used to validate the OpenCL kernel outputs.
//
// @param x_data   input buffer with x_dim.production() elements
// @param x_dim    shape of the input; only the total element count is used
// @param out_data output buffer, same length as x_data
template <typename dtype>
void sigmoid_compute_ref(const dtype *x_data,
                         const DDim &x_dim,
                         dtype *out_data) {
  for (int i = 0; i < x_dim.production(); ++i) {
    // Use the overloaded exp() (from <math.h> in C++) so the computation is
    // carried out in `dtype` precision; expf would force float evaluation
    // even when dtype is double.
    out_data[i] = 1 / (1 + exp(-x_data[i]));
  }
}
// buffer
// NOTE: the buffer-based sigmoid test below is compiled out with `#if 0`;
// the image2d tests further down are the active coverage for the OpenCL
// sigmoid kernel. Kept for reference.
#if 0  // sigmoid_buffer
TEST(opencl_sigmoid_buffer, compute) {
  // prepare data: random input in [-10, 10], mapped so the host can write it
  const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
  lite::Tensor x, out;
  x.Resize(x_dim);
  out.Resize(x_dim);

  auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x = static_cast<float *>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ActivationParam param;
  param.X = &x;
  param.Out = &out;

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  auto kernels = KernelRegistry::Global().Create(
      "sigmoid", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> sigmoid_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(sigmoid_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(sigmoid_context));
  kernel->Launch();

  // block until the kernel writing the output buffer has finished
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check against the CPU reference implementation
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
  sigmoid_compute_ref<float>(mapped_x, x_dim, out_ref.get());

  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x_data, mapped_x);
}
#endif  // sigmoid_buffer
#define LOOP_TEST
// #define PRINT_RESULT

// End-to-end check of the fp32 sigmoid image2d kernel:
//   host buffer -> layout(buf2img) -> sigmoid(img) -> layout(img2buf) -> host
// With LOOP_TEST defined, a grid of [n,c,h,w] shapes is swept to cover
// different image2d sizes; otherwise a single fixed shape is tested.
TEST(sigmoid_image2d_fp32, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> sigmoid(img) -> "
               "layout(img2buf) "
               "-> host";

#ifdef LOOP_TEST
  for (int n = 1; n <= 9; n += 3) {
    for (auto c : {1, 3, 9}) {
      for (int h = 12; h <= 100; h += 13) {
        for (int w = 12; w <= 100; w += 25) {
#else
  const int n = 3;
  const int c = 9;
  const int h = 51;
  const int w = 11;
#endif  // LOOP_TEST
          LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
                    << h << " " << w << " ========";
          // set layout kernels: buffer->image, image->buffer, and the
          // fp32 sigmoid image kernel under test
          auto buf_to_img_kernels =
              KernelRegistry::Global().Create("layout",
                                              TARGET(kOpenCL),
                                              PRECISION(kAny),
                                              DATALAYOUT(kImageDefault));
          auto img_to_buf_kernels = KernelRegistry::Global().Create(
              "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
          auto sigmoid_img_kernels =
              KernelRegistry::Global().Create("sigmoid",
                                              TARGET(kOpenCL),
                                              PRECISION(kFloat),
                                              DATALAYOUT(kImageDefault));
          ASSERT_FALSE(buf_to_img_kernels.empty());
          ASSERT_FALSE(buf_to_img_kernels.empty());
          ASSERT_FALSE(sigmoid_img_kernels.empty());

          auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
          auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
          auto sigmoid_img_kernel = std::move(sigmoid_img_kernels.front());
          LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
          LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
          LOG(INFO) << "get 3rd kernel: " << sigmoid_img_kernel->doc();

          // set tensors about op param
          LOG(INFO) << "set tensors about op param";
          // layout(buf->img): x -> sigmoid_in
          // sigmoid(img): sigmoid_in -> sigmoid_out
          // layout(img->buf): sigmoid_out -> y
          lite::Tensor x, y, sigmoid_in, sigmoid_out, y_ref;
          operators::LayoutParam BufferToImageParam;
          operators::LayoutParam ImageToBufferParam;
          BufferToImageParam.x = &x;
          BufferToImageParam.y = &sigmoid_in;
          ImageToBufferParam.x = &sigmoid_out;
          ImageToBufferParam.y = &y;
          operators::ActivationParam SigmoidParam;
          SigmoidParam.X = &sigmoid_in;
          SigmoidParam.Out = &sigmoid_out;

          const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
          x.Resize(x_dim);
          y.Resize(x_dim);
          sigmoid_in.Resize(x_dim);
          sigmoid_out.Resize(x_dim);
          y_ref.Resize(x_dim);
          auto sigmoid_image2d_shape =
              paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);

          // initialize tensors with random input in [-1, 1]
          LOG(INFO) << "initialize tensors";
          auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
          auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
              x_data, 0, sizeof(float) * x_dim.production()));
          auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
              y_data, 0, sizeof(float) * x_dim.production()));
          std::default_random_engine engine;
          std::uniform_real_distribution<float> dist(-1, 1);
          for (int i = 0; i < x_dim.production(); ++i) {
            mapped_x[i] = static_cast<float>(dist(engine));
          }
          // allocate the intermediate fp32 image2d tensors
          auto *sigmoid_in_data = sigmoid_in.mutable_data<float, cl::Image2D>(
              sigmoid_image2d_shape["width"], sigmoid_image2d_shape["height"]);
          auto *sigmoid_out_data = sigmoid_out.mutable_data<float, cl::Image2D>(
              sigmoid_image2d_shape["width"], sigmoid_image2d_shape["height"]);

          // set context and kernel args; each kernel gets its own context
          // sharing the single OpenCL runtime
          LOG(INFO) << "set context and kernel args";
          std::unique_ptr<KernelContext> context(new KernelContext);
          context->As<OpenCLContext>().InitOnce();

          buf_to_img_kernel->SetParam(BufferToImageParam);
          std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(buf_to_img_context->As<OpenCLContext>()));
          buf_to_img_kernel->SetContext(std::move(buf_to_img_context));

          img_to_buf_kernel->SetParam(ImageToBufferParam);
          std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(img_to_buf_context->As<OpenCLContext>()));
          img_to_buf_kernel->SetContext(std::move(img_to_buf_context));

          sigmoid_img_kernel->SetParam(SigmoidParam);
          std::unique_ptr<KernelContext> sigmoid_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(sigmoid_img_context->As<OpenCLContext>()));
          sigmoid_img_kernel->SetContext(std::move(sigmoid_img_context));

          // run kernels
          LOG(INFO) << "run kernel: buf_to_img_kernel";
          buf_to_img_kernel->Launch();
          // Fixed log tag: this launches the sigmoid kernel (the old message
          // said "relu_img_kernel", copied from the relu test).
          LOG(INFO) << "run kernel: sigmoid_img_kernel";
          sigmoid_img_kernel->Launch();
          LOG(INFO) << "run kernel: img_to_buf_kernel";
          img_to_buf_kernel->Launch();

          // compute ref cpu
          sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
          LOG(INFO) << "---- print kernel result (input -> output) ----";
          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
            std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
                      << std::endl;
          }
#endif  // PRINT_RESULT

          // check result: compare kernel output and cpu output(y_data_ref);
          // only the first mismatch per shape is logged
          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
                        << " / " << x_dim.production() << ", y_data_ref["
                        << eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
                        << eidx << "]:" << mapped_y[eidx] << ", mapped_x["
                        << eidx << "]: " << mapped_x[eidx];
              break;
            }
          }

          // free
          LOG(INFO) << "free: unmap x, y";
          TargetWrapperCL::Unmap(x_data, mapped_x);
          TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
        }  // w
      }    // h
    }      // c
  }        // n
#else
// nothing to do.
#endif
}
#define SIGMOID_FP16_LOOP_TEST
// #define SIGMOID_FP16_PRINT_RESULT

// End-to-end check of the fp16 sigmoid image2d kernel:
//   host buffer -> layout(buf2img) -> sigmoid(img, half) -> layout(img2buf)
//   -> host
// Half-precision image data is stored as uint16_t on the host side,
// consistent with the other fp16 OpenCL kernels/tests in this change
// (the previous int16_t here was the odd one out).
TEST(sigmoid_image2d_fp16, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> sigmoid(img) -> "
               "layout(img2buf) "
               "-> host";

#ifdef SIGMOID_FP16_LOOP_TEST
  for (int n = 1; n <= 100; n += 33) {
    for (auto c : {1, 3}) {
      for (int h = 12; h <= 100; h += 13) {
        for (int w = 12; w <= 100; w += 25) {
#else
  const int n = 1;
  const int c = 2;
  const int h = 3;
  const int w = 4;
#endif  // SIGMOID_FP16_LOOP_TEST
          LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
                    << h << " " << w << " ========";
          // set layout kernels: buffer->image, image->buffer, and the
          // fp16 sigmoid image kernel under test
          auto buf_to_img_kernels =
              KernelRegistry::Global().Create("layout",
                                              TARGET(kOpenCL),
                                              PRECISION(kAny),
                                              DATALAYOUT(kImageDefault));
          auto img_to_buf_kernels = KernelRegistry::Global().Create(
              "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
          auto sigmoid_img_kernels =
              KernelRegistry::Global().Create("sigmoid",
                                              TARGET(kOpenCL),
                                              PRECISION(kFP16),
                                              DATALAYOUT(kImageDefault));
          ASSERT_FALSE(buf_to_img_kernels.empty());
          ASSERT_FALSE(buf_to_img_kernels.empty());
          ASSERT_FALSE(sigmoid_img_kernels.empty());

          auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
          auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
          auto sigmoid_img_kernel = std::move(sigmoid_img_kernels.front());
          LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
          LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
          LOG(INFO) << "get 3rd kernel: " << sigmoid_img_kernel->doc();

          // set tensors about op param
          LOG(INFO) << "set tensors about op param";
          // layout(buf->img): x -> sigmoid_in
          // sigmoid(img): sigmoid_in -> sigmoid_out
          // layout(img->buf): sigmoid_out -> y
          lite::Tensor x, y, sigmoid_in, sigmoid_out, y_ref;
          operators::LayoutParam BufferToImageParam;
          operators::LayoutParam ImageToBufferParam;
          BufferToImageParam.x = &x;
          BufferToImageParam.y = &sigmoid_in;
          ImageToBufferParam.x = &sigmoid_out;
          ImageToBufferParam.y = &y;
          operators::ActivationParam SigmoidParam;
          SigmoidParam.X = &sigmoid_in;
          SigmoidParam.Out = &sigmoid_out;

          const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
          x.Resize(x_dim);
          y.Resize(x_dim);
          sigmoid_in.Resize(x_dim);
          sigmoid_out.Resize(x_dim);
          y_ref.Resize(x_dim);
          auto sigmoid_image2d_shape =
              paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);

          // initialize tensors with random input in [-1, 1]; the host-side
          // buffers stay fp32, only the image2d path carries half data
          LOG(INFO) << "initialize tensors";
          auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
          auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
              x_data, 0, sizeof(float) * x_dim.production()));
          auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
              y_data, 0, sizeof(float) * x_dim.production()));
          std::default_random_engine engine;
          std::uniform_real_distribution<float> dist(-1, 1);
          for (int i = 0; i < x_dim.production(); ++i) {
            mapped_x[i] = static_cast<float>(dist(engine));
          }
          // fp16 image2d tensors: half values are stored as uint16_t,
          // matching the scale/reshape fp16 kernels of this change
          auto *sigmoid_in_data = sigmoid_in.mutable_data<uint16_t, cl::Image2D>(
              sigmoid_image2d_shape["width"], sigmoid_image2d_shape["height"]);
          auto *sigmoid_out_data =
              sigmoid_out.mutable_data<uint16_t, cl::Image2D>(
                  sigmoid_image2d_shape["width"],
                  sigmoid_image2d_shape["height"]);

          // set context and kernel args; each kernel gets its own context
          // sharing the single OpenCL runtime
          LOG(INFO) << "set context and kernel args";
          std::unique_ptr<KernelContext> context(new KernelContext);
          context->As<OpenCLContext>().InitOnce();

          buf_to_img_kernel->SetParam(BufferToImageParam);
          std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(buf_to_img_context->As<OpenCLContext>()));
          buf_to_img_kernel->SetContext(std::move(buf_to_img_context));

          img_to_buf_kernel->SetParam(ImageToBufferParam);
          std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(img_to_buf_context->As<OpenCLContext>()));
          img_to_buf_kernel->SetContext(std::move(img_to_buf_context));

          sigmoid_img_kernel->SetParam(SigmoidParam);
          std::unique_ptr<KernelContext> sigmoid_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(sigmoid_img_context->As<OpenCLContext>()));
          sigmoid_img_kernel->SetContext(std::move(sigmoid_img_context));

          // run kernels
          LOG(INFO) << "run kernel: buf_to_img_kernel";
          buf_to_img_kernel->Launch();
          LOG(INFO) << "run kernel: sigmoid_img_kernel";
          sigmoid_img_kernel->Launch();
          LOG(INFO) << "run kernel: img_to_buf_kernel";
          img_to_buf_kernel->Launch();

          // compute ref cpu
          sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef SIGMOID_FP16_PRINT_RESULT
          LOG(INFO) << "---- print kernel result (input -> output) ----";
          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
            std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
                      << std::endl;
          }
#endif  // SIGMOID_FP16_PRINT_RESULT

          // check result: compare kernel output and cpu output(y_data_ref);
          // 1e-3 tolerance accounts for half-precision rounding
          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-3);
            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-3) {
              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
                        << " / " << x_dim.production() << ", y_data_ref["
                        << eidx << "]: " << y_data_ref[eidx] << ", mapped_y["
                        << eidx << "]: " << mapped_y[eidx] << ", mapped_x["
                        << eidx << "]: " << mapped_x[eidx];
              break;
            }
          }

          // free
          LOG(INFO) << "free: unmap x, y";
          TargetWrapperCL::Unmap(x_data, mapped_x);
          TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef SIGMOID_FP16_LOOP_TEST
        }  // w
      }    // h
    }      // c
  }        // n
#else
// nothing to do.
#endif
}
}  // namespace lite
}  // namespace paddle

// sigmoid buffer kernel registration (disabled, matching the #if 0 test)
// USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kNCHW, def);

// sigmoid image2d fp32: layout kernels convert NCHW buffer <-> image2d
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kImageDefault, ImageDefault);

// sigmoid image2d fp16
USE_LITE_KERNEL(sigmoid, kOpenCL, kFP16, kImageDefault, ImageDefault);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#pragma once

#include <cmath>  // fabs

// Helper macros for comparing fp16/fp32 OpenCL kernel outputs against CPU
// references. Arguments are fully parenthesized so the macros stay correct
// when callers pass compound expressions (e.g. `a - b`), and fabs is used
// so float/double arguments are never truncated through the integral abs.
// NOTE: COMPTUE_ABS_DIFF keeps its historical misspelling because existing
// test files already reference that name.

// Absolute difference |res0 - res1|.
#define COMPTUE_ABS_DIFF(res0, res1) (fabs((res0) - (res1)))

// Relative difference |res0 - res1| / |res1 + 1e-5|; the 1e-5 term guards
// against division by zero when the reference value res1 is 0.
#define COMPUTE_RELATIVE_DIFF(res0, res1) \
  (fabs(((res0) - (res1)) / ((res1) + 1e-5)))

// True when either the absolute or the relative difference is below
// `threshold`.
#define IS_DIFF_PASSED(res0, res1, threshold)       \
  ((COMPTUE_ABS_DIFF(res0, res1) < (threshold)) ||  \
   (COMPUTE_RELATIVE_DIFF(res0, res1) < (threshold)))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册