From 19bea13c61fed5b218270a24a36decc39378affd Mon Sep 17 00:00:00 2001 From: ZhenWang Date: Mon, 24 Jun 2019 22:14:33 +0800 Subject: [PATCH] fix elementwise_add acc bugs. --- paddle/fluid/lite/api/mobilenetv1_test.cc | 2 +- paddle/fluid/lite/core/context.h | 3 + .../kernels/opencl/elementwise_add_compute.cc | 4 +- .../opencl/elementwise_add_compute_test.cc | 8 +- paddle/fluid/lite/opencl/CMakeLists.txt | 6 +- paddle/fluid/lite/opencl/cl_caller.cc | 38 +- paddle/fluid/lite/opencl/cl_caller.h | 4 +- paddle/fluid/lite/opencl/cl_engine.cc | 3 +- paddle/fluid/lite/opencl/cl_half.cc | 518 ------------------ paddle/fluid/lite/opencl/cl_half.h | 32 -- paddle/fluid/lite/opencl/cl_image.cc | 9 +- .../fluid/lite/opencl/cl_image_converter.cc | 41 +- paddle/fluid/lite/opencl/cl_image_converter.h | 29 +- .../opencl/cl_kernel/channel_add_kernel.cl | 29 + .../fluid/lite/opencl/cl_kernel/cl_common.h | 16 +- .../cl_kernel/elementwise_add_kernel.cl | 11 +- .../lite/opencl/cl_kernel/pool_kernel.cl | 15 +- paddle/fluid/lite/opencl/cl_test.cc | 107 +++- 18 files changed, 210 insertions(+), 665 deletions(-) delete mode 100644 paddle/fluid/lite/opencl/cl_half.cc delete mode 100644 paddle/fluid/lite/opencl/cl_half.h create mode 100644 paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl diff --git a/paddle/fluid/lite/api/mobilenetv1_test.cc b/paddle/fluid/lite/api/mobilenetv1_test.cc index d72fc7690d1..f5b8e6e6c5c 100644 --- a/paddle/fluid/lite/api/mobilenetv1_test.cc +++ b/paddle/fluid/lite/api/mobilenetv1_test.cc @@ -61,7 +61,7 @@ void TestModel(const std::vector& valid_places, 3.13812525e-05, 6.52209565e-05, 4.78087313e-05, 2.58822285e-04}); for (int i = 0; i < results.size(); ++i) { - EXPECT_NEAR(out->data()[i], results[i], 1e-5); + EXPECT_NEAR(out->data()[i], results[i], 1e-6); } ASSERT_EQ(out->dims().size(), 2); ASSERT_EQ(out->dims()[0], 1); diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/context.h index 55268c4b1c7..d75c85d54cf 100644 --- a/paddle/fluid/lite/core/context.h +++ b/paddle/fluid/lite/core/context.h @@ -236,12 +236,15 @@ class Context { void CopySharedTo(const OpenClContext* ctx) { ctx->cl_context_ = cl_context_; + ctx->cl_helper_ = cl_helper_; } private: void PrepareKernels() { cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + cl_helper_->AddKernel("channel_add", "channel_add_kernel.cl"); cl_helper_->AddKernel("pool_max", "pool_kernel.cl"); + cl_helper_->AddKernel("pool_avg", "pool_kernel.cl"); } }; #endif diff --git a/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc index c5230cbaf7d..4213e2a81e7 100644 --- a/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc +++ b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc @@ -31,10 +31,10 @@ class ElementwiseAddCompute void Run() override { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); - CHECK(context.cl_context()); + CHECK(context.cl_helper() != nullptr); elementwise_add( - context.cl_context(), static_cast(param.X->raw_data()), + context.cl_helper(), static_cast(param.X->raw_data()), param.X->dims(), static_cast(param.Y->raw_data()), param.Y->dims(), param.Out->mutable_data(), param.Out->dims()); } diff --git a/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc index 9f382244abc..f82d8477d55 100644 --- a/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc +++ b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc @@ -40,9 +40,9 @@ TEST(elementwise_add, init) { kernel->SetParam(param); kernel->SetContext(std::move(context)); - X.Resize({1, 10}); - Y.Resize({1, 10}); - Out.Resize({1, 10}); + X.Resize({1, 1, 1, 10}); + Y.Resize({1, 1, 1, 10}); + Out.Resize({1, 1, 1, 10}); auto* x_data = X.mutable_data(); auto* y_data = Y.mutable_data(); @@ -56,7 +56,7 @@ TEST(elementwise_add, init) { kernel->Launch(); for (int i = 0; i < 10; i++) { - EXPECT_NEAR(out_data[i], 3.4 * i, 1e-1); + EXPECT_NEAR(out_data[i], 3.4 * i, 1e-6); } } diff --git a/paddle/fluid/lite/opencl/CMakeLists.txt b/paddle/fluid/lite/opencl/CMakeLists.txt index 39282aeff4f..2017346f751 100644 --- a/paddle/fluid/lite/opencl/CMakeLists.txt +++ b/paddle/fluid/lite/opencl/CMakeLists.txt @@ -5,13 +5,11 @@ endif() cc_library(cl_wrapper SRCS cl_wrapper.cc) cc_library(cl_tool SRCS cl_tool.cc) target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers) -cc_library(cl_half SRCS cl_half.cc) -target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing) cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool) cc_library(cl_context SRCS cl_context.cc DEPS cl_engine) cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context) -cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor) -cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine) +cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS lite_tensor) +cc_library(cl_image SRCS cl_image.cc DEPS lite_tensor cl_image_converter cl_engine) cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image) lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper) add_dependencies(cl_tool opencl_clhpp) diff --git a/paddle/fluid/lite/opencl/cl_caller.cc b/paddle/fluid/lite/opencl/cl_caller.cc index 6db41aadbd7..a56540feb72 100644 --- a/paddle/fluid/lite/opencl/cl_caller.cc +++ b/paddle/fluid/lite/opencl/cl_caller.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/lite/opencl/cl_caller.h" #include #include "paddle/fluid/lite/core/compatible_tensor.h" -#include "paddle/fluid/lite/opencl/cl_context.h" #include "paddle/fluid/lite/opencl/cl_engine.h" #include "paddle/fluid/lite/opencl/cl_helper.h" #include "paddle/fluid/lite/opencl/cl_image.h" @@ -23,16 +22,17 @@ limitations under the License. */ namespace paddle { namespace lite { -static void CopyImageData(const CLImage& cl_image, float* out) { +static void CopyImageData(CLHelper* helper, const CLImage& cl_image, + float* out) { int width = cl_image.image_dims()[0]; int height = cl_image.image_dims()[1]; - half_t* image_data = new half_t[height * width * 4]; + float* image_data = new float[height * width * 4]; cl::Image* image = cl_image.cl_image(); const std::array origin{0, 0, 0}; const std::array region{static_cast(width), static_cast(height), 1}; - cl_int err = CLEngine::Global()->command_queue().enqueueReadImage( + cl_int err = helper->OpenCLCommandQueue().enqueueReadImage( *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr); CL_CHECK_ERRORS(err); @@ -49,22 +49,25 @@ bool InitOpenCLEngine(std::string cl_path) { return engine->IsInitSuccess(); } -void elementwise_add(CLContext* context, const float* in, const DDim& in_dim, +void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim, const float* bias, const DDim& bias_dim, float* out, const DDim& out_dim) { - CLHelper helper(context); - helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); - auto kernel = helper.GetKernel(0); + if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) { + LOG(FATAL) << "Error: bias dims is error"; + return; + } + auto kernel = bias_dim.size() == 1 ? helper->GetKernel("channel_add") + : helper->GetKernel("elementwise_add"); CLImage in_image; in_image.set_tensor_data(in, in_dim); - in_image.InitNormalCLImage(helper.OpenCLContext()); + in_image.InitNormalCLImage(helper->OpenCLContext()); VLOG(3) << " --- Inpu image: " << in_image << " --- "; CLImage bias_image; bias_image.set_tensor_data(bias, bias_dim); - bias_image.InitNormalCLImage(helper.OpenCLContext()); + bias_image.InitCLImage(helper->OpenCLContext()); VLOG(3) << " --- Bias image: " << bias_image << " --- "; CLImage out_image; - out_image.InitEmptyImage(helper.OpenCLContext(), out_dim); + out_image.InitEmptyImage(helper->OpenCLContext(), out_dim); cl_int status; status = kernel.setArg(0, *in_image.cl_image()); CL_CHECK_ERRORS(status); @@ -72,16 +75,23 @@ void elementwise_add(CLContext* context, const float* in, const DDim& in_dim, CL_CHECK_ERRORS(status); status = kernel.setArg(2, *out_image.cl_image()); CL_CHECK_ERRORS(status); + + if (bias_dim.size() == 1) { + int tensor_w = in_dim[3]; + status = kernel.setArg(3, tensor_w); + CL_CHECK_ERRORS(status); + } size_t width = in_image.ImageWidth(); size_t height = in_image.ImageHeight(); auto global_work_size = cl::NDRange{width, height}; - status = helper.OpenCLCommandQueue().enqueueNDRangeKernel( + status = helper->OpenCLCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); CL_CHECK_ERRORS(status); + status = helper->OpenCLCommandQueue().finish(); + CL_CHECK_ERRORS(status); VLOG(3) << " --- Out image: " << out_image << " --- "; - - CopyImageData(out_image, out); + CopyImageData(helper, out_image, out); } } // namespace lite diff --git a/paddle/fluid/lite/opencl/cl_caller.h b/paddle/fluid/lite/opencl/cl_caller.h index 221c52f653a..a55724b5cef 100644 --- a/paddle/fluid/lite/opencl/cl_caller.h +++ b/paddle/fluid/lite/opencl/cl_caller.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/lite/core/compatible_tensor.h" -#include "paddle/fluid/lite/opencl/cl_context.h" +#include "paddle/fluid/lite/opencl/cl_helper.h" namespace paddle { namespace lite { @@ -27,7 +27,7 @@ bool InitOpenCLEngine(std::string cl_path); /// black box so that the framework can remain simple. /// NOTE Currently, these methods are quite expensive, we will optimize them /// latter. -void elementwise_add(CLContext* context, const float* in, const DDim& in_dim, +void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim, const float* bias, const DDim& bias_dim, float* out, const DDim& out_dim); diff --git a/paddle/fluid/lite/opencl/cl_engine.cc b/paddle/fluid/lite/opencl/cl_engine.cc index be82ba23cbb..bcf39992c2a 100644 --- a/paddle/fluid/lite/opencl/cl_engine.cc +++ b/paddle/fluid/lite/opencl/cl_engine.cc @@ -156,8 +156,7 @@ bool CLEngine::InitializeDevice() { if (ext_data.find("cl_khr_fp16") != std::string::npos) { LOG(INFO) << "The chosen device supports the half data type."; } else { - LOG(ERROR) << "The chosen device doesn't support the half data type!"; - return false; + LOG(INFO) << "The chosen device doesn't support the half data type!"; } auto max_units = device_->getInfo(); LOG(INFO) << "The chosen device has " << max_units << " compute units."; diff --git a/paddle/fluid/lite/opencl/cl_half.cc b/paddle/fluid/lite/opencl/cl_half.cc deleted file mode 100644 index bbed7c0b8b9..00000000000 --- a/paddle/fluid/lite/opencl/cl_half.cc +++ /dev/null @@ -1,518 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf - -#include "paddle/fluid/lite/opencl/cl_half.h" - -namespace paddle { -namespace lite { - -static const uint32_t mantissatable[2048] = { - 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, - 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, - 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, - 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, - 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, - 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, - 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, - 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, - 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, - 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, - 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, - 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, - 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, - 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, - 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, - 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, - 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, - 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, - 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, - 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, - 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, - 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, - 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, - 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, - 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, - 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, - 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, - 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, - 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, - 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, - 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, - 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, - 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, - 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, - 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, - 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, - 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, - 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, - 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, - 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, - 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, - 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, - 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, - 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, - 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, - 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, - 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, - 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, - 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, - 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, - 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, - 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, - 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, - 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, - 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, - 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, - 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, - 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, - 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, - 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, - 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, - 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, - 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, - 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, - 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, - 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, - 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, - 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, - 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, - 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, - 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, - 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, - 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000, - 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, - 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, - 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, - 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, - 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, - 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, - 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, - 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, - 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, - 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, - 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, - 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, - 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, - 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, - 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, - 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, - 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, - 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, - 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, - 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, - 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, - 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, - 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, - 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, - 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, - 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, - 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, - 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, - 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, - 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, - 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, - 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, - 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, - 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, - 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, - 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, - 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, - 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, - 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, - 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, - 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, - 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, - 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, - 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, - 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, - 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, - 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, - 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, - 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, - 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, - 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, - 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, - 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, - 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, - 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, - 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, - 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, - 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, - 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, - 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, - 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, - 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, - 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, - 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, - 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, - 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, - 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, - 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, - 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, - 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, - 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, - 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, - 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, - 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, - 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, - 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, - 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, - 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, - 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, - 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, - 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, - 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, - 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, - 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, - 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, - 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, - 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, - 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, - 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, - 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, - 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, - 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, - 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, - 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, - 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, - 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, - 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, - 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, - 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, - 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, - 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, - 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, - 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, - 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, - 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, - 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, - 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, - 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, - 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, - 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, - 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, - 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, - 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, - 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, - 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, - 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, - 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, - 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, - 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, - 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, - 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, - 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, - 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, - 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, - 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, - 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, - 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, - 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, - 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, - 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, - 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, - 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, - 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, - 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, - 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, - 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, - 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, - 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, - 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, - 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, - 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, - 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, - 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, - 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, - 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, - 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, - 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, - 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, - 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, - 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, - 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, - 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, - 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, - 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, - 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, - 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, - 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, - 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, - 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, - 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, - 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, - 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, - 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, - 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, - 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, - 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, - 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, - 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, - 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, - 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, - 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, - 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, - 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, - 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, - 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, - 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, - 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, - 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, - 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, - 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, - 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, - 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, - 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, - 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, - 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, - 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, - 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, - 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, - 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, - 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, - 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, - 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, - 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, - 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, - 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, - 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, - 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, - 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, - 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, - 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, - 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, - 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, - 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, - 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, - 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, - 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, - 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, - 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, - 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, - 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, - 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, - 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, - 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, - 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, - 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, - 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, - 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, - 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, - 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, - 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, - 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, - 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, - 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, - 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, - 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, - 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, - 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, - 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, - 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, - 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, - 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, - 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, - 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, - 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, - 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, - 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, - 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, - 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, - 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, - 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, - 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, - 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, - 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, - 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, - 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, - 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, - 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, - 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, - 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, - 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, - 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, - 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, - 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, - 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, - 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, - 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, - 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, - 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, - 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, - 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, - 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, - 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, - 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, - 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, - 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, - 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, - 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, - 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, - 0x387fc000, 0x387fe000}; - -static const uint16_t offsettable[64] = { - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; - -static const uint32_t exponenttable[64] = { - 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, - 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, - 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, - 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, - 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, - 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, - 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, - 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, - 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, - 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, - 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; - -static const uint16_t basetable[512] = { - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, - 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, - 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, - 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, - 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, - 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, - 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, - 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, - 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, - 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; - -static const uint8_t shifttable[512] = { - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, - 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, - 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; - -half_t Float2Half(float f) { - uint32_t v = *reinterpret_cast(&f); - return basetable[(v >> 23) & 0x1ff] + - ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); -} - -float Half2Float(half_t h) { - uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + - exponenttable[h >> 10]; - return *reinterpret_cast(&v); -} - -void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { - for (int i = 0; i < count; ++i) { - h_array[i] = Float2Half(f_array[i]); - } -} - -void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { - for (int i = 0; i < count; ++i) { - f_array[i] = Half2Float(h_array[i]); - } -} - -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_half.h b/paddle/fluid/lite/opencl/cl_half.h deleted file mode 100644 index 0dcf325db2b..00000000000 --- a/paddle/fluid/lite/opencl/cl_half.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -namespace paddle { -namespace lite { - -typedef uint16_t half_t; - -half_t Float2Half(float f); - -float Half2Float(half_t h); - -void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); - -void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); - -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_image.cc b/paddle/fluid/lite/opencl/cl_image.cc index 04dfc07b912..40da48ba48a 100644 --- a/paddle/fluid/lite/opencl/cl_image.cc +++ b/paddle/fluid/lite/opencl/cl_image.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/lite/opencl/cl_engine.h" -#include "paddle/fluid/lite/opencl/cl_half.h" #include "paddle/fluid/lite/opencl/cl_tool.h" namespace paddle { @@ -26,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { int width = cl_image.image_dims_[0]; int height = cl_image.image_dims_[1]; - half_t* image_data = new half_t[height * width * 4]; + float* image_data = new float[height * width * 4]; cl::Image* image = cl_image.cl_image(); const std::array origin{0, 0, 0}; const std::array region{static_cast(width), @@ -131,9 +130,9 @@ void CLImage::InitCLImage(const cl::Context& context, image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - half_t* image_data = new half_t[image_dims_.product() * 4]; + float* image_data = new float[image_dims_.product() * 4]; #else - half_t* image_data = new half_t[image_dims_.production() * 4]; + float* image_data = new float[image_dims_.production() * 4]; #endif VLOG(3) << " convert to image "; @@ -151,7 +150,7 @@ void CLImage::InitCLImage(const cl::Context& context, void CLImage::InitCLImage(const cl::Context& context, int width, int height, void* data) { - cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT); + cl::ImageFormat img_format(CL_RGBA, CL_FLOAT); cl_int err; cl_image_.reset(new cl::Image2D( context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0), diff --git a/paddle/fluid/lite/opencl/cl_image_converter.cc b/paddle/fluid/lite/opencl/cl_image_converter.cc index 4408625e8fe..03b3edd7a1c 100644 --- a/paddle/fluid/lite/opencl/cl_image_converter.cc +++ b/paddle/fluid/lite/opencl/cl_image_converter.cc @@ -36,7 +36,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { static_cast(height)})); } -void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, +void CLImageConverterDefault::NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) { size_t new_dims[] = {1, 1, 1, 1}; for (size_t j = 0; j < tensor_dim.size(); ++j) { @@ -68,7 +68,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, if (c < C) { // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // (c % 4); - image[i2] = Float2Half(*p); + image[i2] = *p; i2 += 4; p++; } else { @@ -83,7 +83,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, } } -void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, +void CLImageConverterDefault::ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { size_t new_dims[] = {1, 1, 1, 1}; @@ -107,7 +107,7 @@ void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, for (size_t h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (size_t w = 0; w < W; w++) { - *p = Half2Float(image[i2]); + *p = image[i2]; i2 += 4; p++; } @@ -161,7 +161,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { } } -void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, +void CLImageConverterFolder::NCHWToImage(float *tensor, float *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) << " Tensor dim is not support!"; @@ -184,14 +184,13 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, for (size_t h = 0; h < tdim[0]; h++) { for (size_t w = 0; w < tdim[1]; w++) { - image[(h * width + w / 4) * 4 + (w % 4)] = - Float2Half(tensor[h * tdim[1] + w]); + image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; } } } } -void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, +void CLImageConverterFolder::ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { if (tensor_dim.size() > 2) { @@ -213,7 +212,7 @@ void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, for (size_t h = 0; h < H; h++) { for (size_t w = 0; w < W; w++) { - p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); + p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; } } } @@ -233,7 +232,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { static_cast(height)})); } -void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, +void CLImageConverterNWBlock::NCHWToImage(float *tensor, float *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; auto image_dim = InitImageDimInfoWith(tensor_dim); @@ -253,7 +252,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + w * 4 + n % 4; if (n < N) { - image[index] = Float2Half(*p); + image[index] = *p; p++; } else { image[index] = 0.0; @@ -268,7 +267,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, VLOG(3) << " init done"; } -void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, +void CLImageConverterNWBlock::ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; @@ -286,7 +285,7 @@ void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, for (size_t w = 0; w < W; ++w) { size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + w * 4 + n % 4; - *p = Half2Float(image[index]); + *p = image[index]; p++; if (index >= (width * height * 4)) { LOG(INFO) << " index out of range "; @@ -312,7 +311,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { static_cast(height)})); } -void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, +void CLImageConverterDWBlock::NCHWToImage(float *tensor, float *image, const DDim &tensor_dim) { size_t new_dims[] = {1, 1, 1, 1}; for (size_t j = 0; j < tensor_dim.size(); ++j) { @@ -344,7 +343,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, if (c < C) { // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // (c % 4); - image[i2] = Float2Half(*p); + image[i2] = *p; i2 += 4; p++; } else { @@ -359,7 +358,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, } } -void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, +void CLImageConverterDWBlock::ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; @@ -377,7 +376,7 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, for (size_t h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (size_t w = 0; w < W; w++) { - *p = Half2Float(image[i2]); + *p = image[i2]; i2 += 4; p++; } @@ -410,7 +409,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { static_cast(height)})); } -void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image, +void CLImageConverterNormal::NCHWToImage(float *tensor, float *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) << " Tensor dim is not support!"; @@ -419,7 +418,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image, default_converter.NCHWToImage(tensor, image, tensor_dim); } -void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, +void CLImageConverterNormal::ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { CLImageConverterDefault default_converter; @@ -439,10 +438,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( static_cast(height)})); } -void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image, +void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, float *image, const DDim &tensor_dim) {} -void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor, +void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) {} diff --git a/paddle/fluid/lite/opencl/cl_image_converter.h b/paddle/fluid/lite/opencl/cl_image_converter.h index 9dceca45030..874f292e0f4 100644 --- a/paddle/fluid/lite/opencl/cl_image_converter.h +++ b/paddle/fluid/lite/opencl/cl_image_converter.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/lite/core/compatible_tensor.h" -#include "paddle/fluid/lite/opencl/cl_half.h" namespace paddle { namespace lite { @@ -24,10 +23,10 @@ class CLImageConverterBase { public: virtual ~CLImageConverterBase() {} - virtual void NCHWToImage(float *nchw, half_t *image, + virtual void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) = 0; - virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, + virtual void ImageToNCHW(float *image, float *nchw, const DDim &image_dim, const DDim &tensor_dim) = 0; virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0; }; @@ -35,16 +34,16 @@ class CLImageConverterBase { class CLImageConverterDefault : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim); + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim); }; class CLImageConverterFolder : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim); + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim); /* @@ -68,8 +67,8 @@ class CLImageConverterFolder : public CLImageConverterBase { class CLImageConverterNormal : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim); + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim); /* @@ -92,22 +91,22 @@ class CLImageConverterNormal : public CLImageConverterBase { class CLImageConverterNWBlock : public CLImageConverterBase { DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim); + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim); }; class CLImageConverterDWBlock : public CLImageConverterBase { DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim); + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim); }; class CLImageConverterWinoTransWeight : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim); + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim); }; diff --git a/paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl b/paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl new file mode 100644 index 00000000000..c1063778300 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +__kernel void channel_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage, __private const int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x/w; + coords_bias.y = 0; + float4 in = read_imagef(input, sampler, coords); + float4 biase = read_imagef(bias, sampler, coords_bias); + float4 output = in + biase; + write_imagef(outputImage, coords, output); + } diff --git a/paddle/fluid/lite/opencl/cl_kernel/cl_common.h b/paddle/fluid/lite/opencl/cl_kernel/cl_common.h index 31ca6d7f65c..dedd05a13c5 100644 --- a/paddle/fluid/lite/opencl/cl_kernel/cl_common.h +++ b/paddle/fluid/lite/opencl/cl_kernel/cl_common.h @@ -14,21 +14,19 @@ limitations under the License. */ #pragma once -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -inline half4 activation(half4 in +inline float4 activation(float4 in #ifdef PRELU - , - half4 prelu_alpha + , + float4 prelu_alpha #endif - ) { - half4 output; + ) { + float4 output; #ifdef PRELU - output = select(prelu_alpha * in, in, in >= (half4)0.0); + output = select(prelu_alpha * in, in, in >= (float4)0.0); #endif #ifdef RELU - output = fmax(in, (half4)(0.0f)); + output = fmax(in, (float4)(0.0f)); #endif return output; } diff --git a/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl b/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl index f3047648689..ecf719ae931 100644 --- a/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl +++ b/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl @@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) { +__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) { int x = get_global_id(0); int y = get_global_id(1); const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; int2 coords; coords.x = x; coords.y = y; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords); - half4 output = in + biase; - write_imageh(outputImage,coords,output); + float4 in = read_imagef(input, sampler, coords); + float4 biase = read_imagef(bias, sampler, coords); + float4 output = in + biase; + write_imagef(outputImage,coords,output); } diff --git a/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl b/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl index a6a4da690fa..0ca3b9141da 100644 --- a/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl +++ b/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma OPENCL EXTENSION cl_khr_fp16 : enable #define MIN_VALUE -FLT_MAX __kernel void pool_max( @@ -41,16 +40,16 @@ __kernel void pool_max( const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; - half4 max_value = (half4)(MIN_VALUE); + float4 max_value = (float4)(MIN_VALUE); for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); max_value = max(max_value, tmp); } } const int pos_out_x = mad24(out_c, out_width, out_w); - write_imageh(output, (int2)(pos_out_x, out_nh), max_value); + write_imagef(output, (int2)(pos_out_x, out_nh), max_value); } __kernel void pool_avg( @@ -77,15 +76,15 @@ __kernel void pool_avg( const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; - half4 sum = (half4)(0.0f); + float4 sum = (float4)(0.0f); int num = 0; for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); num++; } } - half4 avg = sum / num; + float4 avg = sum / num; const int pos_out_x = mad24(out_c, out_width, out_w); - write_imageh(output, (int2)(pos_out_x, out_nh), avg); + write_imagef(output, (int2)(pos_out_x, out_nh), avg); } diff --git a/paddle/fluid/lite/opencl/cl_test.cc b/paddle/fluid/lite/opencl/cl_test.cc index 39392a8f194..57192b79d72 100644 --- a/paddle/fluid/lite/opencl/cl_test.cc +++ b/paddle/fluid/lite/opencl/cl_test.cc @@ -67,28 +67,28 @@ TEST(cl_test, kernel_test) { helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); auto kernel = helper->GetKernel(2); - std::unique_ptr in_data(new float[1024 * 512]); - for (int i = 0; i < 1024 * 512; i++) { + std::unique_ptr in_data(new float[4 * 3 * 256 * 512]); + for (int i = 0; i < 4 * 3 * 256 * 512; i++) { in_data[i] = 1.f; } - const DDim in_dim = DDim(std::vector{1024, 512}); + const DDim in_dim = DDim(std::vector{4, 3, 256, 512}); CLImage in_image; in_image.set_tensor_data(in_data.get(), in_dim); in_image.InitNormalCLImage(helper->OpenCLContext()); LOG(INFO) << in_image; - std::unique_ptr bias_data(new float[1024 * 512]); - for (int i = 0; i < 1024 * 512; i++) { + std::unique_ptr bias_data(new float[4 * 3 * 256 * 512]); + for (int i = 0; i < 4 * 3 * 256 * 512; i++) { bias_data[i] = 2.f; } - const DDim bias_dim = DDim(std::vector{1024, 512}); + const DDim bias_dim = DDim(std::vector{4, 3, 256, 512}); CLImage bias_image; bias_image.set_tensor_data(bias_data.get(), bias_dim); bias_image.InitNormalCLImage(helper->OpenCLContext()); LOG(INFO) << bias_image; CLImage out_image; - const DDim out_dim = DDim(std::vector{1024, 512}); + const DDim out_dim = DDim(std::vector{4, 3, 256, 512}); out_image.InitEmptyImage(helper->OpenCLContext(), out_dim); LOG(INFO) << out_image; @@ -108,7 +108,8 @@ TEST(cl_test, kernel_test) { status = helper->OpenCLCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); CL_CHECK_ERRORS(status); - + status = helper->OpenCLCommandQueue().finish(); + CL_CHECK_ERRORS(status); double start_nanos = event.getProfilingInfo(); double stop_nanos = event.getProfilingInfo(); double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; @@ -116,37 +117,99 @@ TEST(cl_test, kernel_test) { LOG(INFO) << out_image; } -TEST(cl_test, elementwise_add_test) { +TEST(cl_test, channel_add_test) { std::default_random_engine engine; std::uniform_real_distribution dist(-5, 5); - const DDim in_dim = DDim(std::vector{1024, 512}); - std::unique_ptr in_data(new float[1024 * 512]); - for (int i = 0; i < 1024 * 512; i++) { + const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { in_data[i] = dist(engine); } - const DDim bias_dim = DDim(std::vector{1024, 512}); - std::unique_ptr bias_data(new float[1024 * 512]); - for (int i = 0; i < 1024 * 512; i++) { + const DDim bias_dim = DDim(std::vector{16}); + std::unique_ptr bias_data(new float[16]); + for (int i = 0; i < 16; i++) { bias_data[i] = dist(engine); } - const DDim out_dim = DDim(std::vector{1024, 512}); - std::unique_ptr out(new float[1024 * 512]); + std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 16; j++) { + float b = bias_data[j]; + for (int k = 0; k < 256 * 512; k++) { + int index = (i * 16 + j) * 256 * 512 + k; + out_ref[index] = in_data[index] + b; + } + } + } + + const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr out(new float[4 * 16 * 256 * 512]); bool status = InitOpenCLEngine(FLAGS_cl_path); CHECK(status) << "Fail to initialize OpenCL engine."; - CLContext context; + std::unique_ptr context(new CLContext); + std::unique_ptr helper(new CLHelper(context.get())); + helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + helper->AddKernel("channel_add", "channel_add_kernel.cl"); + elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(), + bias_dim, out.get(), out_dim); + + int stride = 4 * 16 * 256 * 512 / 20; + for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { + std::cout << out[i] << " "; + } - elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim, - out.get(), out_dim); + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + EXPECT_NEAR(out[i], out_ref[i], 1e-6); + } - int stride = 1024 * 512 / 20; - for (int i = 0; i < 1024 * 512; i += stride) { + std::cout << std::endl; +} + +TEST(cl_test, elementwise_add_test) { + std::default_random_engine engine; + std::uniform_real_distribution dist(-5, 5); + + const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + in_data[i] = dist(engine); + } + + const DDim bias_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr bias_data(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + bias_data[i] = dist(engine); + } + + std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + out_ref[i] = in_data[i] + bias_data[i]; + } + + const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr out(new float[4 * 16 * 256 * 512]); + + bool status = InitOpenCLEngine(FLAGS_cl_path); + CHECK(status) << "Fail to initialize OpenCL engine."; + std::unique_ptr context(new CLContext); + std::unique_ptr helper(new CLHelper(context.get())); + helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + helper->AddKernel("channel_add", "channel_add_kernel.cl"); + elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(), + bias_dim, out.get(), out_dim); + + int stride = 4 * 16 * 256 * 512 / 20; + for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { std::cout << out[i] << " "; } + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + EXPECT_NEAR(out[i], out_ref[i], 1e-6); + } + std::cout << std::endl; } -- GitLab