Commit fc3a66ae authored by Chunwei

Merge branch 'fix_opencl_acc' into 'incubate/lite'

Fix elementwise_add accuracy bugs.

See merge request inference/paddlelite!58
...@@ -61,7 +61,7 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -61,7 +61,7 @@ void TestModel(const std::vector<Place>& valid_places,
3.13812525e-05, 6.52209565e-05, 4.78087313e-05, 3.13812525e-05, 6.52209565e-05, 4.78087313e-05,
2.58822285e-04}); 2.58822285e-04});
for (int i = 0; i < results.size(); ++i) { for (int i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5); EXPECT_NEAR(out->data<float>()[i], results[i], 1e-6);
} }
ASSERT_EQ(out->dims().size(), 2); ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1); ASSERT_EQ(out->dims()[0], 1);
......
...@@ -236,12 +236,15 @@ class Context<TargetType::kOpenCL> { ...@@ -236,12 +236,15 @@ class Context<TargetType::kOpenCL> {
void CopySharedTo(const OpenClContext* ctx) { void CopySharedTo(const OpenClContext* ctx) {
ctx->cl_context_ = cl_context_; ctx->cl_context_ = cl_context_;
ctx->cl_helper_ = cl_helper_;
} }
private: private:
void PrepareKernels() { void PrepareKernels() {
cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
cl_helper_->AddKernel("channel_add", "channel_add_kernel.cl");
cl_helper_->AddKernel("pool_max", "pool_kernel.cl"); cl_helper_->AddKernel("pool_max", "pool_kernel.cl");
cl_helper_->AddKernel("pool_avg", "pool_kernel.cl");
} }
}; };
#endif #endif
......
...@@ -31,10 +31,10 @@ class ElementwiseAddCompute ...@@ -31,10 +31,10 @@ class ElementwiseAddCompute
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenClContext>(); auto& context = ctx_->As<OpenClContext>();
CHECK(context.cl_context()); CHECK(context.cl_helper() != nullptr);
elementwise_add( elementwise_add(
context.cl_context(), static_cast<const float*>(param.X->raw_data()), context.cl_helper(), static_cast<const float*>(param.X->raw_data()),
param.X->dims(), static_cast<const float*>(param.Y->raw_data()), param.X->dims(), static_cast<const float*>(param.Y->raw_data()),
param.Y->dims(), param.Out->mutable_data<float>(), param.Out->dims()); param.Y->dims(), param.Out->mutable_data<float>(), param.Out->dims());
} }
......
...@@ -40,9 +40,9 @@ TEST(elementwise_add, init) { ...@@ -40,9 +40,9 @@ TEST(elementwise_add, init) {
kernel->SetParam(param); kernel->SetParam(param);
kernel->SetContext(std::move(context)); kernel->SetContext(std::move(context));
X.Resize({1, 10}); X.Resize({1, 1, 1, 10});
Y.Resize({1, 10}); Y.Resize({1, 1, 1, 10});
Out.Resize({1, 10}); Out.Resize({1, 1, 1, 10});
auto* x_data = X.mutable_data<float>(); auto* x_data = X.mutable_data<float>();
auto* y_data = Y.mutable_data<float>(); auto* y_data = Y.mutable_data<float>();
...@@ -56,7 +56,7 @@ TEST(elementwise_add, init) { ...@@ -56,7 +56,7 @@ TEST(elementwise_add, init) {
kernel->Launch(); kernel->Launch();
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
EXPECT_NEAR(out_data[i], 3.4 * i, 1e-1); EXPECT_NEAR(out_data[i], 3.4 * i, 1e-6);
} }
} }
......
...@@ -5,13 +5,11 @@ endif() ...@@ -5,13 +5,11 @@ endif()
cc_library(cl_wrapper SRCS cl_wrapper.cc) cc_library(cl_wrapper SRCS cl_wrapper.cc)
cc_library(cl_tool SRCS cl_tool.cc) cc_library(cl_tool SRCS cl_tool.cc)
target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers) target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
cc_library(cl_half SRCS cl_half.cc)
target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing)
cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool) cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
cc_library(cl_context SRCS cl_context.cc DEPS cl_engine) cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context) cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor) cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS lite_tensor)
cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine) cc_library(cl_image SRCS cl_image.cc DEPS lite_tensor cl_image_converter cl_engine)
cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image) cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper) lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper)
add_dependencies(cl_tool opencl_clhpp) add_dependencies(cl_tool opencl_clhpp)
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_caller.h" #include "paddle/fluid/lite/opencl/cl_caller.h"
#include <string> #include <string>
#include "paddle/fluid/lite/core/compatible_tensor.h" #include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h" #include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_helper.h" #include "paddle/fluid/lite/opencl/cl_helper.h"
#include "paddle/fluid/lite/opencl/cl_image.h" #include "paddle/fluid/lite/opencl/cl_image.h"
...@@ -23,16 +22,17 @@ limitations under the License. */ ...@@ -23,16 +22,17 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace lite { namespace lite {
static void CopyImageData(const CLImage& cl_image, float* out) { static void CopyImageData(CLHelper* helper, const CLImage& cl_image,
float* out) {
int width = cl_image.image_dims()[0]; int width = cl_image.image_dims()[0];
int height = cl_image.image_dims()[1]; int height = cl_image.image_dims()[1];
half_t* image_data = new half_t[height * width * 4]; float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0}; const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width), const std::array<size_t, 3> region{static_cast<size_t>(width),
static_cast<size_t>(height), 1}; static_cast<size_t>(height), 1};
cl_int err = CLEngine::Global()->command_queue().enqueueReadImage( cl_int err = helper->OpenCLCommandQueue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr); *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
CL_CHECK_ERRORS(err); CL_CHECK_ERRORS(err);
...@@ -49,22 +49,25 @@ bool InitOpenCLEngine(std::string cl_path) { ...@@ -49,22 +49,25 @@ bool InitOpenCLEngine(std::string cl_path) {
return engine->IsInitSuccess(); return engine->IsInitSuccess();
} }
void elementwise_add(CLContext* context, const float* in, const DDim& in_dim, void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
const float* bias, const DDim& bias_dim, float* out, const float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim) { const DDim& out_dim) {
CLHelper helper(context); if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); LOG(FATAL) << "Error: bias dims is error";
auto kernel = helper.GetKernel(0); return;
}
auto kernel = bias_dim.size() == 1 ? helper->GetKernel("channel_add")
: helper->GetKernel("elementwise_add");
CLImage in_image; CLImage in_image;
in_image.set_tensor_data(in, in_dim); in_image.set_tensor_data(in, in_dim);
in_image.InitNormalCLImage(helper.OpenCLContext()); in_image.InitNormalCLImage(helper->OpenCLContext());
VLOG(3) << " --- Inpu image: " << in_image << " --- "; VLOG(3) << " --- Inpu image: " << in_image << " --- ";
CLImage bias_image; CLImage bias_image;
bias_image.set_tensor_data(bias, bias_dim); bias_image.set_tensor_data(bias, bias_dim);
bias_image.InitNormalCLImage(helper.OpenCLContext()); bias_image.InitCLImage(helper->OpenCLContext());
VLOG(3) << " --- Bias image: " << bias_image << " --- "; VLOG(3) << " --- Bias image: " << bias_image << " --- ";
CLImage out_image; CLImage out_image;
out_image.InitEmptyImage(helper.OpenCLContext(), out_dim); out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
cl_int status; cl_int status;
status = kernel.setArg(0, *in_image.cl_image()); status = kernel.setArg(0, *in_image.cl_image());
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
...@@ -72,16 +75,23 @@ void elementwise_add(CLContext* context, const float* in, const DDim& in_dim, ...@@ -72,16 +75,23 @@ void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = kernel.setArg(2, *out_image.cl_image()); status = kernel.setArg(2, *out_image.cl_image());
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
if (bias_dim.size() == 1) {
int tensor_w = in_dim[3];
status = kernel.setArg(3, tensor_w);
CL_CHECK_ERRORS(status);
}
size_t width = in_image.ImageWidth(); size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight(); size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height}; auto global_work_size = cl::NDRange{width, height};
status = helper.OpenCLCommandQueue().enqueueNDRangeKernel( status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = helper->OpenCLCommandQueue().finish();
CL_CHECK_ERRORS(status);
VLOG(3) << " --- Out image: " << out_image << " --- "; VLOG(3) << " --- Out image: " << out_image << " --- ";
CopyImageData(helper, out_image, out);
CopyImageData(out_image, out);
} }
} // namespace lite } // namespace lite
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/lite/core/compatible_tensor.h" #include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h" #include "paddle/fluid/lite/opencl/cl_helper.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -27,7 +27,7 @@ bool InitOpenCLEngine(std::string cl_path); ...@@ -27,7 +27,7 @@ bool InitOpenCLEngine(std::string cl_path);
/// black box so that the framework can remain simple. /// black box so that the framework can remain simple.
/// NOTE Currently, these methods are quite expensive, we will optimize them /// NOTE Currently, these methods are quite expensive, we will optimize them
/// latter. /// latter.
void elementwise_add(CLContext* context, const float* in, const DDim& in_dim, void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
const float* bias, const DDim& bias_dim, float* out, const float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim); const DDim& out_dim);
......
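Note: the new entry point takes a CLHelper, so the caller decides which kernels are prebuilt, and the same call serves both bias shapes (a rank-4 bias runs elementwise_add, a rank-1 bias runs the new channel_add). A usage sketch based on the updated tests in this MR; the paddle::lite namespace, header paths, kernel names, and DDim construction are taken from this diff, everything else (function name, dims) is illustrative:

```cpp
// Sketch: drive the updated elementwise_add() helper with a rank-1 bias.
#include <glog/logging.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/lite/opencl/cl_caller.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"

void RunChannelAdd(const float* in, const float* bias, float* out,
                   const std::string& cl_path) {
  using paddle::lite::CLContext;
  using paddle::lite::CLHelper;
  using paddle::lite::DDim;

  CHECK(paddle::lite::InitOpenCLEngine(cl_path)) << "OpenCL init failed";
  std::unique_ptr<CLContext> context(new CLContext);
  std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
  // Register both kernels; elementwise_add() picks one based on the bias rank.
  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
  helper->AddKernel("channel_add", "channel_add_kernel.cl");

  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
  const DDim bias_dim = DDim(std::vector<DDim::value_type>{16});  // rank-1 bias
  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
  // bias_dim.size() == 1, so this dispatches to the channel_add kernel.
  paddle::lite::elementwise_add(helper.get(), in, in_dim, bias, bias_dim, out,
                                out_dim);
}
```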
...@@ -156,8 +156,7 @@ bool CLEngine::InitializeDevice() { ...@@ -156,8 +156,7 @@ bool CLEngine::InitializeDevice() {
if (ext_data.find("cl_khr_fp16") != std::string::npos) { if (ext_data.find("cl_khr_fp16") != std::string::npos) {
LOG(INFO) << "The chosen device supports the half data type."; LOG(INFO) << "The chosen device supports the half data type.";
} else { } else {
LOG(ERROR) << "The chosen device doesn't support the half data type!"; LOG(INFO) << "The chosen device doesn't support the half data type!";
return false;
} }
auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
LOG(INFO) << "The chosen device has " << max_units << " compute units."; LOG(INFO) << "The chosen device has " << max_units << " compute units.";
......
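Note: since every image in this MR is now CL_RGBA/CL_FLOAT, cl_khr_fp16 stops being a hard requirement and its absence is only logged. For reference, the extension can be probed the same way outside the engine with the plain C++ bindings; a standalone sketch assuming the cl2.hpp header pulled in via opencl_clhpp, not PaddleLite code:

```cpp
// Probe whether the chosen device advertises half-precision support.
#include <CL/cl2.hpp>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);
  std::vector<cl::Device> devices;
  platforms.at(0).getDevices(CL_DEVICE_TYPE_GPU, &devices);
  const std::string ext = devices.at(0).getInfo<CL_DEVICE_EXTENSIONS>();
  if (ext.find("cl_khr_fp16") != std::string::npos) {
    std::cout << "cl_khr_fp16 present: fp16 images are an option\n";
  } else {
    std::cout << "no cl_khr_fp16: stay on CL_FLOAT images, as this MR does\n";
  }
  return 0;
}
```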
This diff is collapsed.
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <array> #include <array>
#include "paddle/fluid/lite/opencl/cl_engine.h" #include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
#include "paddle/fluid/lite/opencl/cl_tool.h" #include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle { namespace paddle {
...@@ -26,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { ...@@ -26,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
int width = cl_image.image_dims_[0]; int width = cl_image.image_dims_[0];
int height = cl_image.image_dims_[1]; int height = cl_image.image_dims_[1];
half_t* image_data = new half_t[height * width * 4]; float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0}; const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width), const std::array<size_t, 3> region{static_cast<size_t>(width),
...@@ -131,9 +130,9 @@ void CLImage::InitCLImage(const cl::Context& context, ...@@ -131,9 +130,9 @@ void CLImage::InitCLImage(const cl::Context& context,
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
half_t* image_data = new half_t[image_dims_.product() * 4]; float* image_data = new float[image_dims_.product() * 4];
#else #else
half_t* image_data = new half_t[image_dims_.production() * 4]; float* image_data = new float[image_dims_.production() * 4];
#endif #endif
VLOG(3) << " convert to image "; VLOG(3) << " convert to image ";
...@@ -151,7 +150,7 @@ void CLImage::InitCLImage(const cl::Context& context, ...@@ -151,7 +150,7 @@ void CLImage::InitCLImage(const cl::Context& context,
void CLImage::InitCLImage(const cl::Context& context, int width, int height, void CLImage::InitCLImage(const cl::Context& context, int width, int height,
void* data) { void* data) {
cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT); cl::ImageFormat img_format(CL_RGBA, CL_FLOAT);
cl_int err; cl_int err;
cl_image_.reset(new cl::Image2D( cl_image_.reset(new cl::Image2D(
context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0), context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
......
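Note: with the format switched to CL_RGBA/CL_FLOAT, each pixel carries four floats, so host staging buffers are sized width * height * 4 floats and read back directly with no half conversion step. A minimal blocking read-back sketch mirroring CopyImageData above (standard C++ bindings; the queue and image are assumed to come from the existing wrappers):

```cpp
// Read an RGBA/FLOAT image back into a host float buffer (blocking read).
#include <CL/cl2.hpp>
#include <array>
#include <vector>

std::vector<float> ReadImageToHost(const cl::CommandQueue& queue,
                                   const cl::Image2D& image,
                                   size_t width, size_t height) {
  std::vector<float> host(width * height * 4);  // 4 floats per RGBA pixel
  const std::array<size_t, 3> origin{0, 0, 0};
  const std::array<size_t, 3> region{width, height, 1};
  // CL_TRUE -> blocking; row/slice pitch 0 lets the runtime compute them.
  cl_int err = queue.enqueueReadImage(image, CL_TRUE, origin, region,
                                      /*row_pitch=*/0, /*slice_pitch=*/0,
                                      host.data());
  if (err != CL_SUCCESS) host.clear();
  return host;
}
```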
...@@ -36,7 +36,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -36,7 +36,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)})); static_cast<DDim::value_type>(height)}));
} }
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, void CLImageConverterDefault::NCHWToImage(float *nchw, float *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) { for (size_t j = 0; j < tensor_dim.size(); ++j) {
...@@ -68,7 +68,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, ...@@ -68,7 +68,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
if (c < C) { if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4); // (c % 4);
image[i2] = Float2Half(*p); image[i2] = *p;
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
...@@ -83,7 +83,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, ...@@ -83,7 +83,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
} }
} }
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, void CLImageConverterDefault::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
...@@ -107,7 +107,7 @@ void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, ...@@ -107,7 +107,7 @@ void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
*p = Half2Float(image[i2]); *p = image[i2];
i2 += 4; i2 += 4;
p++; p++;
} }
...@@ -161,7 +161,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -161,7 +161,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
} }
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, void CLImageConverterFolder::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!"; << " Tensor dim is not support!";
...@@ -184,14 +184,13 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, ...@@ -184,14 +184,13 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
for (size_t h = 0; h < tdim[0]; h++) { for (size_t h = 0; h < tdim[0]; h++) {
for (size_t w = 0; w < tdim[1]; w++) { for (size_t w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] = image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w];
Float2Half(tensor[h * tdim[1] + w]);
} }
} }
} }
} }
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, void CLImageConverterFolder::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
if (tensor_dim.size() > 2) { if (tensor_dim.size() > 2) {
...@@ -213,7 +212,7 @@ void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, ...@@ -213,7 +212,7 @@ void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)];
} }
} }
} }
...@@ -233,7 +232,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -233,7 +232,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)})); static_cast<DDim::value_type>(height)}));
} }
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, void CLImageConverterNWBlock::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
auto image_dim = InitImageDimInfoWith(tensor_dim); auto image_dim = InitImageDimInfoWith(tensor_dim);
...@@ -253,7 +252,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, ...@@ -253,7 +252,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4; w * 4 + n % 4;
if (n < N) { if (n < N) {
image[index] = Float2Half(*p); image[index] = *p;
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = 0.0;
...@@ -268,7 +267,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, ...@@ -268,7 +267,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
VLOG(3) << " init done"; VLOG(3) << " init done";
} }
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, void CLImageConverterNWBlock::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
...@@ -286,7 +285,7 @@ void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, ...@@ -286,7 +285,7 @@ void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
for (size_t w = 0; w < W; ++w) { for (size_t w = 0; w < W; ++w) {
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4; w * 4 + n % 4;
*p = Half2Float(image[index]); *p = image[index];
p++; p++;
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
...@@ -312,7 +311,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -312,7 +311,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)})); static_cast<DDim::value_type>(height)}));
} }
void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, void CLImageConverterDWBlock::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) { for (size_t j = 0; j < tensor_dim.size(); ++j) {
...@@ -344,7 +343,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, ...@@ -344,7 +343,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
if (c < C) { if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4); // (c % 4);
image[i2] = Float2Half(*p); image[i2] = *p;
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
...@@ -359,7 +358,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, ...@@ -359,7 +358,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
} }
} }
void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, void CLImageConverterDWBlock::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
...@@ -377,7 +376,7 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, ...@@ -377,7 +376,7 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
*p = Half2Float(image[i2]); *p = image[i2];
i2 += 4; i2 += 4;
p++; p++;
} }
...@@ -410,7 +409,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -410,7 +409,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)})); static_cast<DDim::value_type>(height)}));
} }
void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image, void CLImageConverterNormal::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!"; << " Tensor dim is not support!";
...@@ -419,7 +418,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image, ...@@ -419,7 +418,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
default_converter.NCHWToImage(tensor, image, tensor_dim); default_converter.NCHWToImage(tensor, image, tensor_dim);
} }
void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, void CLImageConverterNormal::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CLImageConverterDefault default_converter; CLImageConverterDefault default_converter;
...@@ -439,10 +438,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( ...@@ -439,10 +438,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
static_cast<DDim::value_type>(height)})); static_cast<DDim::value_type>(height)}));
} }
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image, void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {} const DDim &tensor_dim) {}
void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor, void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) {} const DDim &tensor_dim) {}
......
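Note: the default converter packs four consecutive channels into one RGBA pixel, so element (n, c, h, w) lands in lane c % 4 of the pixel at column (c / 4) * W + w, row n * H + h; this is what the commented-out index formula above encodes, and it implies an image of W * ceil(C/4) pixels per row and N * H rows. A small self-contained sketch of that mapping, handy when checking kernel addressing by hand; it restates the assumed layout rather than calling the converter:

```cpp
// Layout helper for the default NCHW -> RGBA image packing.
// Assumes image width = W * ceil(C/4) pixels and height = N * H rows,
// matching the index formula commented out in NCHWToImage above.
#include <cassert>
#include <cstddef>

struct ImageCoord {
  size_t col;   // pixel column in the image
  size_t row;   // pixel row in the image
  size_t lane;  // RGBA lane 0..3 inside that pixel
};

inline ImageCoord NCHWToImageCoord(size_t n, size_t c, size_t h, size_t w,
                                   size_t C, size_t H, size_t W) {
  assert(c < C && h < H && w < W);
  ImageCoord coord;
  coord.col = (c / 4) * W + w;  // channel block selects the horizontal tile
  coord.row = n * H + h;        // batches are stacked vertically
  coord.lane = c % 4;           // four channels share one float4 pixel
  return coord;
}

// Example: for C=16, H=256, W=512, element (n=0, c=5, h=3, w=7)
// sits at column 1*512 + 7 = 519, row 3, lane 1.
```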
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/lite/core/compatible_tensor.h" #include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -24,10 +23,10 @@ class CLImageConverterBase { ...@@ -24,10 +23,10 @@ class CLImageConverterBase {
public: public:
virtual ~CLImageConverterBase() {} virtual ~CLImageConverterBase() {}
virtual void NCHWToImage(float *nchw, half_t *image, virtual void NCHWToImage(float *nchw, float *image,
const DDim &tensor_dim) = 0; const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, virtual void ImageToNCHW(float *image, float *nchw, const DDim &image_dim,
const DDim &tensor_dim) = 0; const DDim &tensor_dim) = 0;
virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0; virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
}; };
...@@ -35,16 +34,16 @@ class CLImageConverterBase { ...@@ -35,16 +34,16 @@ class CLImageConverterBase {
class CLImageConverterDefault : public CLImageConverterBase { class CLImageConverterDefault : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim); DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim); void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim); const DDim &tensor_dim);
}; };
class CLImageConverterFolder : public CLImageConverterBase { class CLImageConverterFolder : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim); DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim); const DDim &tensor_dim);
/* /*
...@@ -68,8 +67,8 @@ class CLImageConverterFolder : public CLImageConverterBase { ...@@ -68,8 +67,8 @@ class CLImageConverterFolder : public CLImageConverterBase {
class CLImageConverterNormal : public CLImageConverterBase { class CLImageConverterNormal : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim); DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim); const DDim &tensor_dim);
/* /*
...@@ -92,22 +91,22 @@ class CLImageConverterNormal : public CLImageConverterBase { ...@@ -92,22 +91,22 @@ class CLImageConverterNormal : public CLImageConverterBase {
class CLImageConverterNWBlock : public CLImageConverterBase { class CLImageConverterNWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim); DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim); const DDim &tensor_dim);
}; };
class CLImageConverterDWBlock : public CLImageConverterBase { class CLImageConverterDWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim); DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim); const DDim &tensor_dim);
}; };
class CLImageConverterWinoTransWeight : public CLImageConverterBase { class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim); DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim); const DDim &tensor_dim);
}; };
......
...@@ -12,21 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,21 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
(removed: the half/float conversion helpers declared in cl_half.h)
#pragma once
#include <cstdint>
namespace paddle {
namespace lite {
typedef uint16_t half_t;
half_t Float2Half(float f);
float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace lite
} // namespace paddle

(added: the new per-channel bias kernel, channel_add_kernel.cl)
__kernel void channel_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage, __private const int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x/w;
coords_bias.y = 0;
float4 in = read_imagef(input, sampler, coords);
float4 biase = read_imagef(bias, sampler, coords_bias);
float4 output = in + biase;
write_imagef(outputImage, coords, output);
}
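Note: the `w` argument passed from the host is the tensor width W of the input. Under the packing described earlier, input pixel column x covers channels 4*(x/W) .. 4*(x/W)+3, and assuming InitCLImage folds the rank-1 bias into a single image row of ceil(C/4) pixels with those same four channels per pixel (the folder-style packing shown in this diff), `coords_bias.x = x / w` lands on the bias pixel whose lanes line up with the input pixel, so the float4 add applies the right per-channel bias. A tiny hand-rolled check of that index arithmetic, not library code:

```cpp
// Check channel_add's bias indexing against the input image packing.
#include <cassert>
#include <cstddef>

int main() {
  const size_t W = 512;  // tensor width, the `w` argument of channel_add
  const size_t C = 16;   // channel count; 4 channels share one RGBA pixel
  for (size_t c = 0; c < C; ++c) {
    for (size_t w = 0; w < W; w += 127) {
      const size_t x = (c / 4) * W + w;  // input image column holding (c, w)
      const size_t bias_x = x / W;       // what the kernel computes
      // bias_x equals the channel block c / 4, so lane c % 4 of that bias
      // pixel holds bias[c], lane-aligned with the input pixel.
      assert(bias_x == c / 4);
    }
  }
  return 0;
}
```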
...@@ -14,21 +14,19 @@ limitations under the License. */ ...@@ -14,21 +14,19 @@ limitations under the License. */
#pragma once #pragma once
#pragma OPENCL EXTENSION cl_khr_fp16 : enable inline float4 activation(float4 in
inline half4 activation(half4 in
#ifdef PRELU #ifdef PRELU
, ,
half4 prelu_alpha float4 prelu_alpha
#endif #endif
) { ) {
half4 output; float4 output;
#ifdef PRELU #ifdef PRELU
output = select(prelu_alpha * in, in, in >= (half4)0.0); output = select(prelu_alpha * in, in, in >= (float4)0.0);
#endif #endif
#ifdef RELU #ifdef RELU
output = fmax(in, (half4)(0.0f)); output = fmax(in, (float4)(0.0f));
#endif #endif
return output; return output;
} }
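Note: the RELU/PRELU branches are selected with preprocessor macros, so the activation is chosen when the host builds the program, not at launch time. A hedged sketch with the standard C++ bindings; the exact build options PaddleLite's wrappers pass are not shown in this diff:

```cpp
// Build the same kernel source with different activation macros enabled.
#include <CL/cl2.hpp>
#include <string>

cl::Program BuildWithActivation(const cl::Context& context,
                                const std::string& source, bool use_relu) {
  cl::Program program(context, source);
  // -DRELU (or -DPRELU) switches on the matching branch in activation().
  const char* options = use_relu ? "-DRELU" : "";
  cl_int err = program.build(options);  // non-zero if the source fails to compile
  (void)err;
  return program;
}
```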
...@@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable __kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) {
__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords; int2 coords;
coords.x = x; coords.x = x;
coords.y = y; coords.y = y;
half4 in = read_imageh(input, sampler, coords); float4 in = read_imagef(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords); float4 biase = read_imagef(bias, sampler, coords);
half4 output = in + biase; float4 output = in + biase;
write_imageh(outputImage,coords,output); write_imagef(outputImage,coords,output);
} }
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX #define MIN_VALUE -FLT_MAX
__kernel void pool_max( __kernel void pool_max(
...@@ -41,16 +40,16 @@ __kernel void pool_max( ...@@ -41,16 +40,16 @@ __kernel void pool_max(
const int pos_in_x = out_c * in_width; const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height; const int pos_in_y = out_n * in_height;
half4 max_value = (half4)(MIN_VALUE); float4 max_value = (float4)(MIN_VALUE);
for (int y = start_h; y < end_h; ++y) { for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) { for (int x = start_w; x < end_w; ++x) {
half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
max_value = max(max_value, tmp); max_value = max(max_value, tmp);
} }
} }
const int pos_out_x = mad24(out_c, out_width, out_w); const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), max_value); write_imagef(output, (int2)(pos_out_x, out_nh), max_value);
} }
__kernel void pool_avg( __kernel void pool_avg(
...@@ -77,15 +76,15 @@ __kernel void pool_avg( ...@@ -77,15 +76,15 @@ __kernel void pool_avg(
const int pos_in_x = out_c * in_width; const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height; const int pos_in_y = out_n * in_height;
half4 sum = (half4)(0.0f); float4 sum = (float4)(0.0f);
int num = 0; int num = 0;
for (int y = start_h; y < end_h; ++y) { for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) { for (int x = start_w; x < end_w; ++x) {
sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
num++; num++;
} }
} }
half4 avg = sum / num; float4 avg = sum / num;
const int pos_out_x = mad24(out_c, out_width, out_w); const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), avg); write_imagef(output, (int2)(pos_out_x, out_nh), avg);
} }
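Note: for checking the float pooling kernels against the CPU, a compact NCHW reference can help. This is a review-time sketch, not PaddleLite's own reference: the window is clamped to the input and the average divides by the clamped count, which is what the kernel's `num` counter does; the padding convention must match the kernel's start/end computation, which is only partially visible in this diff:

```cpp
// CPU reference for max / average pooling over an NCHW float tensor.
#include <algorithm>
#include <cfloat>
#include <vector>

void PoolRef(const std::vector<float>& in, int N, int C, int H, int W,
             int ksize, int stride, int pad, bool is_max,
             std::vector<float>* out, int out_h, int out_w) {
  out->assign(static_cast<size_t>(N) * C * out_h * out_w, 0.f);
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int oh = 0; oh < out_h; ++oh)
        for (int ow = 0; ow < out_w; ++ow) {
          const int h0 = std::max(oh * stride - pad, 0);
          const int w0 = std::max(ow * stride - pad, 0);
          const int h1 = std::min(oh * stride - pad + ksize, H);
          const int w1 = std::min(ow * stride - pad + ksize, W);
          float acc = is_max ? -FLT_MAX : 0.f;  // matches MIN_VALUE above
          int cnt = 0;
          for (int h = h0; h < h1; ++h)
            for (int w = w0; w < w1; ++w) {
              const float v = in[((n * C + c) * H + h) * W + w];
              acc = is_max ? std::max(acc, v) : acc + v;
              ++cnt;
            }
          (*out)[((n * C + c) * out_h + oh) * out_w + ow] =
              is_max ? acc : (cnt ? acc / cnt : 0.f);
        }
}
```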
...@@ -67,28 +67,28 @@ TEST(cl_test, kernel_test) { ...@@ -67,28 +67,28 @@ TEST(cl_test, kernel_test) {
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
auto kernel = helper->GetKernel(2); auto kernel = helper->GetKernel(2);
std::unique_ptr<float[]> in_data(new float[1024 * 512]); std::unique_ptr<float[]> in_data(new float[4 * 3 * 256 * 512]);
for (int i = 0; i < 1024 * 512; i++) { for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
in_data[i] = 1.f; in_data[i] = 1.f;
} }
const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512}); const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
CLImage in_image; CLImage in_image;
in_image.set_tensor_data(in_data.get(), in_dim); in_image.set_tensor_data(in_data.get(), in_dim);
in_image.InitNormalCLImage(helper->OpenCLContext()); in_image.InitNormalCLImage(helper->OpenCLContext());
LOG(INFO) << in_image; LOG(INFO) << in_image;
std::unique_ptr<float[]> bias_data(new float[1024 * 512]); std::unique_ptr<float[]> bias_data(new float[4 * 3 * 256 * 512]);
for (int i = 0; i < 1024 * 512; i++) { for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
bias_data[i] = 2.f; bias_data[i] = 2.f;
} }
const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512}); const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
CLImage bias_image; CLImage bias_image;
bias_image.set_tensor_data(bias_data.get(), bias_dim); bias_image.set_tensor_data(bias_data.get(), bias_dim);
bias_image.InitNormalCLImage(helper->OpenCLContext()); bias_image.InitNormalCLImage(helper->OpenCLContext());
LOG(INFO) << bias_image; LOG(INFO) << bias_image;
CLImage out_image; CLImage out_image;
const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512}); const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
out_image.InitEmptyImage(helper->OpenCLContext(), out_dim); out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
LOG(INFO) << out_image; LOG(INFO) << out_image;
...@@ -108,7 +108,8 @@ TEST(cl_test, kernel_test) { ...@@ -108,7 +108,8 @@ TEST(cl_test, kernel_test) {
status = helper->OpenCLCommandQueue().enqueueNDRangeKernel( status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = helper->OpenCLCommandQueue().finish();
CL_CHECK_ERRORS(status);
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
...@@ -116,37 +117,99 @@ TEST(cl_test, kernel_test) { ...@@ -116,37 +117,99 @@ TEST(cl_test, kernel_test) {
LOG(INFO) << out_image; LOG(INFO) << out_image;
} }
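Note: the getProfilingInfo calls above only return valid timestamps when the command queue was created with CL_QUEUE_PROFILING_ENABLE; otherwise the query reports CL_PROFILING_INFO_NOT_AVAILABLE. A standalone timing sketch with the standard bindings; PaddleLite's own queue setup lives in CLContext and is not shown in this diff:

```cpp
// Time one kernel launch with OpenCL events on a profiling-enabled queue.
#include <CL/cl2.hpp>

double ProfileKernelMicros(const cl::Context& context, const cl::Device& device,
                           cl::Kernel& kernel, const cl::NDRange& global) {
  cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, cl::NullRange,
                             nullptr, &event);
  event.wait();  // same effect as finishing the queue before reading the times
  const cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
  const cl_ulong stop = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  return (stop - start) / 1000.0;  // nanoseconds -> microseconds
}
```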
TEST(cl_test, elementwise_add_test) { TEST(cl_test, channel_add_test) {
std::default_random_engine engine; std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5); std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512}); const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> in_data(new float[1024 * 512]); std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 1024 * 512; i++) { for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
in_data[i] = dist(engine); in_data[i] = dist(engine);
} }
const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512}); const DDim bias_dim = DDim(std::vector<DDim::value_type>{16});
std::unique_ptr<float[]> bias_data(new float[1024 * 512]); std::unique_ptr<float[]> bias_data(new float[16]);
for (int i = 0; i < 1024 * 512; i++) { for (int i = 0; i < 16; i++) {
bias_data[i] = dist(engine); bias_data[i] = dist(engine);
} }
const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512}); std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
std::unique_ptr<float[]> out(new float[1024 * 512]); for (int i = 0; i < 4; i++) {
for (int j = 0; j < 16; j++) {
float b = bias_data[j];
for (int k = 0; k < 256 * 512; k++) {
int index = (i * 16 + j) * 256 * 512 + k;
out_ref[index] = in_data[index] + b;
}
}
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
bool status = InitOpenCLEngine(FLAGS_cl_path); bool status = InitOpenCLEngine(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL engine."; CHECK(status) << "Fail to initialize OpenCL engine.";
CLContext context; std::unique_ptr<CLContext> context(new CLContext);
std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
helper->AddKernel("channel_add", "channel_add_kernel.cl");
elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(),
bias_dim, out.get(), out_dim);
int stride = 4 * 16 * 256 * 512 / 20;
for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
std::cout << out[i] << " ";
}
elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim, for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
out.get(), out_dim); EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
int stride = 1024 * 512 / 20; std::cout << std::endl;
for (int i = 0; i < 1024 * 512; i += stride) { }
TEST(cl_test, elementwise_add_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
in_data[i] = dist(engine);
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> bias_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
bias_data[i] = dist(engine);
}
std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
out_ref[i] = in_data[i] + bias_data[i];
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
bool status = InitOpenCLEngine(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL engine.";
std::unique_ptr<CLContext> context(new CLContext);
std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
helper->AddKernel("channel_add", "channel_add_kernel.cl");
elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(),
bias_dim, out.get(), out_dim);
int stride = 4 * 16 * 256 * 512 / 20;
for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
std::cout << out[i] << " "; std::cout << out[i] << " ";
} }
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
std::cout << std::endl; std::cout << std::endl;
} }
......