Commit 19bea13c authored by ZhenWang

fix elementwise_add acc bugs.

Parent commit: 1fbd77d4
......@@ -61,7 +61,7 @@ void TestModel(const std::vector<Place>& valid_places,
3.13812525e-05, 6.52209565e-05, 4.78087313e-05,
2.58822285e-04});
for (int i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-6);
}
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
......
......@@ -236,12 +236,15 @@ class Context<TargetType::kOpenCL> {
void CopySharedTo(const OpenClContext* ctx) {
ctx->cl_context_ = cl_context_;
ctx->cl_helper_ = cl_helper_;
}
private:
void PrepareKernels() {
cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
cl_helper_->AddKernel("channel_add", "channel_add_kernel.cl");
cl_helper_->AddKernel("pool_max", "pool_kernel.cl");
cl_helper_->AddKernel("pool_avg", "pool_kernel.cl");
}
};
#endif
......
......@@ -31,10 +31,10 @@ class ElementwiseAddCompute
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenClContext>();
CHECK(context.cl_context());
CHECK(context.cl_helper() != nullptr);
elementwise_add(
context.cl_context(), static_cast<const float*>(param.X->raw_data()),
context.cl_helper(), static_cast<const float*>(param.X->raw_data()),
param.X->dims(), static_cast<const float*>(param.Y->raw_data()),
param.Y->dims(), param.Out->mutable_data<float>(), param.Out->dims());
}
......
......@@ -40,9 +40,9 @@ TEST(elementwise_add, init) {
kernel->SetParam(param);
kernel->SetContext(std::move(context));
X.Resize({1, 10});
Y.Resize({1, 10});
Out.Resize({1, 10});
X.Resize({1, 1, 1, 10});
Y.Resize({1, 1, 1, 10});
Out.Resize({1, 1, 1, 10});
auto* x_data = X.mutable_data<float>();
auto* y_data = Y.mutable_data<float>();
......@@ -56,7 +56,7 @@ TEST(elementwise_add, init) {
kernel->Launch();
for (int i = 0; i < 10; i++) {
EXPECT_NEAR(out_data[i], 3.4 * i, 1e-1);
EXPECT_NEAR(out_data[i], 3.4 * i, 1e-6);
}
}
......
......@@ -5,13 +5,11 @@ endif()
cc_library(cl_wrapper SRCS cl_wrapper.cc)
cc_library(cl_tool SRCS cl_tool.cc)
target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
cc_library(cl_half SRCS cl_half.cc)
target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing)
cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS lite_tensor)
cc_library(cl_image SRCS cl_image.cc DEPS lite_tensor cl_image_converter cl_engine)
cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper)
add_dependencies(cl_tool opencl_clhpp)
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_caller.h"
#include <string>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"
#include "paddle/fluid/lite/opencl/cl_image.h"
......@@ -23,16 +22,17 @@ limitations under the License. */
namespace paddle {
namespace lite {
static void CopyImageData(const CLImage& cl_image, float* out) {
static void CopyImageData(CLHelper* helper, const CLImage& cl_image,
float* out) {
int width = cl_image.image_dims()[0];
int height = cl_image.image_dims()[1];
half_t* image_data = new half_t[height * width * 4];
float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width),
static_cast<size_t>(height), 1};
cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
cl_int err = helper->OpenCLCommandQueue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
CL_CHECK_ERRORS(err);
......@@ -49,22 +49,25 @@ bool InitOpenCLEngine(std::string cl_path) {
return engine->IsInitSuccess();
}
void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
const float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim) {
CLHelper helper(context);
helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
auto kernel = helper.GetKernel(0);
if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
LOG(FATAL) << "Error: bias dims is error";
return;
}
auto kernel = bias_dim.size() == 1 ? helper->GetKernel("channel_add")
: helper->GetKernel("elementwise_add");
CLImage in_image;
in_image.set_tensor_data(in, in_dim);
in_image.InitNormalCLImage(helper.OpenCLContext());
in_image.InitNormalCLImage(helper->OpenCLContext());
VLOG(3) << " --- Inpu image: " << in_image << " --- ";
CLImage bias_image;
bias_image.set_tensor_data(bias, bias_dim);
bias_image.InitNormalCLImage(helper.OpenCLContext());
bias_image.InitCLImage(helper->OpenCLContext());
VLOG(3) << " --- Bias image: " << bias_image << " --- ";
CLImage out_image;
out_image.InitEmptyImage(helper.OpenCLContext(), out_dim);
out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
cl_int status;
status = kernel.setArg(0, *in_image.cl_image());
CL_CHECK_ERRORS(status);
......@@ -72,16 +75,23 @@ void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
CL_CHECK_ERRORS(status);
status = kernel.setArg(2, *out_image.cl_image());
CL_CHECK_ERRORS(status);
if (bias_dim.size() == 1) {
int tensor_w = in_dim[3];
status = kernel.setArg(3, tensor_w);
CL_CHECK_ERRORS(status);
}
size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height};
status = helper.OpenCLCommandQueue().enqueueNDRangeKernel(
status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_ERRORS(status);
status = helper->OpenCLCommandQueue().finish();
CL_CHECK_ERRORS(status);
VLOG(3) << " --- Out image: " << out_image << " --- ";
CopyImageData(out_image, out);
CopyImageData(helper, out_image, out);
}
} // namespace lite
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"
namespace paddle {
namespace lite {
......@@ -27,7 +27,7 @@ bool InitOpenCLEngine(std::string cl_path);
/// black box so that the framework can remain simple.
/// NOTE Currently, these methods are quite expensive, we will optimize them
/// latter.
void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
const float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim);
......
......@@ -156,8 +156,7 @@ bool CLEngine::InitializeDevice() {
if (ext_data.find("cl_khr_fp16") != std::string::npos) {
LOG(INFO) << "The chosen device supports the half data type.";
} else {
LOG(ERROR) << "The chosen device doesn't support the half data type!";
return false;
LOG(INFO) << "The chosen device doesn't support the half data type!";
}
auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
LOG(INFO) << "The chosen device has " << max_units << " compute units.";
......
此差异已折叠。
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <glog/logging.h>
#include <array>
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
......@@ -26,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
int width = cl_image.image_dims_[0];
int height = cl_image.image_dims_[1];
half_t* image_data = new half_t[height * width * 4];
float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width),
......@@ -131,9 +130,9 @@ void CLImage::InitCLImage(const cl::Context& context,
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
half_t* image_data = new half_t[image_dims_.product() * 4];
float* image_data = new float[image_dims_.product() * 4];
#else
half_t* image_data = new half_t[image_dims_.production() * 4];
float* image_data = new float[image_dims_.production() * 4];
#endif
VLOG(3) << " convert to image ";
......@@ -151,7 +150,7 @@ void CLImage::InitCLImage(const cl::Context& context,
void CLImage::InitCLImage(const cl::Context& context, int width, int height,
void* data) {
cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT);
cl::ImageFormat img_format(CL_RGBA, CL_FLOAT);
cl_int err;
cl_image_.reset(new cl::Image2D(
context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
......
......@@ -36,7 +36,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
void CLImageConverterDefault::NCHWToImage(float *nchw, float *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
......@@ -68,7 +68,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
image[i2] = *p;
i2 += 4;
p++;
} else {
......@@ -83,7 +83,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
}
}
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterDefault::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
......@@ -107,7 +107,7 @@ void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
*p = image[i2];
i2 += 4;
p++;
}
......@@ -161,7 +161,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
}
}
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterFolder::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!";
......@@ -184,14 +184,13 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
for (size_t h = 0; h < tdim[0]; h++) {
for (size_t w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w];
}
}
}
}
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterFolder::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
if (tensor_dim.size() > 2) {
......@@ -213,7 +212,7 @@ void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
for (size_t h = 0; h < H; h++) {
for (size_t w = 0; w < W; w++) {
p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)];
}
}
}
......@@ -233,7 +232,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterNWBlock::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
auto image_dim = InitImageDimInfoWith(tensor_dim);
......@@ -253,7 +252,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
if (n < N) {
image[index] = Float2Half(*p);
image[index] = *p;
p++;
} else {
image[index] = 0.0;
......@@ -268,7 +267,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
VLOG(3) << " init done";
}
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterNWBlock::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
......@@ -286,7 +285,7 @@ void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
for (size_t w = 0; w < W; ++w) {
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
*p = Half2Float(image[index]);
*p = image[index];
p++;
if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range ";
......@@ -312,7 +311,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterDWBlock::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
......@@ -344,7 +343,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
image[i2] = *p;
i2 += 4;
p++;
} else {
......@@ -359,7 +358,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
}
}
void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterDWBlock::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
......@@ -377,7 +376,7 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
*p = image[i2];
i2 += 4;
p++;
}
......@@ -410,7 +409,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterNormal::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!";
......@@ -419,7 +418,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
default_converter.NCHWToImage(tensor, image, tensor_dim);
}
void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterNormal::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CLImageConverterDefault default_converter;
......@@ -439,10 +438,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {}
void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {}
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
namespace paddle {
namespace lite {
......@@ -24,10 +23,10 @@ class CLImageConverterBase {
public:
virtual ~CLImageConverterBase() {}
virtual void NCHWToImage(float *nchw, half_t *image,
virtual void NCHWToImage(float *nchw, float *image,
const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
virtual void ImageToNCHW(float *image, float *nchw, const DDim &image_dim,
const DDim &tensor_dim) = 0;
virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};
......@@ -35,16 +34,16 @@ class CLImageConverterBase {
class CLImageConverterDefault : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterFolder : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
......@@ -68,8 +67,8 @@ class CLImageConverterFolder : public CLImageConverterBase {
class CLImageConverterNormal : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
......@@ -92,22 +91,22 @@ class CLImageConverterNormal : public CLImageConverterBase {
class CLImageConverterNWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterDWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
......
......@@ -12,21 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle {
namespace lite {
typedef uint16_t half_t;
half_t Float2Half(float f);
float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace lite
} // namespace paddle
// Adds a per-channel bias to the input: out[x, y] = in[x, y] + bias[channel].
//
// Data is packed RGBA-float, i.e. each texel holds 4 consecutive channels.
// `w` is the tensor width in pixels, so x / w selects the channel-block
// column in the bias image (assumed to be a single row — coords_bias.y = 0).
__kernel void channel_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage, __private const int w) {
  int x = get_global_id(0);
  int y = get_global_id(1);

  // BUGFIX: read_imagef with integer coordinates requires a sampler with
  // CLK_NORMALIZED_COORDS_FALSE (plus CLK_FILTER_NEAREST and a CLAMP/NONE
  // addressing mode); with CLK_NORMALIZED_COORDS_TRUE the values returned
  // are undefined per the OpenCL C specification.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int2 coords;
  coords.x = x;
  coords.y = y;

  // Bias lookup: one texel per 4-channel block, single-row image.
  int2 coords_bias;
  coords_bias.x = x / w;
  coords_bias.y = 0;

  float4 in = read_imagef(input, sampler, coords);
  float4 biase = read_imagef(bias, sampler, coords_bias);
  float4 output = in + biase;
  write_imagef(outputImage, coords, output);
}
......@@ -14,21 +14,19 @@ limitations under the License. */
#pragma once
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
inline half4 activation(half4 in
inline float4 activation(float4 in
#ifdef PRELU
,
half4 prelu_alpha
,
float4 prelu_alpha
#endif
) {
half4 output;
) {
float4 output;
#ifdef PRELU
output = select(prelu_alpha * in, in, in >= (half4)0.0);
output = select(prelu_alpha * in, in, in >= (float4)0.0);
#endif
#ifdef RELU
output = fmax(in, (half4)(0.0f));
output = fmax(in, (float4)(0.0f));
#endif
return output;
}
......@@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords);
half4 output = in + biase;
write_imageh(outputImage,coords,output);
float4 in = read_imagef(input, sampler, coords);
float4 biase = read_imagef(bias, sampler, coords);
float4 output = in + biase;
write_imagef(outputImage,coords,output);
}
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX
__kernel void pool_max(
......@@ -41,16 +40,16 @@ __kernel void pool_max(
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 max_value = (half4)(MIN_VALUE);
float4 max_value = (float4)(MIN_VALUE);
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
max_value = max(max_value, tmp);
}
}
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
write_imagef(output, (int2)(pos_out_x, out_nh), max_value);
}
__kernel void pool_avg(
......@@ -77,15 +76,15 @@ __kernel void pool_avg(
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 sum = (half4)(0.0f);
float4 sum = (float4)(0.0f);
int num = 0;
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
num++;
}
}
half4 avg = sum / num;
float4 avg = sum / num;
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), avg);
write_imagef(output, (int2)(pos_out_x, out_nh), avg);
}
......@@ -67,28 +67,28 @@ TEST(cl_test, kernel_test) {
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
auto kernel = helper->GetKernel(2);
std::unique_ptr<float[]> in_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
std::unique_ptr<float[]> in_data(new float[4 * 3 * 256 * 512]);
for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
in_data[i] = 1.f;
}
const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
CLImage in_image;
in_image.set_tensor_data(in_data.get(), in_dim);
in_image.InitNormalCLImage(helper->OpenCLContext());
LOG(INFO) << in_image;
std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
std::unique_ptr<float[]> bias_data(new float[4 * 3 * 256 * 512]);
for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
bias_data[i] = 2.f;
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
CLImage bias_image;
bias_image.set_tensor_data(bias_data.get(), bias_dim);
bias_image.InitNormalCLImage(helper->OpenCLContext());
LOG(INFO) << bias_image;
CLImage out_image;
const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
LOG(INFO) << out_image;
......@@ -108,7 +108,8 @@ TEST(cl_test, kernel_test) {
status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
CL_CHECK_ERRORS(status);
status = helper->OpenCLCommandQueue().finish();
CL_CHECK_ERRORS(status);
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
......@@ -116,37 +117,99 @@ TEST(cl_test, kernel_test) {
LOG(INFO) << out_image;
}
TEST(cl_test, elementwise_add_test) {
TEST(cl_test, channel_add_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
std::unique_ptr<float[]> in_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
in_data[i] = dist(engine);
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
const DDim bias_dim = DDim(std::vector<DDim::value_type>{16});
std::unique_ptr<float[]> bias_data(new float[16]);
for (int i = 0; i < 16; i++) {
bias_data[i] = dist(engine);
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
std::unique_ptr<float[]> out(new float[1024 * 512]);
std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 16; j++) {
float b = bias_data[j];
for (int k = 0; k < 256 * 512; k++) {
int index = (i * 16 + j) * 256 * 512 + k;
out_ref[index] = in_data[index] + b;
}
}
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
bool status = InitOpenCLEngine(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL engine.";
CLContext context;
std::unique_ptr<CLContext> context(new CLContext);
std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
helper->AddKernel("channel_add", "channel_add_kernel.cl");
elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(),
bias_dim, out.get(), out_dim);
int stride = 4 * 16 * 256 * 512 / 20;
for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
std::cout << out[i] << " ";
}
elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim,
out.get(), out_dim);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
int stride = 1024 * 512 / 20;
for (int i = 0; i < 1024 * 512; i += stride) {
std::cout << std::endl;
}
// Element-wise addition where the bias tensor has the same 4-D shape as the
// input; the OpenCL result is validated against a CPU-computed reference.
TEST(cl_test, elementwise_add_test) {
  const size_t kNumel = 4 * 16 * 256 * 512;  // total element count
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-5, 5);

  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
  std::vector<float> in_data(kNumel);
  for (auto &v : in_data) {
    v = dist(engine);
  }

  const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
  std::vector<float> bias_data(kNumel);
  for (auto &v : bias_data) {
    v = dist(engine);
  }

  // CPU reference: plain element-wise sum.
  std::vector<float> out_ref(kNumel);
  for (size_t i = 0; i < kNumel; ++i) {
    out_ref[i] = in_data[i] + bias_data[i];
  }

  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
  std::vector<float> out(kNumel);

  bool status = InitOpenCLEngine(FLAGS_cl_path);
  CHECK(status) << "Fail to initialize OpenCL engine.";
  std::unique_ptr<CLContext> context(new CLContext);
  std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
  helper->AddKernel("channel_add", "channel_add_kernel.cl");

  elementwise_add(helper.get(), in_data.data(), in_dim, bias_data.data(),
                  bias_dim, out.data(), out_dim);

  // Spot-print ~20 evenly spaced outputs for manual inspection.
  const size_t stride = kNumel / 20;
  for (size_t i = 0; i < kNumel; i += stride) {
    std::cout << out[i] << " ";
  }

  for (size_t i = 0; i < kNumel; ++i) {
    EXPECT_NEAR(out[i], out_ref[i], 1e-6);
  }
  std::cout << std::endl;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register or sign in.