未验证 提交 8b90a0c7 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] support fp16 for cl_image_converter, layout, activation all...

[LITE][OPENCL] support fp16 for cl_image_converter, layout, activation all OpenCL image kernel. test=develop (#2964)

* [LITE][OPENCL] support fp16 for cl_image_converter, layout, activation image kernel. test=develop

* add conv, depthwise and UT. test=develop

* add pool, conv, nearest_interp kernel. test=develop

* support fp16 for scale, reshape, concat, fc buffer opencl kernel. test=develop

* refactor for mul opencl buffer kernel. test=develop

* support fp16 for elementwise_mul opencl image kernel. test=develop

* support fp16 for elementwise_mul opencl image kernel. test=develop

* support fp16 for ele_add, fuse_ele_add_act opencl kernel. test=develop

* rename io_copy. test=develop

* mobilenetv1,v2 passed on 855. test=develop

* fix opt for opencl. test=develop
上级 6fcad721
......@@ -81,29 +81,65 @@ void TestModel(const std::vector<Place>& valid_places,
auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>();
int step = 50;
#ifdef LITE_WITH_NPU
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
auto diff = std::fabs((result - ref[i][j]) / ref[i][j]);
VLOG(3) << diff;
EXPECT_LT(diff, eps);
// Get target and check result
VLOG(1) << "valid_places.size():" << valid_places.size();
for (int i = 0; i < valid_places.size(); ++i) {
auto p = valid_places[i];
VLOG(1) << "valid_places[" << i << "]:" << p.DebugString();
}
auto first_target = valid_places[0].target;
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
auto diff = std::fabs((result - ref[i][j]) / ref[i][j]);
VLOG(3) << diff;
EXPECT_LT(diff, eps);
}
}
} else {
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
double eps = 1e-6;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
EXPECT_NEAR(result, ref[i][j], eps);
}
}
}
#else
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
double eps = 1e-6;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
EXPECT_NEAR(result, ref[i][j], eps);
// Get detailed result
auto* pred = &predictor;
size_t output_tensor_num = pred->GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor(
std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
// print result
for (int i = 0; i < out_dims.production(); ++i) {
VLOG(2) << "output_tensor->data<float>()[" << i
<< "]:" << output_tensor->data<float>()[i];
}
}
#endif
}
#ifdef LITE_WITH_NPU
......@@ -130,7 +166,7 @@ TEST(MobileNetV1, test_arm) {
#ifdef LITE_WITH_OPENCL
TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
......
......@@ -83,27 +83,65 @@ void TestModel(const std::vector<Place>& valid_places,
auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>();
int step = 50;
#ifdef LITE_WITH_NPU
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
auto diff = std::fabs((result - ref[i][j]) / ref[i][j]);
VLOG(3) << diff;
EXPECT_LT(diff, eps);
// Get target and check result
VLOG(1) << "valid_places.size():" << valid_places.size();
for (int i = 0; i < valid_places.size(); ++i) {
auto p = valid_places[i];
VLOG(1) << "valid_places[" << i << "]:" << p.DebugString();
}
auto first_target = valid_places[0].target;
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
auto diff = std::fabs((result - ref[i][j]) / ref[i][j]);
VLOG(3) << diff;
EXPECT_LT(diff, eps);
}
}
} else {
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
double eps = 1e-6;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
EXPECT_NEAR(result, ref[i][j], eps);
}
}
}
#else
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
EXPECT_NEAR(pdata[j * step + (out->dims()[1] * i)], ref[i][j], 1e-6);
// Get detailed result
auto* pred = &predictor;
size_t output_tensor_num = pred->GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor(
std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
// print result
for (int i = 0; i < out_dims.production(); ++i) {
VLOG(2) << "output_tensor->data<float>()[" << i
<< "]:" << output_tensor->data<float>()[i];
}
}
#endif
}
#ifdef LITE_WITH_NPU
......@@ -130,7 +168,7 @@ TEST(MobileNetV2, test_arm) {
#ifdef LITE_WITH_OPENCL
TEST(MobileNetV2, test_opencl) {
std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
......
......@@ -91,7 +91,7 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)});
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
......
......@@ -6,7 +6,8 @@ lite_cc_library(cl_wrapper SRCS cl_wrapper.cc)
lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper)
lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility)
lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime)
lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor)
lite_cc_library(cl_half SRCS cl_half.cc)
lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor cl_half)
lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime)
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
......
......@@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context,
int width = cl_image.image_dims()[0];
int height = cl_image.image_dims()[1];
float* image_data = new float[height * width * 4];
uint16_t* image_data = new uint16_t[height * width * 4];
cl::Image* image = cl_image.cl_image();
cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {
......
此差异已折叠。
......@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
#pragma once
#include <cstdint>
__kernel void relu(__read_only image2d_t input,
__write_only image2d_t output) {
namespace paddle {
namespace lite {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
typedef uint16_t half_t;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half_t Float2Half(float f);
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace lite
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/opencl/cl_image.h"
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_runtime.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/utils/cp_logging.h"
......@@ -24,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
int width = cl_image.image_dims_[0];
int height = cl_image.image_dims_[1];
float* image_data = new float[height * width * 4];
uint16_t* image_data = new uint16_t[height * width * 4];
cl::Image* image = cl_image.cl_image();
cl::array<size_t, 3> origin = {0, 0, 0};
......@@ -123,7 +124,7 @@ void CLImage::InitCLImage(const cl::Context& context,
VLOG(3) << " begin init cl image ";
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
float* image_data = new float[image_dims_.production() * 4];
uint16_t* image_data = new uint16_t[image_dims_.production() * 4];
VLOG(3) << " convert to image ";
converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_);
......
......@@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
}
void CLImageConverterDefault::NCHWToImage(float *nchw,
float *image,
half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
......@@ -69,7 +69,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = *p;
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
......@@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
}
}
void CLImageConverterDefault::ImageToNCHW(float *image,
void CLImageConverterDefault::ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
......@@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
*p = image[i2];
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
......@@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
}
void CLImageConverterFolder::NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!";
......@@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor,
for (size_t h = 0; h < tdim[0]; h++) {
for (size_t w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w];
image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
}
}
}
}
void CLImageConverterFolder::ImageToNCHW(float *image,
void CLImageConverterFolder::ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
......@@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) {
for (size_t w = 0; w < W; w++) {
p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)];
p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
}
}
}
......@@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
}
void CLImageConverterNWBlock::NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
auto image_dim = InitImageDimInfoWith(tensor_dim);
......@@ -257,7 +258,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
if (n < N) {
image[index] = *p;
image[index] = Float2Half(*p);
p++;
} else {
image[index] = 0.0;
......@@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
VLOG(3) << " init done";
}
void CLImageConverterNWBlock::ImageToNCHW(float *image,
void CLImageConverterNWBlock::ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
......@@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image,
for (size_t w = 0; w < W; ++w) {
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
*p = image[index];
*p = Half2Float(image[index]);
p++;
if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range ";
......@@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
}
void CLImageConverterDWBlock::NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
......@@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = *p;
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
......@@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
}
}
void CLImageConverterDWBlock::ImageToNCHW(float *image,
void CLImageConverterDWBlock::ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
......@@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
*p = image[i2];
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
......@@ -418,7 +419,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
}
void CLImageConverterNormal::NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!";
......@@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor,
default_converter.NCHWToImage(tensor, image, tensor_dim);
}
void CLImageConverterNormal::ImageToNCHW(float *image,
void CLImageConverterNormal::ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
......@@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
}
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) {}
void CLImageConverterWinoTransWeight::ImageToNCHW(float *image,
void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {}
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include "lite/backends/opencl/cl_half.h"
#include "lite/core/tensor.h"
namespace paddle {
......@@ -24,10 +25,10 @@ class CLImageConverterBase {
virtual ~CLImageConverterBase() {}
virtual void NCHWToImage(float *nchw,
float *image,
half_t *image,
const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(float *image,
virtual void ImageToNCHW(half_t *image,
float *nchw,
const DDim &image_dim,
const DDim &tensor_dim) = 0;
......@@ -37,8 +38,8 @@ class CLImageConverterBase {
class CLImageConverterDefault : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override;
void ImageToNCHW(float *image,
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim) override;
void ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) override;
......@@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) override;
void ImageToNCHW(float *image,
void ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) override;
......@@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) override;
void ImageToNCHW(float *image,
void ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) override;
......@@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) override;
void ImageToNCHW(float *image,
void ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) override;
......@@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) override;
void ImageToNCHW(float *image,
void ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) override;
......@@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor,
float *image,
half_t *image,
const DDim &tensor_dim) override;
void ImageToNCHW(float *image,
void ImageToNCHW(half_t *image,
float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) override;
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <cl_common.h>
// #define DEBUG
// buffer -> image2d
__kernel void buffer_to_image2d(__global CL_DTYPE *in,
__write_only image2d_t output_image,
......@@ -27,6 +28,7 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;
......@@ -47,29 +49,92 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
CL_DTYPE4 output = (CL_DTYPE4)0.0f;
output.x = convert_float(in[input_pos0]);
if(out_C - 4 * out_c >= 2){
output.y = convert_float(in[input_pos1]);
CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f);
output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE);
if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE);
}
if (out_C - 4 * out_c >= 3) {
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE);
}
if (out_C - 4 * out_c >= 4) {
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE);
}
if(out_C - 4 * out_c >= 3){
output.z = convert_float(in[input_pos2]);
#ifdef DEBUG
if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n",
out_w,
out_C - 4 * out_c,
(float)(in[input_pos0]),
(float)(in[input_pos1]),
(float)(in[input_pos2]),
(float)(in[input_pos3]));
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh,
output_pos.x, output_pos.y,
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w));
}
if(out_C - 4 * out_c >= 4){
output.w = convert_float(in[input_pos3]);
#endif
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
}
// image2d -> buffer
__kernel void image2d_to_buffer(__read_only image2d_t input,
__private const int in_width,
__private const int in_height,
__global CL_DTYPE* out,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__private const int C) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG
if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh,
pos_x, in_nh,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w));
}
#endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
}
if(C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
}
if(C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
}
write_imagef(output_image, output_pos, output);
}
#if 0
// buffer -> image2d_nw
__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
__write_only image2d_t output_image,
__private const int out_H,
__private const int out_W,
__private const int out_N,
__private const int Stride0,
__private const int Stride1,
__private const int Stride2) {
__write_only image2d_t output_image,
__private const int out_H,
__private const int out_W,
__private const int out_N,
__private const int Stride0,
__private const int Stride1,
__private const int Stride2) {
const int out_n = get_global_id(0);
const int out_w = get_global_id(1);
const int out_ch = get_global_id(2);
......@@ -97,55 +162,23 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
output_pos.y = out_ch;
CL_DTYPE4 output = (CL_DTYPE4)0.0f;
output.x = convert_float(in[input_pos0]);
output.x = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos0]);
if (out_N - 4 * out_n >= 2) {
output.y = convert_float(in[input_pos1]);
output.y = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos1]);
}
if (out_N - 4 * out_n >= 3) {
output.z = convert_float(in[input_pos2]);
output.z = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos2]);
}
if (out_N - 4 * out_n >= 4) {
output.w = convert_float(in[input_pos3]);
output.w = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos3]);
}
write_imagef(output_image, output_pos, output);
}
// image2d -> buffer
__kernel void image2d_to_buffer(__read_only image2d_t input,
__private const int in_width,
__private const int in_height,
__global CL_DTYPE* out,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__private const int C) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
CL_DTYPE4 in = read_imagef(input, sampler, (int2)(pos_x, in_nh));
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_float(in.x);
if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_float(in.y);
}
if(C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_float(in.z);
}
if(C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_float(in.w);
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
#endif
#if 0
// image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height,
__private const int in_width,
......@@ -157,11 +190,12 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = read_imagef(input, sampler, (int2)(in_w, in_h));
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(in_w, in_h));
const int index = (in_h * in_width + in_w) * 4;
out[index] = convert_float(in.x);
out[index + 1] = convert_float(in.y);
out[index + 2] = convert_float(in.z);
out[index + 3] = convert_float(in.w);
out[index] = CONVERT_TYPE_TO(CL_DTYPE, in.x);
out[index + 1] = CONVERT_TYPE_TO(CL_DTYPE, in.y);
out[index + 2] = CONVERT_TYPE_TO(CL_DTYPE, in.z);
out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w);
}
#endif
......@@ -29,11 +29,15 @@ limitations under the License. */
#ifdef CL_DTYPE_float
#define CL_DTYPE float
#define CL_DTYPE_CHAR f
#define CL_COMPUTE_DTYPE half
#define CL_COMPUTE_DTYPE_CHAR h
#endif
#ifdef CL_DTYPE_half
#define CL_DTYPE half
#define CL_DTYPE_CHAR h
#define CL_COMPUTE_DTYPE half
#define CL_COMPUTE_DTYPE_CHAR h
#endif
/////////////////////////////////
......@@ -43,6 +47,7 @@ limitations under the License. */
#define GET_VEC_TYPE(type__, size__) type__##size__
#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
#define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4)
/////////////////////////////////
// CONVERT_TYPE_TO
......
......@@ -14,6 +14,23 @@ limitations under the License. */
#include <cl_common.h>
// ReLU activation on an image2d tensor: out(x, y) = max(0, in(x, y)),
// applied component-wise to each RGBA texel (4 packed channel values).
// One work-item processes one texel; global size = (image_width, image_height).
__kernel void relu(__read_only image2d_t input,
                   __write_only image2d_t output) {
  const int x = get_global_id(0);  // image_width
  const int y = get_global_id(1);  // image_height

  // Exact texel fetch: unnormalized-safe clamp addressing, nearest filtering.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // READ_IMG_TYPE/WRITE_IMG_TYPE expand to read_image{f,h}/write_image{f,h}
  // depending on CL_DTYPE_CHAR (f = float, h = half), so the same kernel body
  // serves both fp32 and fp16 builds.
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  in = max((CL_DTYPE4)(0.0f), in);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
__kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold){
......@@ -30,3 +47,19 @@ __kernel void relu6(__read_only image2d_t input,
in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
// Sigmoid activation on an image2d tensor: out = 1 / (1 + exp(-in)),
// applied component-wise to each RGBA texel. One work-item per texel;
// global size = (image_width, image_height).
__kernel void sigmoid(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int x = get_global_id(0);  // image_width
  const int y = get_global_id(1);  // image_height

  // Exact texel fetch: clamp addressing, nearest filtering.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // CL_DTYPE/READ_IMG_TYPE macros (from cl_common.h) select float vs half
  // variants, so this body compiles for both fp32 and fp16 precisions.
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  CL_DTYPE4 out = 1 / (1 + exp(-in));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
......@@ -12,26 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
__private const float scale_h, __private const float scale_w,
__private const int in_dims_h, __private const int out_dims_h,
__private const int in_dims_w, __private const int out_dims_w) {
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
int2 output_pos;
output_pos.x = c * out_dims_w + w;
output_pos.y = nh;
int out_n = nh / out_dims_h;
int out_h = nh % out_dims_h;
int2 input_pos;
input_pos.x = c * in_dims_w + w / scale_w;
input_pos.y = out_n * in_dims_h + out_h / scale_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y));
write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data);
#include <cl_common.h>
// Nearest-neighbor interpolation (resize) on an image2d tensor laid out as
// (C/4 blocks along x, N*H along y). Each work-item writes one output texel
// by sampling the nearest source texel.
//
// Global ids: (0) = channel block c, (1) = output width w, (2) = n*out_H + h.
// scale_h / scale_w are the out/in size ratios; source coordinates are
// recovered by integer division (w / scale_w, h / scale_h), which floors —
// i.e. classic nearest-neighbor with "floor" rounding.
__kernel void nearest_interp(__read_only image2d_t input,
                             __write_only image2d_t output,
                             __private const float scale_h,
                             __private const float scale_w,
                             __private const int in_dims_h,
                             __private const int out_dims_h,
                             __private const int in_dims_w,
                             __private const int out_dims_w) {
  const int c = get_global_id(0);
  const int w = get_global_id(1);
  const int nh = get_global_id(2);

  // Destination texel: x packs channel-block-major width, y packs batch*height.
  int2 output_pos;
  output_pos.x = c * out_dims_w + w;
  output_pos.y = nh;

  // Split the fused batch/height index back into (n, h).
  int out_n = nh / out_dims_h;
  int out_h = nh % out_dims_h;

  // Nearest source texel in the same packed layout.
  int2 input_pos;
  input_pos.x = c * in_dims_w + w / scale_w;
  input_pos.y = out_n * in_dims_h + out_h / scale_h;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // READ_IMG_TYPE/WRITE_IMG_TYPE (cl_common.h) pick the fp32 or fp16 image
  // builtins via CL_DTYPE_CHAR, replacing the former hard-coded read_imageh.
  CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(input_pos.x, input_pos.y));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(output_pos.x , output_pos.y), input_data);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// Sigmoid activation on an image2d tensor: out = 1 / (1 + exp(-in)),
// computed component-wise on each RGBA texel. One work-item handles one
// (x, y) texel; global size = (image_width, image_height).
__kernel void sigmoid(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int x = get_global_id(0);  // image_width
  const int y = get_global_id(1);  // image_height

  // Exact texel fetch: clamp addressing, nearest filtering.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // READ_IMG_TYPE/WRITE_IMG_TYPE (cl_common.h) resolve to the float or half
  // image builtins via CL_DTYPE_CHAR, supporting both fp32 and fp16 builds.
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  CL_DTYPE4 out = 1 / (1 + exp(-in));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
......@@ -81,10 +81,10 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
return cl_image;
}
template <> // use int16_t represents half float
void *TargetWrapperCL::MallocImage<int16_t>(const size_t cl_image2d_width,
const size_t cl_image2d_height,
void *host_ptr) {
template <> // use uint16_t represents half float
void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
const size_t cl_image2d_height,
void *host_ptr) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16)));
cl_int status;
cl::Image2D *cl_image =
......
......@@ -178,5 +178,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector<Place>& valid_places) {
REGISTER_MIR_PASS(type_precision_cast_pass,
paddle::lite::mir::PrecisionCastPass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kOpenCL)})
.BindKernel("calib_once")
.BindKernel("calib");
......@@ -103,8 +103,8 @@ const cl::Image2D *TensorLite::data<float, cl::Image2D>() const {
return static_cast<const cl::Image2D *>(buffer_->data());
}
template <> // use int16_t represent half float
const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const {
template <> // use uint16_t represent half float
const cl::Image2D *TensorLite::data<uint16_t, cl::Image2D>() const {
if (nullptr == buffer_->data()) return nullptr;
return static_cast<const cl::Image2D *>(buffer_->data());
}
......
......@@ -260,8 +260,8 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) {
template <>
const cl::Image2D *TensorLite::data<float, cl::Image2D>() const;
template <> // use int16_t represent half float
const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const;
template <> // use uint16_t represent half float
const cl::Image2D *TensorLite::data<uint16_t, cl::Image2D>() const;
#endif
} // namespace lite
......
......@@ -4,91 +4,136 @@ endif()
set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter)
add_kernel(fc_opencl OPENCL basic SRCS fc_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_compute.cc DEPS ${cl_kernel_deps})
#####################
# image kernel #
#####################
# basic
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl
OPENCL basic SRCS fusion_elementwise_add_activation_compute.cc
OPENCL basic SRCS fusion_elementwise_add_activation_image_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps})
add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
add_kernel(sigmoid_opencl OPENCL basic SRCS sigmoid_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
# extra
# wait to add ...
######################
# image kernel test #
######################
lite_cc_test(test_activation_image_opencl SRCS activation_image_compute_test.cc
DEPS activation_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_elementwise_mul_opencl SRCS elementwise_mul_compute_test.cc
DEPS elementwise_mul_opencl op_registry program context
lite_cc_test(test_conv_image_opencl SRCS conv_image_compute_test.cc
DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_pool_opencl SRCS pool_compute_test.cc
DEPS pool_opencl op_registry program context
lite_cc_test(test_depthwise_conv2d_image_opencl SRCS depthwise_conv2d_image_compute_test.cc
DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc
DEPS fc_opencl op_registry program context
lite_cc_test(test_nearest_interp_image_opencl SRCS nearest_interp_image_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_pool_image_opencl SRCS pool_image_compute_test.cc
DEPS pool_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
# TODO(ysh329): comment for buffer-impl mul
#lite_cc_test(test_mul_opencl SRCS mul_compute_test.cc
# DEPS mul_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc
DEPS scale_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc
DEPS io_copy_compute_opencl op_registry program context
lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc
DEPS reshape_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#TODO(ysh329): comment buffer-impl relu
lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
DEPS relu_opencl layout_opencl op_registry program context
lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_sigmoid_opencl SRCS sigmoid_compute_test.cc
DEPS sigmoid_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_compute_test.cc
DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_image2d_opencl SRCS depthwise_conv2d_image2d_compute_test.cc
DEPS conv_opencl op_registry program context
lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc
DEPS reshape_opencl op_registry program context
######################
# buffer kernel #
######################
# basic
#add_kernel(activation_opencl OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl
# OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc
# DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
# extra
# wait to add ...
######################
# buffer kernel test #
######################
#lite_cc_test(test_activation_buffer_opencl SRCS activation_buffer_compute_test.cc
# DEPS activation_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
# DEPS conv_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
# DEPS depthwise_conv2d_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_pool_buffer_opencl SRCS pool_buffer_compute_test.cc
# DEPS pool_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_concat_buffer_opencl SRCS concat_buffer_compute_test.cc
# DEPS concat_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
DEPS fc_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
DEPS conv_opencl op_registry program context
lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
DEPS mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
DEPS conv_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_elementwise_add_buffer_opencl SRCS elementwise_add__buffer_compute_test.cc
# DEPS elementwise_add_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_concat_opencl SRCS concat_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_scale_opencl SRCS scale_compute_test.cc
DEPS scale_opencl op_registry program context
lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc
DEPS io_copy_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Buffer-based ReLU kernel (kFloat / NCHW). Launches the `relu` function from
// buffer/relu_kernel.cl over a 1-D NDRange with one work-item per element.
class ReluCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override { return "Relu using cl::Buffer, kFloat"; }

  // Compile/cache the CL program once before the first Run().
  void PrepareForRun() override {
    auto& ctx = ctx_->As<OpenCLContext>();
    ctx.cl_context()->AddKernel(
        kernel_func_name_, "buffer/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& act_param = *param_.get_mutable<param_t>();
    const auto& in_dims = act_param.X->dims();
    const size_t element_count = in_dims.production();

    auto& ctx = ctx_->As<OpenCLContext>();
    CHECK(ctx.cl_context() != nullptr);

    auto* in_buf = act_param.X->data<float, cl::Buffer>();
    auto* result_buf =
        act_param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    VLOG(4) << TargetToStr(act_param.X->target());
    VLOG(4) << TargetToStr(act_param.Out->target());

    // Kernels are cached under the key "<func-name><build-options>".
    STL::stringstream key;
    key << kernel_func_name_ << build_options_;
    auto kernel = ctx.cl_context()->GetKernel(key.str());

    cl_int status;
    int idx = 0;
    status = kernel.setArg(idx, *in_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++idx, (const int)element_count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++idx, *result_buf);
    CL_CHECK_FATAL(status);

    const auto global_work_size = cl::NDRange{element_count};
    status = ctx.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Register the completion event so consumers of the output can wait on it.
    ctx.cl_wait_list()->emplace(result_buf, event_);
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// Buffer-based sigmoid kernel (kFloat / NCHW). Launches the `sigmoid`
// function from buffer/sigmoid_kernel.cl over a 1-D NDRange of all elements.
class SigmoidCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Sigmoid using cl::Buffer, kFloat";
  }

  // Build/cache the CL program once, ahead of the first Run().
  void PrepareForRun() override {
    auto& ctx = ctx_->As<OpenCLContext>();
    ctx.cl_context()->AddKernel(
        kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_);
  }

  void Run() override {
    auto& act_param = *param_.get_mutable<param_t>();
    const auto& in_dims = act_param.X->dims();
    const size_t element_count = in_dims.production();

    auto& ctx = ctx_->As<OpenCLContext>();
    CHECK(ctx.cl_context() != nullptr);

    auto* in_buf = act_param.X->data<float, cl::Buffer>();
    auto* result_buf =
        act_param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    VLOG(4) << TargetToStr(act_param.X->target());
    VLOG(4) << TargetToStr(act_param.Out->target());

    // Kernel cache key is "<func-name><build-options>".
    STL::stringstream key;
    key << kernel_func_name_ << build_options_;
    auto kernel = ctx.cl_context()->GetKernel(key.str());

    cl_int status;
    int idx = 0;
    status = kernel.setArg(idx, *in_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++idx, (const int)element_count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++idx, *result_buf);
    CL_CHECK_FATAL(status);

    const auto global_work_size = cl::NDRange{element_count};
    status = ctx.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Record the event so downstream users of the output buffer can sync.
    ctx.cl_wait_list()->emplace(result_buf, event_);
  }

 private:
  std::string kernel_func_name_{"sigmoid"};
  std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Relu
// Registers the buffer-based ReluCompute above under op type "relu" for the
// (kOpenCL, kFloat, kNCHW) slot; X and Out bind as plain OpenCL tensors with
// default precision/layout.
REGISTER_LITE_KERNEL(relu,
kOpenCL,
kFloat,
kNCHW,
paddle::lite::kernels::opencl::ReluCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
// Sigmoid
// Registers the buffer-based SigmoidCompute under op type "sigmoid" for the
// (kOpenCL, kFloat, kNCHW) slot; X and Out bind as plain OpenCL tensors.
// FIX: the registration chain is a file-scope statement and must end with a
// semicolon — the original `.Finalize()` was missing it (syntax error).
REGISTER_LITE_KERNEL(sigmoid,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::opencl::SigmoidCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>

#include <cmath>
#include <random>

#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Reference (CPU) ReLU used to validate the OpenCL kernel.
// If |threshold| ~ 0 this is plain relu: max(x, 0); otherwise it is
// relu6-style clipping: min(max(x, 0), threshold).
template <typename dtype>
void relu_compute_ref(const dtype *x_data,
                      const DDim &x_dim,
                      dtype *out_data,
                      float threshold = 0.f) {
  // FIX: use std::fabs instead of bare abs — without <cmath> in scope the
  // bare `abs` resolves to the integer overload and silently truncates a
  // fractional threshold (e.g. 0.5f) to 0, flipping the branch taken.
  if (std::fabs(threshold) < 1e-5) {
    // relu: pass positives through, clamp negatives to the (~0) threshold
    for (int i = 0; i < x_dim.production(); ++i) {
      out_data[i] = (x_data[i] > threshold) ? x_data[i] : threshold;
    }
  } else {
    // relu6 or relu with threshold: clamp to [0, threshold]
    for (int i = 0; i < x_dim.production(); ++i) {
      auto out_tmp = (x_data[i] > 0) ? x_data[i] : 0;
      out_data[i] = (out_tmp < threshold) ? out_tmp : threshold;
    }
  }
}
// Reference (CPU) sigmoid: out = 1 / (1 + e^{-x}), element-wise.
template <typename dtype>
void sigmoid_compute_ref(const dtype *x_data,
                         const DDim &x_dim,
                         dtype *out_data) {
  const auto num_elements = x_dim.production();
  for (int idx = 0; idx < num_elements; ++idx) {
    out_data[idx] = 1 / (1 + expf(-x_data[idx]));
  }
}
// End-to-end check of the buffer-based `relu` OpenCL kernel: fill a device
// buffer with random values in [-10, 10], run the kernel, and compare against
// relu_compute_ref element-wise (tolerance 1e-6).
TEST(opencl_relu_buffer, compute) {
// prepare data
const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
lite::Tensor x, out;
x.Resize(x_dim);
out.Resize(x_dim);
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-10, 10);
// Map the device input buffer into host memory to fill it with random data.
auto *mapped_x = static_cast<float *>(
TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
mapped_x[i] = dist(engine);
}
// set param and kernel, then run
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
auto kernels = KernelRegistry::Global().Create(
"relu", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
kernel->SetParam(param);
// Share the CL runtime objects with the kernel's own context before launch.
std::unique_ptr<KernelContext> relu_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(relu_context->As<OpenCLContext>()));
kernel->SetContext(std::move(relu_context));
kernel->Launch();
// Block on the event the kernel recorded for its output buffer.
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = param.Out->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
// run compute ref and check
std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
relu_compute_ref<float>(mapped_x, x_dim, out_ref.get());
auto *out_data = out.mutable_data<float, cl::Buffer>();
// Map the device output and compare element-wise with the CPU reference.
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
}
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x_data, mapped_x);
}
// End-to-end check of the buffer-based `sigmoid` OpenCL kernel: random input
// in [-10, 10], run on device, compare against sigmoid_compute_ref
// element-wise (tolerance 1e-6).
TEST(opencl_sigmoid_buffer, compute) {
// prepare data
const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
lite::Tensor x, out;
x.Resize(x_dim);
out.Resize(x_dim);
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-10, 10);
// Map the device input buffer into host memory to fill it with random data.
auto *mapped_x = static_cast<float *>(
TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
mapped_x[i] = dist(engine);
}
// set param and kernel, then run
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
auto kernels = KernelRegistry::Global().Create(
"sigmoid", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
kernel->SetParam(param);
// Share the CL runtime objects with the kernel's own context before launch.
std::unique_ptr<KernelContext> sigmoid_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(sigmoid_context->As<OpenCLContext>()));
kernel->SetContext(std::move(sigmoid_context));
kernel->Launch();
// Block on the event the kernel recorded for its output buffer.
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = param.Out->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
// run compute ref and check
std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
sigmoid_compute_ref<float>(mapped_x, x_dim, out_ref.get());
auto *out_data = out.mutable_data<float, cl::Buffer>();
// Map the device output and compare element-wise with the CPU reference.
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
}
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x_data, mapped_x);
}
} // namespace lite
} // namespace paddle
// sigmoid buffer
USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kNCHW, def);
// relu buffer
USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
......@@ -24,44 +24,55 @@ namespace lite {
namespace kernels {
namespace opencl {
class SigmoidCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Sigmoid using cl::Buffer, kFloat";
return "Relu using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_);
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
size_t count = x_dims.production();
auto* x_buf = param.X->data<uint16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x_buf = param.X->data<float, cl::Buffer>();
auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)count);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{count};
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
......@@ -70,40 +81,42 @@ class SigmoidCompute
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
}
private:
std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE_half -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class SigmoidComputeFloatImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Sigmoid using cl::Image2D(ImageDefault/RGBA), kFloat";
return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_);
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<float, cl::Image2D>();
auto* x_buf = param.X->data<uint16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<float, cl::Image2D>(
auto* out_buf = param.Out->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto threshold = param.Relu_clipped_coef;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
......@@ -116,6 +129,8 @@ class SigmoidComputeFloatImageDefault
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
......@@ -125,6 +140,7 @@ class SigmoidComputeFloatImageDefault
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "threshold:" << threshold;
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
......@@ -143,12 +159,12 @@ class SigmoidComputeFloatImageDefault
}
private:
std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
std::string kernel_func_name_{"relu6"};
std::string build_options_{"-DCL_DTYPE_half -DRELU6"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class SigmoidComputeFP16ImageDefault
class SigmoidComputeImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
......@@ -162,19 +178,19 @@ class SigmoidComputeFP16ImageDefault
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_);
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf =
param.X->data<int16_t,
cl::Image2D>(); // use int16_t represents half float
param.X->data<uint16_t,
cl::Image2D>(); // use uint16_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.Out->mutable_data<int16_t, cl::Image2D>( // use int16_t
// represents half float
param.Out->mutable_data<uint16_t, cl::Image2D>( // use uint16_t
// represents half float
image_shape["width"],
image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
......@@ -227,40 +243,47 @@ class SigmoidComputeFP16ImageDefault
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(sigmoid,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::SigmoidCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(
sigmoid,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFloatImageDefault,
ImageDefault)
// Relu
REGISTER_LITE_KERNEL(relu,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ReluComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Relu6
REGISTER_LITE_KERNEL(relu6,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::Relu6ComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(
sigmoid,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFP16ImageDefault,
ImageDefault)
// Sigmoid
REGISTER_LITE_KERNEL(sigmoid,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ConcatCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConcatParam;
// Chooses the CL kernel variant (concat2 for exactly two inputs, concat_mul
// otherwise), compiles it, and precomputes the flattened sizes
// (pre_size_/post_size_/axis_size_) that Run() passes as kernel arguments.
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
concat_param_ = param_.get_mutable<param_t>();
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
}
// NOTE(review): this kernel class is declared with kImageDefault layout but
// loads its kernel from the buffer/ directory — confirm the path is intended.
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/concat_kernel.cl", build_options_);
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
auto out_dims = concat_param_->output->dims();
auto* axis_tensor = concat_param_->axis_tensor;
if (axis_tensor != nullptr) {
// Reading the axis from a tensor is not implemented yet; `axis` keeps the
// attribute value from the param.
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
auto in_dims = inputs[0]->dims();
axis_size_ = out_dims[axis];
axis_ = axis;
// pre_size_: product of dims before `axis`; post_size_: product after it.
for (int i = 0; i < axis; i++) {
pre_size_ *= in_dims[i];
}
for (int i = axis + 1; i < in_dims.size(); i++) {
post_size_ *= in_dims[i];
}
// Validate: every input must match input[0] in rank and in every
// non-axis dimension; on mismatch, warn and bail out early.
for (int i = 1; i < inputs.size(); i++) {
auto dims = inputs[i]->dims();
if (in_dims.size() != dims.size()) {
printf("input shape must be same \n");
return;
}
for (int i = 0; i < dims.size(); i++) {  // NOTE: inner `i` shadows outer `i`
if (i != axis) {
if (in_dims[i] != dims[i]) {
printf("input shape must be same \n");
return;
}
}
}
}
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
const auto& y_dims = param.output->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto inputs = param.x;
int arg_idx = 0;
auto global_work_size = cl::NDRange{axis_size_};
int total = axis_size_ * post_size_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
auto axis0 = inputs[0]->dims()[axis_];
int total0 = axis0 * post_size_;
int total1 = (axis_size_ - axis0) * post_size_;
cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, axis_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, pre_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, post_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total1);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
int size = inputs[i]->dims()[axis_];
auto* x_buf = inputs[i]->data<float, cl::Buffer>();
global_work_size = cl::NDRange{static_cast<size_t>(size)};
int total0 = size * post_size_;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<int>(size));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, pre_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, post_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, start);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total0);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
start += size;
}
}
}
std::string doc() { return "Concat using cl::Buffer, kFloat"; }
int axis_size_ = 1;
int post_size_ = 1;
int pre_size_ = 1;
int axis_ = 1;
param_t* concat_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Registers the buffer-based fp32 concat kernel as (kOpenCL, kFloat, kNCHW).
// AxisTensor (optional runtime axis) is declared as an Int32 input even
// though the kernel currently ignores it -- see PrepareForRun().
typedef paddle::lite::kernels::opencl::ConcatCompute Concat_buffer;
REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// //
// // Licensed under the Apache License, Version 2.0 (the "License");
// // you may not use this file except in compliance with the License.
// // You may obtain a copy of the License at
// //
// // http://www.apache.org/licenses/LICENSE-2.0
// //
// // Unless required by applicable law or agreed to in writing, software
// // distributed under the License is distributed on an "AS IS" BASIS,
// // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// // See the License for the specific language governing permissions and
// // limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Reference (CPU) implementation of a two-input concat along `axis`.
// in0/in1 hold contiguous row-major data; for each pre-axis slice the output
// receives in0's span along the axis followed by in1's span.
// BUG FIX: the original never copied from `in1` (the j >= in0_dim[axis] case
// was missing) and never advanced `out_data` past in0's span, leaving the
// second input's portion of the output unwritten.
template <typename dtype>
void concat2_compute_ref(const dtype *in0,
                         const dtype *in1,
                         const int axis,
                         const DDim in0_dim,
                         const DDim in1_dim,
                         const DDim out_dim,
                         dtype *out_data) {
  int pre_size = 1;   // product of dims before the concat axis
  int post_size = 1;  // product of dims after the concat axis
  for (int i = 0; i < axis; i++) {
    pre_size *= in0_dim[i];
  }
  for (int i = axis + 1; i < in0_dim.size(); i++) {
    post_size *= in0_dim[i];
  }
  int axis_size = out_dim[axis];
  for (int i = 0; i < pre_size; i++) {
    for (int j = 0; j < axis_size; j++) {
      if (j < in0_dim[axis]) {
        memcpy(out_data, in0, sizeof(dtype) * post_size);
        in0 += post_size;
      } else {
        // Remaining positions along the axis come from the second input.
        memcpy(out_data, in1, sizeof(dtype) * post_size);
        in1 += post_size;
      }
      out_data += post_size;
    }
  }
}
// Reference (CPU) implementation of an N-input concat along `axis`.
// For every pre-axis slice, appends each input's chunk of
// post_size * ins_dim[j][axis] elements to the output, in input order.
// BUG FIX: the original always copied from the *start* of each input, so the
// result was wrong whenever pre_size > 1; the source offset now advances by
// one chunk per pre-axis slice. (The unused `axis_size` local was dropped.)
template <typename dtype>
void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
                            std::vector<const DDim> ins_dim,
                            int axis,
                            const DDim out_dim,
                            dtype *out_data) {
  int pre_size = 1;   // product of dims before the concat axis
  int post_size = 1;  // product of dims after the concat axis
  for (int i = 0; i < axis; i++) {
    pre_size *= ins_dim[0][i];
  }
  for (int i = axis + 1; i < ins_dim[0].size(); i++) {
    post_size *= ins_dim[0][i];
  }
  for (int i = 0; i < pre_size; i++) {
    for (int j = 0; j < ins_data.size(); j++) {
      int size = post_size * ins_dim[j][axis];
      memcpy(out_data, ins_data[j] + i * size, sizeof(dtype) * size);
      out_data += size;
    }
  }
}
// End-to-end test of the buffer-based OpenCL concat kernel: three (1,2,3,4)
// fp32 inputs are concatenated along axis 1 into a (1,6,3,4) output, which is
// compared element-wise against the CPU reference implementation.
TEST(opencl_concat_buffer, compute) {
  // prepare data
  const DDim x0_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x1_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x2_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{1, 6, 3, 4});
  lite::Tensor x0, x1, x2, out, out_ref;
  x0.Resize(x0_dim);
  x1.Resize(x1_dim);
  x2.Resize(x2_dim);
  out.Resize(out_dim);
  out_ref.Resize(out_dim);
  // Allocate device buffers, then map them so the host can fill random data.
  auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x0 = static_cast<float *>(
      TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
  auto *mapped_x1 = static_cast<float *>(
      TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
  auto *mapped_x2 = static_cast<float *>(
      TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
  for (int i = 0; i < x0_dim.production(); i++) {
    mapped_x0[i] = dist(engine);
  }
  for (int i = 0; i < x1_dim.production(); i++) {
    mapped_x1[i] = dist(engine);
  }
  for (int i = 0; i < x2_dim.production(); i++) {
    mapped_x2[i] = dist(engine);
  }
  // set param and kernel, then run
  operators::ConcatParam param;
  std::vector<lite::Tensor *> ins;
  ins.push_back(&x0);
  ins.push_back(&x1);
  ins.push_back(&x2);
  auto axis = 1;
  param.x = ins;
  param.output = &out;
  param.axis = axis;
  // Host-side views of the same inputs, for the CPU reference below.
  std::vector<const float *> ins_data;
  std::vector<const DDim> ins_dim;
  ins_data.push_back(mapped_x0);
  ins_data.push_back(mapped_x1);
  ins_data.push_back(mapped_x2);
  ins_dim.push_back(x0_dim);
  ins_dim.push_back(x1_dim);
  ins_dim.push_back(x2_dim);
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  // Pick the registered (kOpenCL, kFloat, kNCHW) concat kernel.
  auto kernels = KernelRegistry::Global().Create(
      "concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> concat_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(concat_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(concat_context));
  kernel->Launch();
  // Kernel launch is asynchronous: wait on the event the kernel registered
  // for the output buffer before reading results back.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.output->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }
  // run compute ref and check
  auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
  concat_mul_compute_ref<float>(ins_data, ins_dim, axis, out_dim, out_ref_data);
  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
  for (int i = 0; i < out_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x0_data, mapped_x0);
  TargetWrapperCL::Unmap(x1_data, mapped_x1);
  TargetWrapperCL::Unmap(x2_data, mapped_x2);
}
} // namespace lite
} // namespace paddle
// Link in the buffer-based concat kernel registered as
// (kOpenCL, kFloat, kNCHW) so the registry lookup in the test succeeds.
USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/concat_compute.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// PrepareForRun for the fp32 Image2D variant: compiles the proper CL kernel
// from image/concat_kernel.cl ("concat2" for two inputs, "concat_mul"
// otherwise) and caches the concat geometry (pre/post/axis sizes) from the
// first input's dims. Also validates that inputs agree on every non-axis dim.
template <>
void ConcatCompute<PRECISION(kFloat),
                   DATALAYOUT(kImageDefault)>::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  concat_param_ = param_.get_mutable<param_t>();
  if (concat_param_->x.size() == 2) {
    kernel_func_name_ = "concat2";
  } else {
    kernel_func_name_ = "concat_mul";
  }
  context.cl_context()->AddKernel(
      kernel_func_name_, "image/concat_kernel.cl", build_options_);
  // UpdateParams<kFloat, kImageDefault>();
  auto axis = concat_param_->axis;
  auto inputs = concat_param_->x;
  auto out_dims = concat_param_->output->dims();
  auto* axis_tensor = concat_param_->axis_tensor;
  if (axis_tensor != nullptr) {
    // NOTE(review): AxisTensor is currently ignored; the runtime axis
    // override below stays disabled.
    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
    // axis = axis_tensor_data[0];
  }
  auto in_dims = inputs[0]->dims();
  axis_size_ = out_dims[axis];  // output extent along the concat axis
  axis_ = axis;
  for (int i = 0; i < axis; i++) {
    pre_size_ *= in_dims[i];  // product of dims before the axis
  }
  for (int i = axis + 1; i < in_dims.size(); i++) {
    post_size_ *= in_dims[i];  // product of dims after the axis
  }
  // Validate: all inputs must share rank and every non-axis dim.
  for (int i = 1; i < inputs.size(); i++) {
    auto dims = inputs[i]->dims();
    // auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
    if (in_dims.size() != dims.size()) {
      printf("input shape must be same \n");
      return;
    }
    // NOTE(review): the inner `i` shadows the outer loop index.
    for (int i = 0; i < dims.size(); i++) {
      if (i != axis) {
        if (in_dims[i] != dims[i]) {
          printf("input shape must be same \n");
          return;
        }
      }
    }
  }
}
// Run() for the fp32 Image2D concat: allocates the output image, then either
// launches a single "concat2" dispatch (two inputs) or one "concat_mul"
// dispatch per input with a running `start` offset along the concat axis.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::Run() {
  auto& param = *param_.get_mutable<param_t>();
  const auto& x_dims = param.output->dims();
  auto image_shape = InitImageDimInfoWith(x_dims);
  auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
      image_shape["width"], image_shape["height"]);
  const auto& y_dims = param.output->dims();  // useless: check dim only
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;
  auto inputs = param.x;
  int arg_idx = 0;
  // BUG FIX: originally `int width = inputs[0]->dims()[-1];` -- an
  // out-of-range DDim access. Read the last dim explicitly; the switch
  // below overwrites `width` for every supported axis anyway.
  const auto& in0_dims = inputs[0]->dims();
  int width = in0_dims[in0_dims.size() - 1];
  auto global_work_size =
      cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                  static_cast<cl::size_type>(image_shape["height"])};
  VLOG(4) << TargetToStr(param.output->target());
  VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
          << image_shape["height"];
  VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
          << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
  VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
          << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  // `flag` and `width` parameterize the addressing inside
  // image/concat_kernel.cl -- presumably flag==1 means the concat axis maps
  // onto the image's c*w direction; confirm against the .cl source.
  int flag = 1;  // cxw
  switch (axis_) {
    case 0:
      width = x_dims[2];  // n
      flag = 0;
      break;
    case 1:
      width = x_dims[3];  // c
      break;
    case 2:
      width = x_dims[0];  // h
      flag = 0;
      break;
    case 3:
    case -1:
      width = x_dims[1];  // w
      break;
    default:
      // NOTE(review): an unsupported axis only logs and still launches the
      // kernel with the fallback (last-dim) width.
      printf("this axis: %d does not support \n", axis_);
  }
  if (inputs.size() == 2) {
    auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
    auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
    cl_int status = kernel.setArg(arg_idx, *x_buf0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *x_buf1);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status =
        kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, flag);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, width);
    CL_CHECK_FATAL(status);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    context.cl_context()->GetCommandQueue().finish();
  } else {
    auto start = 0;
    for (int i = 0; i < inputs.size(); i++) {
      arg_idx = 0;
      auto* x_buf = inputs[i]->data<float, cl::Image2D>();
      cl_int status = kernel.setArg(arg_idx, *x_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, axis_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, start);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, flag);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, width);
      CL_CHECK_FATAL(status);  // (duplicated CL_CHECK_FATAL removed)
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_context()->GetCommandQueue().finish();
      start += inputs[i]->dims()[axis_];
    }
  }
}
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::doc() {
  // Short human-readable description of this kernel variant.
  static const char kDesc[] = "Concat using cl::Image, kFloat";
  return kDesc;
}
// PrepareForRun for the fp32 cl::Buffer variant: compiles the CL kernel from
// buffer/concat_kernel.cl ("concat2" for two inputs, "concat_mul" otherwise)
// and caches the concat geometry (pre/post/axis sizes). Also validates that
// inputs agree on every non-axis dim.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  concat_param_ = param_.get_mutable<param_t>();
  if (concat_param_->x.size() == 2) {
    kernel_func_name_ = "concat2";
  } else {
    kernel_func_name_ = "concat_mul";
  }
  context.cl_context()->AddKernel(
      kernel_func_name_, "buffer/concat_kernel.cl", build_options_);
  // UpdateParams<kFloat, kImageDefault>();
  auto axis = concat_param_->axis;
  auto inputs = concat_param_->x;
  auto out_dims = concat_param_->output->dims();
  auto* axis_tensor = concat_param_->axis_tensor;
  if (axis_tensor != nullptr) {
    // NOTE(review): AxisTensor is currently ignored; the runtime axis
    // override below stays disabled.
    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
    // axis = axis_tensor_data[0];
  }
  auto in_dims = inputs[0]->dims();
  axis_size_ = out_dims[axis];  // output extent along the concat axis
  axis_ = axis;
  for (int i = 0; i < axis; i++) {
    pre_size_ *= in_dims[i];  // product of dims before the axis
  }
  for (int i = axis + 1; i < in_dims.size(); i++) {
    post_size_ *= in_dims[i];  // product of dims after the axis
  }
  // Validate: all inputs must share rank and every non-axis dim.
  for (int i = 1; i < inputs.size(); i++) {
    auto dims = inputs[i]->dims();
    if (in_dims.size() != dims.size()) {
      printf("input shape must be same \n");
      return;
    }
    // NOTE(review): the inner `i` shadows the outer loop index.
    for (int i = 0; i < dims.size(); i++) {
      if (i != axis) {
        if (in_dims[i] != dims[i]) {
          printf("input shape must be same \n");
          return;
        }
      }
    }
  }
}
// Run() for the fp32 cl::Buffer concat: two inputs go through a single
// "concat2" dispatch; otherwise each input is dispatched via "concat_mul"
// with a running `start` offset along the concat axis. Completion events are
// registered on the wait list keyed by the output buffer.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::Run() {
  auto& param = *param_.get_mutable<param_t>();
  const auto& x_dims = param.output->dims();
  auto image_shape = InitImageDimInfoWith(x_dims);
  auto* out_buf =
      param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  const auto& y_dims = param.output->dims();  // useless: check dim only
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;
  auto inputs = param.x;
  int arg_idx = 0;
  // NOTE(review): brace-init of NDRange from an int narrows to size_t;
  // consider static_cast<size_t>(axis_size_) as done in the loop below.
  auto global_work_size = cl::NDRange{axis_size_};
  int total = axis_size_ * post_size_;
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  if (inputs.size() == 2) {
    auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
    auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
    auto axis0 = inputs[0]->dims()[axis_];
    int total0 = axis0 * post_size_;
    int total1 = (axis_size_ - axis0) * post_size_;
    cl_int status = kernel.setArg(arg_idx, *x_buf0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *x_buf1);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, axis_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, pre_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, post_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total1);
    CL_CHECK_FATAL(status);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Register the completion event so consumers can sync on out_buf.
    context.cl_wait_list()->emplace(out_buf, event_);
  } else {
    auto start = 0;
    for (int i = 0; i < inputs.size(); i++) {
      arg_idx = 0;
      int size = inputs[i]->dims()[axis_];
      auto* x_buf = inputs[i]->data<float, cl::Buffer>();
      global_work_size = cl::NDRange{static_cast<size_t>(size)};
      int total0 = size * post_size_;
      cl_int status = kernel.setArg(arg_idx, *x_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, static_cast<int>(size));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, pre_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, post_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, start);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total0);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_wait_list()->emplace(out_buf, event_);
      start += size;
    }
  }
}
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::doc() {
  // Short human-readable description of this kernel variant.
  static const char kDesc[] = "Concat using cl::Buffer, kFloat";
  return kDesc;
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Kernel class aliases used by the registration macros below.
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
                                                     DATALAYOUT(kNCHW)>
    Concat_buffer;
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
                                                     DATALAYOUT(kImageDefault)>
    Concat_image;
// Registers the Image2D-based fp32 concat kernel. AxisTensor (optional
// runtime axis) is declared as an Int32 input even though the kernel
// currently ignores it -- see PrepareForRun().
REGISTER_LITE_KERNEL(
    concat, kOpenCL, kFloat, kImageDefault, Concat_image, ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
// REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
// .BindInput("X",
// {LiteType::GetTensorTy(TARGET(kOpenCL),
// PRECISION(kFloat),
// DATALAYOUT(kNCHW))})
// .BindInput("AxisTensor",
// {LiteType::GetTensorTy(TARGET(kOpenCL),
// PRECISION(kInt32),
// DATALAYOUT(kNCHW))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kOpenCL),
// PRECISION(kFloat),
// DATALAYOUT(kNCHW))})
// .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Concat over cl::Image2D memory (fp16). Image data is stored as uint16_t
// (half) and the CL kernel is compiled with -DCL_DTYPE_half.
class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
                                             PRECISION(kFP16),
                                             DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ConcatParam;

  // Compiles "concat2" (two inputs) or "concat_mul" (N inputs) from
  // image/concat_kernel.cl and caches the concat geometry
  // (pre/post/axis sizes). Also validates that inputs agree on rank and on
  // every non-axis dim.
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    concat_param_ = param_.get_mutable<param_t>();
    if (concat_param_->x.size() == 2) {
      kernel_func_name_ = "concat2";
    } else {
      kernel_func_name_ = "concat_mul";
    }
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/concat_kernel.cl", build_options_);

    auto axis = concat_param_->axis;
    auto inputs = concat_param_->x;
    auto out_dims = concat_param_->output->dims();
    auto* axis_tensor = concat_param_->axis_tensor;
    if (axis_tensor != nullptr) {
      // TODO(review): AxisTensor is accepted by the registration but the
      // runtime axis override is still disabled.
      // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
      // axis = axis_tensor_data[0];
    }
    auto in_dims = inputs[0]->dims();
    axis_size_ = out_dims[axis];
    axis_ = axis;
    for (int i = 0; i < axis; i++) {
      pre_size_ *= in_dims[i];
    }
    for (int i = axis + 1; i < in_dims.size(); i++) {
      post_size_ *= in_dims[i];
    }
    // All inputs must match in rank and in every dim except `axis`.
    // (The inner index is renamed `j`; it previously shadowed the outer `i`.)
    for (int i = 1; i < inputs.size(); i++) {
      auto dims = inputs[i]->dims();
      if (in_dims.size() != dims.size()) {
        printf("input shape must be same \n");
        return;
      }
      for (int j = 0; j < dims.size(); j++) {
        if (j != axis) {
          if (in_dims[j] != dims[j]) {
            printf("input shape must be same \n");
            return;
          }
        }
      }
    }
  }

  // Launches the image concat kernel: a single "concat2" dispatch for two
  // inputs, or one "concat_mul" dispatch per input with a running `start`
  // offset along the concat axis.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.output->dims();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf = param.output->mutable_data<uint16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.output->dims();  // useless: check dim only

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto inputs = param.x;
    int arg_idx = 0;
    // BUG FIX: originally `int width = inputs[0]->dims()[-1];` -- an
    // out-of-range DDim access. Read the last dim explicitly; the switch
    // below overwrites `width` for every supported axis anyway.
    const auto& in0_dims = inputs[0]->dims();
    int width = in0_dims[in0_dims.size() - 1];
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    VLOG(4) << TargetToStr(param.output->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];

    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    // `flag` and `width` parameterize the addressing inside
    // image/concat_kernel.cl -- presumably flag==1 means the concat axis
    // maps onto the image's c*w direction; confirm against the .cl source.
    int flag = 1;  // cxw
    switch (axis_) {
      case 0:
        width = x_dims[2];  // n
        flag = 0;
        break;
      case 1:
        width = x_dims[3];  // c
        break;
      case 2:
        width = x_dims[0];  // h
        flag = 0;
        break;
      case 3:
      case -1:
        width = x_dims[1];  // w
        break;
      default:
        // NOTE(review): an unsupported axis only logs and still launches
        // the kernel with the fallback (last-dim) width.
        printf("this axis: %d does not support \n", axis_);
    }
    if (inputs.size() == 2) {
      auto* x_buf0 = inputs[0]->data<uint16_t, cl::Image2D>();
      auto* x_buf1 = inputs[1]->data<uint16_t, cl::Image2D>();
      cl_int status = kernel.setArg(arg_idx, *x_buf0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *x_buf1);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status =
          kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, flag);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, width);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_context()->GetCommandQueue().finish();
    } else {
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
        arg_idx = 0;
        auto* x_buf = inputs[i]->data<uint16_t, cl::Image2D>();
        cl_int status = kernel.setArg(arg_idx, *x_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, axis_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, start);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, flag);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, width);
        CL_CHECK_FATAL(status);  // (duplicated CL_CHECK_FATAL removed)
        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            global_work_size,
            cl::NullRange,
            nullptr,
            event_.get());
        CL_CHECK_FATAL(status);
        context.cl_context()->GetCommandQueue().finish();
        start += inputs[i]->dims()[axis_];
      }
    }
  }

  std::string doc() { return "Concat using cl::Image, kFP16"; }

  // Cached concat geometry, filled in PrepareForRun().
  int axis_size_ = 1;  // output extent along the concat axis
  int post_size_ = 1;  // product of dims after the axis
  int pre_size_ = 1;   // product of dims before the axis
  int axis_ = 1;
  param_t* concat_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Registers the Image2D-based fp16 concat kernel. AxisTensor (optional
// runtime axis) is declared as an Int32 input even though the kernel
// currently ignores it -- see PrepareForRun().
typedef paddle::lite::kernels::opencl::ConcatComputeImage Concat_image;
REGISTER_LITE_KERNEL(
    concat, kOpenCL, kFP16, kImageDefault, Concat_image, ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
......@@ -18,6 +18,9 @@
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
......@@ -73,106 +76,10 @@ void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
}
}
}
#if 0 // concat_buffer
TEST(opencl_concat_buffer, compute) {
// prepare data
const DDim x0_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
const DDim x1_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
const DDim x2_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
const DDim out_dim = DDim(std::vector<DDim::value_type>{1, 6, 3, 4});
lite::Tensor x0, x1, x2, out, out_ref;
x0.Resize(x0_dim);
x1.Resize(x1_dim);
x2.Resize(x2_dim);
out.Resize(out_dim);
out_ref.Resize(out_dim);
auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-10, 10);
auto *mapped_x0 = static_cast<float *>(
TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
auto *mapped_x1 = static_cast<float *>(
TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
auto *mapped_x2 = static_cast<float *>(
TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
for (int i = 0; i < x0_dim.production(); i++) {
mapped_x0[i] = dist(engine);
}
for (int i = 0; i < x1_dim.production(); i++) {
mapped_x1[i] = dist(engine);
}
for (int i = 0; i < x2_dim.production(); i++) {
mapped_x2[i] = dist(engine);
}
// set param and kernel, then run
operators::ConcatParam param;
std::vector<lite::Tensor *> ins;
ins.push_back(&x0);
ins.push_back(&x1);
ins.push_back(&x2);
auto axis = 1;
param.x = ins;
param.output = &out;
param.axis = axis;
std::vector<const float *> ins_data;
std::vector<const DDim> ins_dim;
ins_data.push_back(mapped_x0);
ins_data.push_back(mapped_x1);
ins_data.push_back(mapped_x2);
ins_dim.push_back(x0_dim);
ins_dim.push_back(x1_dim);
ins_dim.push_back(x2_dim);
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
auto kernels = KernelRegistry::Global().Create(
"concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
kernel->SetParam(param);
std::unique_ptr<KernelContext> concat_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(concat_context->As<OpenCLContext>()));
kernel->SetContext(std::move(concat_context));
kernel->Launch();
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = param.output->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
// run compute ref and check
auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
concat_mul_compute_ref<float>(ins_data, ins_dim, axis, out_dim, out_ref_data);
auto *out_data = out.mutable_data<float, cl::Buffer>();
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
}
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x0_data, mapped_x0);
TargetWrapperCL::Unmap(x1_data, mapped_x1);
TargetWrapperCL::Unmap(x2_data, mapped_x2);
}
#endif // concat_buffer
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(concat_image2d_fp32, compute) {
TEST(concat_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> "
"layout(img2buf) "
"-> host";
......@@ -209,7 +116,7 @@ TEST(concat_image2d_fp32, compute) {
auto concat_img_kernels =
KernelRegistry::Global().Create("concat",
TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels1.empty());
......@@ -284,14 +191,18 @@ TEST(concat_image2d_fp32, compute) {
for (int i = 0; i < out_dim.production(); ++i) {
mapped_y[i] = static_cast<int>(0);
}
auto *concat_in_data0 = concat_in0.mutable_data<float, cl::Image2D>(
concat_image2d_shape_in0["width"],
concat_image2d_shape_in0["height"]);
auto *concat_in_data1 = concat_in1.mutable_data<float, cl::Image2D>(
concat_image2d_shape_in1["width"],
concat_image2d_shape_in1["height"]);
auto *concat_out_data = concat_out.mutable_data<float, cl::Image2D>(
concat_image2d_shape["width"], concat_image2d_shape["height"]);
auto *concat_in_data0 =
concat_in0.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape_in0["width"],
concat_image2d_shape_in0["height"]);
auto *concat_in_data1 =
concat_in1.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape_in1["width"],
concat_image2d_shape_in1["height"]);
auto *concat_out_data =
concat_out.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape["width"],
concat_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
......@@ -347,22 +258,35 @@ TEST(concat_image2d_fp32, compute) {
#ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < out_dim.production(); ++eidx) {
std::cout << mapped_x0[eidx] << ", " << mapped_x1[eidx] << " -> "
<< mapped_y[eidx] << std::endl;
std::cout << "x0[" << eidx << "]:" << mapped_x0[eidx] << ",\t x1["
<< eidx << "]:" << mapped_x1[eidx] << " -> y[" << eidx
<< "]:" << mapped_y[eidx] << "\t, y_ref[" << eidx
<< "]:" << y_data_ref[eidx] << ",\t IS_DIFF_PASSED:"
<< IS_DIFF_PASSED(
y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF)
<< std::endl;
}
#endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x0_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx];
for (int i = 0; i < out_dim.production(); i++) {
auto abs_diff = abs(y_data_ref[i] - mapped_y[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
(abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) &&
(abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " mapped_y[" << i
<< "]:" << mapped_y[i] << " y_data_ref[" << i
<< "]:" << y_data_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data0, mapped_x0);
......@@ -382,9 +306,9 @@ TEST(concat_image2d_fp32, compute) {
} // namespace paddle
// concat buffer
// USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def);
// USE_LITE_KERNEL(concat, kOpenCL, kFP16, kNCHW, def);
// concat image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(concat, kOpenCL, kFloat, kImageDefault, ImageDefault);
USE_LITE_KERNEL(concat, kOpenCL, kFP16, kImageDefault, ImageDefault);
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/conv_compute.h"
#include "lite/kernels/opencl/conv_buffer_compute.h"
#include <sstream>
......@@ -1431,50 +1431,14 @@ void ConvImageCompute::Run() { (this->*impl_)(); }
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(conv2d,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::ConvCompute,
// def)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(conv2d,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ConvImageCompute,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ConvImageCompute,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
kNCHW,
paddle::lite::kernels::opencl::ConvCompute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
......@@ -58,34 +58,6 @@ class ConvCompute
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override;
void Run() override;
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3();
void DepthwiseConv2d();
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
};
} // namespace opencl
} // namespace kernels
} // namespace lite
......
......@@ -167,7 +167,6 @@ void PrintData(std::string name,
}
// buffer
#if 0
// #define PRINT_RESULT
#define LOOP_TEST
TEST(conv2d, compute_conv2d_1x1) {
......@@ -625,9 +624,8 @@ TEST(conv2d, compute_conv2d_gemm) {
} // batch_size
#endif
}
#endif
} // namespace lite
} // namespace paddle
// USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def);
此差异已折叠。
......@@ -11,41 +11,50 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
template <PrecisionType Ptype, DataLayoutType layout>
class ConcatCompute : public KernelLite<TARGET(kOpenCL), Ptype, layout> {
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConcatParam;
using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override;
void Run() override;
std::string doc(); // override;
// protected:
// void UpdateParams();
int axis_size_ = 1;
int post_size_ = 1;
int pre_size_ = 1;
int axis_ = 1;
param_t* concat_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_float"};
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3();
void DepthwiseConv2d();
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
};
} // namespace opencl
......
......@@ -105,6 +105,7 @@ int ConvOutputSize(int input_size,
return output_size;
}
// #define LOOP_TEST
TEST(depthwise_conv2d_basic, compute) {
// conv infos
// const int ksize = 1;
......@@ -144,7 +145,7 @@ TEST(depthwise_conv2d_basic, compute) {
auto kernels =
KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
......@@ -252,14 +253,14 @@ TEST(depthwise_conv2d_basic, compute) {
paddle::lite::CLImageConverterDefault default_convertor;
VLOG(4) << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * input_image_height *
4); // 4 : RGBA
std::vector<float> filter_image_v(
std::vector<uint16_t> x_image_v(input_image_width *
input_image_height * 4); // 4 : RGBA
std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(bias_image_width * bias_image_height *
4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * out_image_height *
4); // 4 : RGBA
std::vector<uint16_t> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<uint16_t> out_image_v(out_image_width * out_image_height *
4); // 4 : RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
......@@ -269,9 +270,9 @@ TEST(depthwise_conv2d_basic, compute) {
nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
auto* filter_image2d = filter.mutable_data<float, cl::Image2D>(
auto* filter_image2d = filter.mutable_data<uint16_t, cl::Image2D>(
filter_image_width, filter_image_height, filter_image_v.data());
if (bias_flag) {
......@@ -284,7 +285,7 @@ TEST(depthwise_conv2d_basic, compute) {
CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage(
bias_v.data(), bias_image_v.data(), bias_dim);
auto* bias_data = bias.mutable_data<float, cl::Image2D>(
auto* bias_data = bias.mutable_data<uint16_t, cl::Image2D>(
bias_image_width, bias_image_height, bias_image_v.data());
}
......@@ -300,11 +301,11 @@ TEST(depthwise_conv2d_basic, compute) {
VLOG(4) << "kernel launch ...";
kernel->Launch();
VLOG(4) << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
......@@ -318,7 +319,7 @@ TEST(depthwise_conv2d_basic, compute) {
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
output.data<uint16_t, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
......@@ -387,7 +388,7 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
......@@ -433,11 +434,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
default_converter->InitImageDimInfoWith(input.dims());
LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " "
<< input_image_shape[1];
std::vector<float> input_image_data(input_image_shape.production() *
4); // 4 : RGBA
std::vector<uint16_t> input_image_data(input_image_shape.production() *
4); // 4 : RGBA
default_converter->NCHWToImage(
input_v.data(), input_image_data.data(), input.dims());
auto* input_image = input.mutable_data<int16_t, cl::Image2D>(
auto* input_image = input.mutable_data<uint16_t, cl::Image2D>(
input_image_shape[0], input_image_shape[1], input_image_data.data());
LOG(INFO) << "prepare kernel";
......@@ -446,11 +447,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims());
LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " "
<< filter_image_shape[1];
std::vector<float> filter_image_data(filter_image_shape.production() *
4); // 4 : RGBA
std::vector<uint16_t> filter_image_data(filter_image_shape.production() *
4); // 4 : RGBA
nw_converter->NCHWToImage(
filter_v.data(), filter_image_data.data(), filter.dims());
auto* filter_image = filter.mutable_data<int16_t, cl::Image2D>(
auto* filter_image = filter.mutable_data<uint16_t, cl::Image2D>(
filter_image_shape[0], filter_image_shape[1], filter_image_data.data());
LOG(INFO) << "launch";
......@@ -459,13 +460,13 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
default_converter->InitImageDimInfoWith(output.dims());
LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " "
<< output_image_shape[1];
auto* output_image = output.mutable_data<int16_t, cl::Image2D>(
auto* output_image = output.mutable_data<uint16_t, cl::Image2D>(
output_image_shape[0], output_image_shape[1]);
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<int16_t, cl::Image2D>();
auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
......@@ -490,7 +491,8 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
float* output_image_data = new float[output_image_shape.production() * 4];
uint16_t* output_image_data =
new uint16_t[output_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(output_image_data,
output_image,
output_image_shape[0],
......@@ -512,4 +514,4 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kImageDefault, image2d);
USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFP16, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/elementwise_add_buffer_compute.h"
#include <memory>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// One-time setup: compiles/caches the buffer-based elementwise_add
// OpenCL kernel, grabs the operator parameters, and precomputes the
// batch/channels/num decomposition used when launching the kernel.
void ElementwiseAddCompute::PrepareForRun() {
  ele_param_ = param_.get_mutable<param_t>();
  auto& ocl_ctx = ctx_->As<OpenCLContext>();
  ocl_ctx.cl_context()->AddKernel(
      kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
  UpdateParams();
}
// Launches the buffer-based elementwise_add kernel.
// Binds X, Y and Out cl::Buffers plus the batch_/channels_/num_
// decomposition computed in UpdateParams(), enqueues the kernel over a
// (channels_, batch_) 2-D range, and registers the completion event so
// downstream kernels can wait on Out.
void ElementwiseAddCompute::Run() {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  auto* x_buf = ele_param_->X->template data<float, cl::Buffer>();
  auto* y_buf = ele_param_->Y->template data<float, cl::Buffer>();
  auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
      TARGET(kOpenCL));
  STL::stringstream kernel_key;
  // Kernel cache key combines function name and build options so
  // differently-compiled variants do not collide.
  kernel_key << kernel_func_name_ << build_options_;
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << TargetToStr(ele_param_->X->target());
  VLOG(4) << TargetToStr(ele_param_->Y->target());
  VLOG(4) << TargetToStr(ele_param_->Out->target());
  // Argument order must match the .cl kernel signature:
  // (x, y, out, batch, channels, num).
  int arg_idx = 0;
  cl_int status = kernel.setArg(arg_idx, *x_buf);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *y_buf);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *out_buf);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, (const int)batch_);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, (const int)channels_);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, (const int)num_);
  CL_CHECK_FATAL(status);
  // Each work-item handles one (channel, batch) slice of `num_` elements.
  auto global_work_size = cl::NDRange{channels_, batch_};
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Publish the event keyed by the output buffer so consumers of Out
  // can synchronize on this kernel's completion.
  context.cl_wait_list()->emplace(out_buf, event_);
}
void ElementwiseAddCompute::UpdateParams() {
auto axis = ele_param_->axis;
const auto& x_dims = ele_param_->X->dims();
const auto& y_dims = ele_param_->Y->dims();
const auto& out_dims = ele_param_->Out->dims();
if (axis < 0) {
axis = static_cast<int>(x_dims.size() - y_dims.size());
}
for (int i = 0; i < axis; ++i) {
batch_ *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels_ *= y_dims[i];
}
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
REGISTER_LITE_KERNEL(
elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
......@@ -49,28 +49,6 @@ class ElementwiseAddCompute
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ElementwiseParam;
void PrepareForRun() override;
void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFloat";
}
protected:
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{" -DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/elementwise_add_compute.h"
#include "lite/kernels/opencl/elementwise_add_image_compute.h"
#include <memory>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
......@@ -23,80 +23,6 @@ namespace lite {
namespace kernels {
namespace opencl {
/* Buffer */
#if 0
void ElementwiseAddCompute::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
ele_param_ = param_.get_mutable<param_t>();
UpdateParams();
}
void ElementwiseAddCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x_buf = ele_param_->X->template data<float, cl::Buffer>();
auto* y_buf = ele_param_->Y->template data<float, cl::Buffer>();
auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(ele_param_->X->target());
VLOG(4) << TargetToStr(ele_param_->Y->target());
VLOG(4) << TargetToStr(ele_param_->Out->target());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)batch_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)channels_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)num_);
CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{channels_, batch_};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
}
void ElementwiseAddCompute::UpdateParams() {
auto axis = ele_param_->axis;
const auto& x_dims = ele_param_->X->dims();
const auto& y_dims = ele_param_->Y->dims();
const auto& out_dims = ele_param_->Out->dims();
if (axis < 0) {
axis = static_cast<int>(x_dims.size() - y_dims.size());
}
for (int i = 0; i < axis; ++i) {
batch_ *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels_ *= y_dims[i];
}
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
}
#endif
/* Image2D */
void ElementwiseAddImageCompute::PrepareForRun() {
ele_param_ = param_.get_mutable<param_t>();
auto* x = ele_param_->X;
......@@ -152,10 +78,10 @@ void ElementwiseAddImageCompute::Run() {
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<float, cl::Image2D>();
auto* y_img = y->data<float, cl::Image2D>();
auto* out_img =
out->mutable_data<float, cl::Image2D>(out_img_shape[0], out_img_shape[1]);
auto* x_img = x->data<uint16_t, cl::Image2D>();
auto* y_img = y->data<uint16_t, cl::Image2D>();
auto* out_img = out->mutable_data<uint16_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]);
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
......@@ -220,14 +146,7 @@ void ElementwiseAddImageCompute::Run() {
namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL(
// elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
// TODO(ysh329): Not fix.
// TODO(ysh329): May need fix.
// "Y" may from constant value like conv bias (kARM, need do cl_image_converter
// on CPU);
// may from anther branch like "X" (kOpenCL, nothing to do).
......@@ -235,20 +154,20 @@ namespace ocl = paddle::lite::kernels::opencl;
// set target of "Y" as kOpenCL temporarily.
REGISTER_LITE_KERNEL(elementwise_add,
kOpenCL,
kFloat,
kFP16,
kImageDefault,
ocl::ElementwiseAddImageCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
......@@ -15,7 +15,6 @@
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
......@@ -25,25 +24,25 @@ namespace lite {
namespace kernels {
namespace opencl {
class ElementwiseMulFloatImageCompute
class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ElementwiseParam;
std::string doc() const override {
return "ElementwiseMul using cl::Image2D(ImageDefault/RGBA), kFP32";
}
void PrepareForRun() override;
void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFP16";
}
protected:
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_mul"};
std::string build_options_{"-DCL_DTYPE_float"};
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
此差异已折叠。
......@@ -111,7 +111,7 @@ void elementwise_compute_ref(const dtype *x_data,
}
// #define PRINT_RESULT
TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
TEST(elementwise_mul_image, compute) {
LOG(INFO)
<< "main steps of test: host -> layout(buf2img on cpu) -> elemul(img) -> "
"layout(img2buf on cpu) "
......@@ -151,9 +151,10 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h
auto x_img_w = x_img_shape[0];
auto x_img_h = x_img_shape[1];
std::vector<float> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA
std::vector<uint16_t> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA
default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
elemul_x.mutable_data<float, cl::Image2D>(x_img_w, x_img_h, x_img_v.data());
elemul_x.mutable_data<uint16_t, cl::Image2D>(
x_img_w, x_img_h, x_img_v.data());
// y
std::vector<float> y_v(y_dim.production());
......@@ -161,19 +162,21 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h
auto y_img_w = y_img_shape[0];
auto y_img_h = y_img_shape[1];
std::vector<float> y_img_v(y_img_shape[0] * y_img_shape[1] * 4); // 4: RGBA
std::vector<uint16_t> y_img_v(y_img_shape[0] * y_img_shape[1] *
4); // 4: RGBA
default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
elemul_y.mutable_data<float, cl::Image2D>(y_img_w, y_img_h, y_img_v.data());
elemul_y.mutable_data<uint16_t, cl::Image2D>(
y_img_w, y_img_h, y_img_v.data());
// out
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out_dim); // w, h
auto out_img_w = out_img_shape[0];
auto out_img_h = out_img_shape[1];
elemul_out.mutable_data<float, cl::Image2D>(out_img_w, out_img_h);
elemul_out.mutable_data<uint16_t, cl::Image2D>(out_img_w, out_img_h);
std::vector<float> out_img_v(out_img_w * out_img_h * 4);
fill_data<float>(
std::vector<uint16_t> out_img_v(out_img_w * out_img_h * 4);
fill_data<uint16_t>(
out_img_v.data(), out_img_v.size(), 0); // fill with zero value
std::vector<float> out_v(out_dim.production());
......@@ -189,7 +192,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto elemul_img_kernels =
KernelRegistry::Global().Create("elementwise_mul",
TARGET(kOpenCL),
PRECISION(kFloat),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(elemul_img_kernels.empty());
......@@ -215,7 +218,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(out_img_v.data(),
elemul_out.data<float, cl::Image2D>(),
elemul_out.data<uint16_t, cl::Image2D>(),
out_img_w,
out_img_h,
cl_image2d_row_pitch,
......@@ -266,4 +269,4 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFloat, kImageDefault, def);
USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFP16, kImageDefault, def);
......@@ -102,7 +102,7 @@ class MulCompute
private:
int m_, n_, k_;
std::string kernel_func_name_{"mat_mul"};
std::string build_options_{"-DCL_DTYPE=float"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册