提交 55b45651 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] support fp16 for cl_image_converter, layout, activation all...

[LITE][OPENCL] support fp16 for cl_image_converter, layout, activation all OpenCL image kernel. test=develop (#2964)

* [LITE][OPENCL] support fp16 for cl_image_converter, layout, activation image kernel. test=develop

* add conv, depthwise and UT. test=develop

* add pool, conv, nearest_interp kernel. test=develop

* support fp16 for scale, reshape, concat, fc buffer opencl kernel. test=develop

* refactor for mul opencl buffer kernel. test=develop

* support fp16 for elementwise_mul opecl image kernel. test=develop

* support fp16 for elementwise_mul opencl image kernel. test=develop

* support fp16 for ele_add, fuse_ele_add_act opencl kernel. test=develop

* rename io_copy. test=develop

* mobilenetv1,v2 passed on 855. test=develop

* fix opt for opencl. test=develop
上级 119dafcd
...@@ -81,29 +81,65 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -81,29 +81,65 @@ void TestModel(const std::vector<Place>& valid_places,
auto* out = predictor.GetOutput(0); auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>(); const auto* pdata = out->data<float>();
int step = 50; int step = 50;
#ifdef LITE_WITH_NPU
ASSERT_EQ(out->dims().production(), 1000); // Get target and check result
double eps = 0.1; VLOG(1) << "valid_places.size():" << valid_places.size();
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < valid_places.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) { auto p = valid_places[i];
auto result = pdata[j * step + (out->dims()[1] * i)]; VLOG(1) << "valid_places[" << i << "]:" << p.DebugString();
auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); }
VLOG(3) << diff; auto first_target = valid_places[0].target;
EXPECT_LT(diff, eps);
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
auto diff = std::fabs((result - ref[i][j]) / ref[i][j]);
VLOG(3) << diff;
EXPECT_LT(diff, eps);
}
}
} else {
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
double eps = 1e-6;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
EXPECT_NEAR(result, ref[i][j], eps);
}
} }
} }
#else
ASSERT_EQ(out->dims().size(), 2); // Get detailed result
ASSERT_EQ(out->dims()[0], 1); auto* pred = &predictor;
ASSERT_EQ(out->dims()[1], 1000); size_t output_tensor_num = pred->GetOutputNames().size();
double eps = 1e-6; VLOG(1) << "output tesnor num:" << output_tensor_num;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) { for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
auto result = pdata[j * step + (out->dims()[1] * i)]; std::unique_ptr<const Tensor> output_tensor(
EXPECT_NEAR(result, ref[i][j], eps); std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
// print result
for (int i = 0; i < out_dims.production(); ++i) {
VLOG(2) << "output_tensor->data<float>()[" << i
<< "]:" << output_tensor->data<float>()[i];
} }
} }
#endif
} }
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
...@@ -130,7 +166,7 @@ TEST(MobileNetV1, test_arm) { ...@@ -130,7 +166,7 @@ TEST(MobileNetV1, test_arm) {
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
TEST(MobileNetV1, test_opencl) { TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({ std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
......
...@@ -83,27 +83,65 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -83,27 +83,65 @@ void TestModel(const std::vector<Place>& valid_places,
auto* out = predictor.GetOutput(0); auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>(); const auto* pdata = out->data<float>();
int step = 50; int step = 50;
#ifdef LITE_WITH_NPU
ASSERT_EQ(out->dims().production(), 1000); // Get target and check result
double eps = 0.1; VLOG(1) << "valid_places.size():" << valid_places.size();
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < valid_places.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) { auto p = valid_places[i];
auto result = pdata[j * step + (out->dims()[1] * i)]; VLOG(1) << "valid_places[" << i << "]:" << p.DebugString();
auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); }
VLOG(3) << diff; auto first_target = valid_places[0].target;
EXPECT_LT(diff, eps);
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
auto diff = std::fabs((result - ref[i][j]) / ref[i][j]);
VLOG(3) << diff;
EXPECT_LT(diff, eps);
}
}
} else {
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
double eps = 1e-6;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
EXPECT_NEAR(result, ref[i][j], eps);
}
} }
} }
#else
ASSERT_EQ(out->dims().size(), 2); // Get detailed result
ASSERT_EQ(out->dims()[0], 1); auto* pred = &predictor;
ASSERT_EQ(out->dims()[1], 1000); size_t output_tensor_num = pred->GetOutputNames().size();
for (int i = 0; i < ref.size(); ++i) { VLOG(1) << "output tesnor num:" << output_tensor_num;
for (int j = 0; j < ref[i].size(); ++j) {
EXPECT_NEAR(pdata[j * step + (out->dims()[1] * i)], ref[i][j], 1e-6); for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor(
std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
// print result
for (int i = 0; i < out_dims.production(); ++i) {
VLOG(2) << "output_tensor->data<float>()[" << i
<< "]:" << output_tensor->data<float>()[i];
} }
} }
#endif
} }
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
...@@ -130,7 +168,7 @@ TEST(MobileNetV2, test_arm) { ...@@ -130,7 +168,7 @@ TEST(MobileNetV2, test_arm) {
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
TEST(MobileNetV2, test_opencl) { TEST(MobileNetV2, test_opencl) {
std::vector<Place> valid_places({ std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
......
...@@ -91,7 +91,7 @@ std::vector<Place> ParserValidPlaces() { ...@@ -91,7 +91,7 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kARM)); valid_places.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") { } else if (target_repr == "opencl") {
valid_places.emplace_back( valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}); Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places.emplace_back( valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back( valid_places.emplace_back(
......
...@@ -6,7 +6,8 @@ lite_cc_library(cl_wrapper SRCS cl_wrapper.cc) ...@@ -6,7 +6,8 @@ lite_cc_library(cl_wrapper SRCS cl_wrapper.cc)
lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper) lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper)
lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility) lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility)
lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime) lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime)
lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor) lite_cc_library(cl_half SRCS cl_half.cc)
lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor cl_half)
lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime) lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime)
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
......
...@@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context, ...@@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context,
int width = cl_image.image_dims()[0]; int width = cl_image.image_dims()[0];
int height = cl_image.image_dims()[1]; int height = cl_image.image_dims()[1];
float* image_data = new float[height * width * 4]; uint16_t* image_data = new uint16_t[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = { cl::array<size_t, 3> region = {
......
此差异已折叠。
...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cl_common.h> #pragma once
#include <cstdint>
__kernel void relu(__read_only image2d_t input, namespace paddle {
__write_only image2d_t output) { namespace lite {
const int x = get_global_id(0); // image_width typedef uint16_t half_t;
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | half_t Float2Half(float f);
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); float Half2Float(half_t h);
in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
}
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace lite
} // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "lite/backends/opencl/cl_image.h" #include "lite/backends/opencl/cl_image.h"
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#include "lite/backends/opencl/cl_utility.h" #include "lite/backends/opencl/cl_utility.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -24,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { ...@@ -24,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
int width = cl_image.image_dims_[0]; int width = cl_image.image_dims_[0];
int height = cl_image.image_dims_[1]; int height = cl_image.image_dims_[1];
float* image_data = new float[height * width * 4]; uint16_t* image_data = new uint16_t[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
...@@ -123,7 +124,7 @@ void CLImage::InitCLImage(const cl::Context& context, ...@@ -123,7 +124,7 @@ void CLImage::InitCLImage(const cl::Context& context,
VLOG(3) << " begin init cl image "; VLOG(3) << " begin init cl image ";
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
float* image_data = new float[image_dims_.production() * 4]; uint16_t* image_data = new uint16_t[image_dims_.production() * 4];
VLOG(3) << " convert to image "; VLOG(3) << " convert to image ";
converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_); converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_);
......
...@@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterDefault::NCHWToImage(float *nchw, void CLImageConverterDefault::NCHWToImage(float *nchw,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) { for (size_t j = 0; j < tensor_dim.size(); ++j) {
...@@ -69,7 +69,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -69,7 +69,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
if (c < C) { if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4); // (c % 4);
image[i2] = *p; image[i2] = Float2Half(*p);
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
...@@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
} }
} }
void CLImageConverterDefault::ImageToNCHW(float *image, void CLImageConverterDefault::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image, ...@@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
*p = image[i2]; *p = Half2Float(image[i2]);
i2 += 4; i2 += 4;
p++; p++;
} }
...@@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterFolder::NCHWToImage(float *tensor, void CLImageConverterFolder::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!"; << " Tensor dim is not support!";
...@@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, ...@@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor,
for (size_t h = 0; h < tdim[0]; h++) { for (size_t h = 0; h < tdim[0]; h++) {
for (size_t w = 0; w < tdim[1]; w++) { for (size_t w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
} }
} }
} }
} }
void CLImageConverterFolder::ImageToNCHW(float *image, void CLImageConverterFolder::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image, ...@@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
} }
} }
} }
...@@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterNWBlock::NCHWToImage(float *tensor, void CLImageConverterNWBlock::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
auto image_dim = InitImageDimInfoWith(tensor_dim); auto image_dim = InitImageDimInfoWith(tensor_dim);
...@@ -257,7 +258,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -257,7 +258,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4; w * 4 + n % 4;
if (n < N) { if (n < N) {
image[index] = *p; image[index] = Float2Half(*p);
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = 0.0;
...@@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
VLOG(3) << " init done"; VLOG(3) << " init done";
} }
void CLImageConverterNWBlock::ImageToNCHW(float *image, void CLImageConverterNWBlock::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image, ...@@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image,
for (size_t w = 0; w < W; ++w) { for (size_t w = 0; w < W; ++w) {
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4; w * 4 + n % 4;
*p = image[index]; *p = Half2Float(image[index]);
p++; p++;
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
...@@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterDWBlock::NCHWToImage(float *tensor, void CLImageConverterDWBlock::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) { for (size_t j = 0; j < tensor_dim.size(); ++j) {
...@@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, ...@@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
if (c < C) { if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4); // (c % 4);
image[i2] = *p; image[i2] = Float2Half(*p);
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
...@@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, ...@@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
} }
} }
void CLImageConverterDWBlock::ImageToNCHW(float *image, void CLImageConverterDWBlock::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image, ...@@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
*p = image[i2]; *p = Half2Float(image[i2]);
i2 += 4; i2 += 4;
p++; p++;
} }
...@@ -418,7 +419,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -418,7 +419,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterNormal::NCHWToImage(float *tensor, void CLImageConverterNormal::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!"; << " Tensor dim is not support!";
...@@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, ...@@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor,
default_converter.NCHWToImage(tensor, image, tensor_dim); default_converter.NCHWToImage(tensor, image, tensor_dim);
} }
void CLImageConverterNormal::ImageToNCHW(float *image, void CLImageConverterNormal::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( ...@@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
} }
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) {} const DDim &tensor_dim) {}
void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) {} const DDim &tensor_dim) {}
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "lite/backends/opencl/cl_half.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
namespace paddle { namespace paddle {
...@@ -24,10 +25,10 @@ class CLImageConverterBase { ...@@ -24,10 +25,10 @@ class CLImageConverterBase {
virtual ~CLImageConverterBase() {} virtual ~CLImageConverterBase() {}
virtual void NCHWToImage(float *nchw, virtual void NCHWToImage(float *nchw,
float *image, half_t *image,
const DDim &tensor_dim) = 0; const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(float *image, virtual void ImageToNCHW(half_t *image,
float *nchw, float *nchw,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) = 0; const DDim &tensor_dim) = 0;
...@@ -37,8 +38,8 @@ class CLImageConverterBase { ...@@ -37,8 +38,8 @@ class CLImageConverterBase {
class CLImageConverterDefault : public CLImageConverterBase { class CLImageConverterDefault : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override; void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase { ...@@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase { ...@@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase { ...@@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase { ...@@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase { ...@@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
// #define DEBUG
// buffer -> image2d // buffer -> image2d
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
...@@ -27,6 +28,7 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -27,6 +28,7 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H; const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H; const int out_h = out_nh % out_H;
...@@ -47,29 +49,92 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -47,29 +49,92 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
output_pos.x = out_c * out_W + out_w; output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh; output_pos.y = out_nh;
CL_DTYPE4 output = (CL_DTYPE4)0.0f; CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f);
output.x = convert_float(in[input_pos0]); output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE);
if(out_C - 4 * out_c >= 2){
output.y = convert_float(in[input_pos1]); if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE);
}
if (out_C - 4 * out_c >= 3) {
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE);
}
if (out_C - 4 * out_c >= 4) {
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE);
} }
if(out_C - 4 * out_c >= 3){
output.z = convert_float(in[input_pos2]); #ifdef DEBUG
if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n",
out_w,
out_C - 4 * out_c,
(float)(in[input_pos0]),
(float)(in[input_pos1]),
(float)(in[input_pos2]),
(float)(in[input_pos3]));
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh,
output_pos.x, output_pos.y,
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w));
} }
if(out_C - 4 * out_c >= 4){ #endif
output.w = convert_float(in[input_pos3]);
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
}
// image2d -> buffer
__kernel void image2d_to_buffer(__read_only image2d_t input,
__private const int in_width,
__private const int in_height,
__global CL_DTYPE* out,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__private const int C) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG
if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh,
pos_x, in_nh,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w));
}
#endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
}
if(C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
}
if(C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
} }
write_imagef(output_image, output_pos, output);
} }
#if 0
// buffer -> image2d_nw // buffer -> image2d_nw
__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
__private const int out_N, __private const int out_N,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2) { __private const int Stride2) {
const int out_n = get_global_id(0); const int out_n = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_ch = get_global_id(2); const int out_ch = get_global_id(2);
...@@ -97,55 +162,23 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -97,55 +162,23 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
output_pos.y = out_ch; output_pos.y = out_ch;
CL_DTYPE4 output = (CL_DTYPE4)0.0f; CL_DTYPE4 output = (CL_DTYPE4)0.0f;
output.x = convert_float(in[input_pos0]); output.x = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos0]);
if (out_N - 4 * out_n >= 2) { if (out_N - 4 * out_n >= 2) {
output.y = convert_float(in[input_pos1]); output.y = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos1]);
} }
if (out_N - 4 * out_n >= 3) { if (out_N - 4 * out_n >= 3) {
output.z = convert_float(in[input_pos2]); output.z = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos2]);
} }
if (out_N - 4 * out_n >= 4) { if (out_N - 4 * out_n >= 4) {
output.w = convert_float(in[input_pos3]); output.w = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos3]);
} }
write_imagef(output_image, output_pos, output);
}
// image2d -> buffer
__kernel void image2d_to_buffer(__read_only image2d_t input,
__private const int in_width,
__private const int in_height,
__global CL_DTYPE* out,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__private const int C) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler = WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
CL_DTYPE4 in = read_imagef(input, sampler, (int2)(pos_x, in_nh));
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_float(in.x);
if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_float(in.y);
}
if(C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_float(in.z);
}
if(C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_float(in.w);
}
} }
#endif
#if 0
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -157,11 +190,12 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -157,11 +190,12 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = read_imagef(input, sampler, (int2)(in_w, in_h)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(in_w, in_h));
const int index = (in_h * in_width + in_w) * 4; const int index = (in_h * in_width + in_w) * 4;
out[index] = convert_float(in.x); out[index] = CONVERT_TYPE_TO(CL_DTYPE, in.x);
out[index + 1] = convert_float(in.y); out[index + 1] = CONVERT_TYPE_TO(CL_DTYPE, in.y);
out[index + 2] = convert_float(in.z); out[index + 2] = CONVERT_TYPE_TO(CL_DTYPE, in.z);
out[index + 3] = convert_float(in.w); out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w);
} }
#endif
...@@ -29,11 +29,15 @@ limitations under the License. */ ...@@ -29,11 +29,15 @@ limitations under the License. */
#ifdef CL_DTYPE_float #ifdef CL_DTYPE_float
#define CL_DTYPE float #define CL_DTYPE float
#define CL_DTYPE_CHAR f #define CL_DTYPE_CHAR f
#define CL_COMPUTE_DTYPE half
#define CL_COMPUTE_DTYPE_CHAR h
#endif #endif
#ifdef CL_DTYPE_half #ifdef CL_DTYPE_half
#define CL_DTYPE half #define CL_DTYPE half
#define CL_DTYPE_CHAR h #define CL_DTYPE_CHAR h
#define CL_COMPUTE_DTYPE half
#define CL_COMPUTE_DTYPE_CHAR h
#endif #endif
///////////////////////////////// /////////////////////////////////
...@@ -43,6 +47,7 @@ limitations under the License. */ ...@@ -43,6 +47,7 @@ limitations under the License. */
#define GET_VEC_TYPE(type__, size__) type__##size__ #define GET_VEC_TYPE(type__, size__) type__##size__
#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__) #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4) #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
#define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4)
///////////////////////////////// /////////////////////////////////
// CONVERT_TYPE_TO // CONVERT_TYPE_TO
......
...@@ -14,6 +14,23 @@ limitations under the License. */ ...@@ -14,6 +14,23 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
// relu activation over an RGBA image2d: out = max(in, 0), per channel.
__kernel void relu(__read_only image2d_t input,
                   __write_only image2d_t output) {
  const int col = get_global_id(0);  // image_width
  const int row = get_global_id(1);  // image_height

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 pos = (int2)(col, row);
  CL_DTYPE4 v = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, pos);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, pos, max((CL_DTYPE4)(0.0f), v));
}
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold){ __private const float threshold){
...@@ -30,3 +47,19 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -30,3 +47,19 @@ __kernel void relu6(__read_only image2d_t input,
in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
// sigmoid activation over an RGBA image2d: out = 1 / (1 + exp(-in)).
__kernel void sigmoid(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int x = get_global_id(0);  // image_width
  const int y = get_global_id(1);  // image_height

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coord = (int2)(x, y);
  const CL_DTYPE4 val = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coord);
  const CL_DTYPE4 res = 1 / (1 + exp(-val));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, coord, res);
}
...@@ -12,26 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,26 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #include <cl_common.h>
__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
__private const float scale_h, __private const float scale_w,
__private const int in_dims_h, __private const int out_dims_h, __kernel void nearest_interp(__read_only image2d_t input,
__private const int in_dims_w, __private const int out_dims_w) { __write_only image2d_t output,
const int c = get_global_id(0); __private const float scale_h,
const int w = get_global_id(1); __private const float scale_w,
const int nh = get_global_id(2); __private const int in_dims_h,
int2 output_pos; __private const int out_dims_h,
output_pos.x = c * out_dims_w + w; __private const int in_dims_w,
output_pos.y = nh; __private const int out_dims_w) {
int out_n = nh / out_dims_h;
int out_h = nh % out_dims_h; const int c = get_global_id(0);
int2 input_pos; const int w = get_global_id(1);
input_pos.x = c * in_dims_w + w / scale_w; const int nh = get_global_id(2);
input_pos.y = out_n * in_dims_h + out_h / scale_h;
int2 output_pos;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | output_pos.x = c * out_dims_w + w;
CLK_ADDRESS_CLAMP | output_pos.y = nh;
CLK_FILTER_NEAREST;
half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y)); int out_n = nh / out_dims_h;
write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data); int out_h = nh % out_dims_h;
int2 input_pos;
input_pos.x = c * in_dims_w + w / scale_w;
input_pos.y = out_n * in_dims_h + out_h / scale_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(input_pos.x, input_pos.y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(output_pos.x , output_pos.y), input_data);
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// sigmoid applied element-wise to each RGBA channel of the input image:
//   out = 1 / (1 + exp(-in))
__kernel void sigmoid(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int x = get_global_id(0);  // image_width
  const int y = get_global_id(1);  // image_height

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  CL_DTYPE4 value = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  value = 1 / (1 + exp(-value));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), value);
}
...@@ -81,10 +81,10 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width, ...@@ -81,10 +81,10 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
return cl_image; return cl_image;
} }
template <> // use int16_t represents half float template <> // use uint16_t represents half float
void *TargetWrapperCL::MallocImage<int16_t>(const size_t cl_image2d_width, void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
const size_t cl_image2d_height, const size_t cl_image2d_height,
void *host_ptr) { void *host_ptr) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16))); cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16)));
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
......
...@@ -178,5 +178,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector<Place>& valid_places) { ...@@ -178,5 +178,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector<Place>& valid_places) {
REGISTER_MIR_PASS(type_precision_cast_pass, REGISTER_MIR_PASS(type_precision_cast_pass,
paddle::lite::mir::PrecisionCastPass) paddle::lite::mir::PrecisionCastPass)
.BindTargets({TARGET(kAny)}) .BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kOpenCL)})
.BindKernel("calib_once") .BindKernel("calib_once")
.BindKernel("calib"); .BindKernel("calib");
...@@ -103,8 +103,8 @@ const cl::Image2D *TensorLite::data<float, cl::Image2D>() const { ...@@ -103,8 +103,8 @@ const cl::Image2D *TensorLite::data<float, cl::Image2D>() const {
return static_cast<const cl::Image2D *>(buffer_->data()); return static_cast<const cl::Image2D *>(buffer_->data());
} }
template <> // use int16_t represent half float template <> // use uint16_t represent half float
const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const { const cl::Image2D *TensorLite::data<uint16_t, cl::Image2D>() const {
if (nullptr == buffer_->data()) return nullptr; if (nullptr == buffer_->data()) return nullptr;
return static_cast<const cl::Image2D *>(buffer_->data()); return static_cast<const cl::Image2D *>(buffer_->data());
} }
......
...@@ -260,8 +260,8 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) { ...@@ -260,8 +260,8 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) {
template <> template <>
const cl::Image2D *TensorLite::data<float, cl::Image2D>() const; const cl::Image2D *TensorLite::data<float, cl::Image2D>() const;
template <> // use int16_t represent half float template <> // use uint16_t represent half float
const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const; const cl::Image2D *TensorLite::data<uint16_t, cl::Image2D>() const;
#endif #endif
} // namespace lite } // namespace lite
......
...@@ -4,91 +4,136 @@ endif() ...@@ -4,91 +4,136 @@ endif()
set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter) set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter)
add_kernel(fc_opencl OPENCL basic SRCS fc_compute.cc DEPS ${cl_kernel_deps}) #####################
add_kernel(mul_opencl OPENCL basic SRCS mul_compute.cc DEPS ${cl_kernel_deps}) # image kernel #
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps}) #####################
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_compute.cc DEPS ${cl_kernel_deps}) # basic
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl add_kernel(fusion_elementwise_add_activation_opencl
OPENCL basic SRCS fusion_elementwise_add_activation_compute.cc OPENCL basic SRCS fusion_elementwise_add_activation_image_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps}) DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps})
add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps}) add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(sigmoid_opencl OPENCL basic SRCS sigmoid_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_compute.cc DEPS ${cl_kernel_deps}) add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps}) add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_compute.cc DEPS ${cl_kernel_deps}) add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc # extra
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context # wait to add ...
######################
# image kernel test #
######################
lite_cc_test(test_activation_image_opencl SRCS activation_image_compute_test.cc
DEPS activation_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_elementwise_mul_opencl SRCS elementwise_mul_compute_test.cc lite_cc_test(test_conv_image_opencl SRCS conv_image_compute_test.cc
DEPS elementwise_mul_opencl op_registry program context DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_pool_opencl SRCS pool_compute_test.cc lite_cc_test(test_depthwise_conv2d_image_opencl SRCS depthwise_conv2d_image_compute_test.cc
DEPS pool_opencl op_registry program context DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc lite_cc_test(test_nearest_interp_image_opencl SRCS nearest_interp_image_compute_test.cc
DEPS fc_opencl op_registry program context DEPS nearest_interp_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_pool_image_opencl SRCS pool_image_compute_test.cc
DEPS pool_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
# TODO(ysh329): comment for buffer-impl mul lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc
#lite_cc_test(test_mul_opencl SRCS mul_compute_test.cc DEPS scale_opencl op_registry program context
# DEPS mul_opencl op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc
DEPS io_copy_compute_opencl op_registry program context DEPS reshape_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#TODO(ysh329): comment buffer-impl relu lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc DEPS concat_opencl layout_opencl op_registry program context
DEPS relu_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_sigmoid_opencl SRCS sigmoid_compute_test.cc lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_compute_test.cc
DEPS sigmoid_opencl layout_opencl op_registry program context DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context DEPS layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_image2d_opencl SRCS depthwise_conv2d_image2d_compute_test.cc lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS conv_opencl op_registry program context DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc
DEPS reshape_opencl op_registry program context ######################
# buffer kernel #
######################
# basic
#add_kernel(activation_opencl OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl
# OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc
# DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
# extra
# wait to add ...
######################
# buffer kernel test #
######################
#lite_cc_test(test_activation_buffer_opencl SRCS activation_buffer_compute_test.cc
# DEPS activation_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
# DEPS conv_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
# DEPS depthwise_conv2d_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_pool_buffer_opencl SRCS pool_buffer_compute_test.cc
# DEPS pool_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_concat_buffer_opencl SRCS concat_buffer_compute_test.cc
# DEPS concat_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
DEPS fc_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
DEPS conv_opencl op_registry program context DEPS mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc #lite_cc_test(test_elementwise_add_buffer_opencl SRCS elementwise_add__buffer_compute_test.cc
DEPS conv_opencl op_registry program context cl_image_converter # DEPS elementwise_add_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) # ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc
DEPS layout_opencl op_registry program context cl_image_converter DEPS io_copy_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_concat_opencl SRCS concat_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_scale_opencl SRCS scale_compute_test.cc
DEPS scale_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Relu on cl::Buffer inputs (kFloat, NCHW): out[i] = max(x[i], 0).
class ReluCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override { return "Relu using cl::Buffer, kFloat"; }

  // Compiles "relu" from buffer/relu_kernel.cl into the context's program
  // cache so Run() can fetch it by (function name + build options).
  void PrepareForRun() override {
    auto& ocl_ctx = ctx_->As<OpenCLContext>();
    ocl_ctx.cl_context()->AddKernel(
        kernel_func_name_, "buffer/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& act_param = *param_.get_mutable<param_t>();
    const size_t count = act_param.X->dims().production();

    auto& ocl_ctx = ctx_->As<OpenCLContext>();
    CHECK(ocl_ctx.cl_context() != nullptr);

    VLOG(4) << TargetToStr(act_param.X->target());
    VLOG(4) << TargetToStr(act_param.Out->target());

    auto* in_buf = act_param.X->data<float, cl::Buffer>();
    auto* out_buf =
        act_param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    // Cached kernels are keyed by "name + build options".
    STL::stringstream key;
    key << kernel_func_name_ << build_options_;
    auto kernel = ocl_ctx.cl_context()->GetKernel(key.str());

    // Kernel args: (0) input buffer, (1) element count, (2) output buffer.
    int idx = 0;
    cl_int status = kernel.setArg(idx, *in_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++idx, (const int)count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++idx, *out_buf);
    CL_CHECK_FATAL(status);

    // 1-D launch, one work-item per element; driver picks the local size.
    status = ocl_ctx.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        cl::NDRange{count},
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Publish the completion event so consumers of out_buf can sync on it.
    ocl_ctx.cl_wait_list()->emplace(out_buf, event_);
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// Sigmoid on cl::Buffer inputs (kFloat, NCHW): out[i] = 1 / (1 + exp(-x[i])).
class SigmoidCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Sigmoid using cl::Buffer, kFloat";
  }

  // Registers "sigmoid" from buffer/sigmoid_kernel.cl in the program cache.
  void PrepareForRun() override {
    ctx_->As<OpenCLContext>().cl_context()->AddKernel(
        kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const size_t count = param.X->dims().production();

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto* x_buf = param.X->data<float, cl::Buffer>();
    auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());

    // Kernel lookup key is "function name + build options".
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    // Kernel args: input buffer, element count, output buffer — in order.
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *x_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, (const int)count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);

    // 1-D launch: one work-item per element.
    const auto global_work_size = cl::NDRange{count};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Record the event keyed by the output buffer for downstream syncing.
    context.cl_wait_list()->emplace(out_buf, event_);
  }

 private:
  std::string kernel_func_name_{"sigmoid"};
  std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Relu
// Registers the buffer-based ReluCompute kernel for the `relu` op on
// OpenCL, kFloat precision, NCHW layout; both X and Out live on kOpenCL.
REGISTER_LITE_KERNEL(relu,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::opencl::ReluCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
// Sigmoid
// Registers the buffer-based SigmoidCompute kernel for the `sigmoid` op on
// OpenCL, kFloat precision, NCHW layout; both X and Out live on kOpenCL.
REGISTER_LITE_KERNEL(sigmoid,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::opencl::SigmoidCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();  // fix: statement previously lacked its terminating ';'
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>

#include <cmath>
#include <random>

#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Reference (CPU) relu used to validate the OpenCL kernel.
// threshold == 0  -> plain relu:  out[i] = max(x[i], 0)
// threshold != 0  -> clipped relu (relu6 when threshold == 6):
//                    out[i] = min(max(x[i], 0), threshold)
template <typename dtype>
void relu_compute_ref(const dtype *x_data,
                      const DDim &x_dim,
                      dtype *out_data,
                      float threshold = 0.f) {
  // Use std::abs so the floating-point overload is selected; a bare `abs`
  // can resolve to C's `int abs(int)` and truncate the threshold to 0.
  if (std::abs(threshold) < 1e-5) {
    // relu
    for (int i = 0; i < x_dim.production(); ++i) {
      out_data[i] = (x_data[i] > threshold) ? x_data[i] : threshold;
    }
  } else {
    // relu6 or relu with threshold
    for (int i = 0; i < x_dim.production(); ++i) {
      auto out_tmp = (x_data[i] > 0) ? x_data[i] : 0;
      out_data[i] = (out_tmp < threshold) ? out_tmp : threshold;
    }
  }
}
// Reference (CPU) sigmoid used to validate the OpenCL kernel:
//   out[i] = 1 / (1 + exp(-x[i]))
template <typename dtype>
void sigmoid_compute_ref(const dtype *x_data,
                         const DDim &x_dim,
                         dtype *out_data) {
  for (int i = 0; i < x_dim.production(); ++i) {
    // std::exp picks the overload matching `dtype`; the previous expf
    // forced float math even when dtype is double.
    out_data[i] = 1 / (1 + std::exp(-x_data[i]));
  }
}
// End-to-end check of the buffer relu kernel: fill a mapped device buffer
// with random floats, launch the registered "relu" OpenCL kernel, wait on
// its event, then compare each output element against relu_compute_ref.
TEST(opencl_relu_buffer, compute) {
  // prepare data
  const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
  lite::Tensor x, out;
  x.Resize(x_dim);
  out.Resize(x_dim);

  auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  // Map the device buffer so the host can fill it; kept mapped until the
  // end because mapped_x doubles as the reference input below.
  auto *mapped_x = static_cast<float *>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ActivationParam param;
  param.X = &x;
  param.Out = &out;

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  auto kernels = KernelRegistry::Global().Create(
      "relu", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());

  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  // Share the OpenCL runtime (context/queue) with the kernel's own context.
  std::unique_ptr<KernelContext> relu_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(relu_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(relu_context));

  kernel->Launch();

  // Launch() is asynchronous: block on the event the kernel registered
  // for its output buffer in the wait list before reading results.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
  relu_compute_ref<float>(mapped_x, x_dim, out_ref.get());

  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x_data, mapped_x);
}
// End-to-end check of the OpenCL buffer-based sigmoid kernel: upload random
// input, launch the kernel, synchronize on its event, then compare the
// device output element-wise against sigmoid_compute_ref on the host.
TEST(opencl_sigmoid_buffer, compute) {
  // prepare data
  const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
  lite::Tensor x, out;
  x.Resize(x_dim);
  out.Resize(x_dim);
  auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  // Map the device buffer into host memory so it can be filled directly;
  // mapped_x stays valid until the Unmap at the end of the test.
  auto *mapped_x = static_cast<float *>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }
  // set param and kernel, then run
  operators::ActivationParam param;
  param.X = &x;
  param.Out = &out;
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  // Look up the buffer-precision sigmoid kernel registered for OpenCL.
  auto kernels = KernelRegistry::Global().Create(
      "sigmoid", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  // The kernel gets its own context that shares the OpenCL runtime state.
  std::unique_ptr<KernelContext> sigmoid_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(sigmoid_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(sigmoid_context));
  kernel->Launch();
  // Block on the event associated with the output buffer before reading it.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }
  // run compute ref and check
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
  sigmoid_compute_ref<float>(mapped_x, x_dim, out_ref.get());
  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x_data, mapped_x);
}
} // namespace lite
} // namespace paddle
// Force linkage of the buffer-based kernel registrations exercised above.
// sigmoid buffer
USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kNCHW, def);
// relu buffer
USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
...@@ -24,44 +24,55 @@ namespace lite { ...@@ -24,44 +24,55 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class SigmoidCompute class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> { PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ActivationParam; using param_t = operators::ActivationParam;
std::string doc() const override { std::string doc() const override {
return "Sigmoid using cl::Buffer, kFloat"; return "Relu using cl::Image2D(ImageDefault/RGBA), kFP16";
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
size_t count = x_dims.production(); auto* x_buf = param.X->data<uint16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
auto* x_buf = param.X->data<float, cl::Buffer>();
auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
int arg_idx = 0; int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf); cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)count);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{count}; VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
...@@ -70,40 +81,42 @@ class SigmoidCompute ...@@ -70,40 +81,42 @@ class SigmoidCompute
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_); // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
} }
private: private:
std::string kernel_func_name_{"sigmoid"}; std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"}; std::string build_options_{"-DCL_DTYPE_half -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class SigmoidComputeFloatImageDefault class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
: public KernelLite<TARGET(kOpenCL), PRECISION(kFP16),
PRECISION(kFloat), DATALAYOUT(kImageDefault)> {
DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ActivationParam; using param_t = operators::ActivationParam;
std::string doc() const override { std::string doc() const override {
return "Sigmoid using cl::Image2D(ImageDefault/RGBA), kFloat"; return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<float, cl::Image2D>(); auto* x_buf = param.X->data<uint16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<float, cl::Image2D>( auto* out_buf = param.Out->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only const auto& y_dims = param.Out->dims(); // useless: check dim only
auto threshold = param.Relu_clipped_coef;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
...@@ -116,6 +129,8 @@ class SigmoidComputeFloatImageDefault ...@@ -116,6 +129,8 @@ class SigmoidComputeFloatImageDefault
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << TargetToStr(param.Out->target());
...@@ -125,6 +140,7 @@ class SigmoidComputeFloatImageDefault ...@@ -125,6 +140,7 @@ class SigmoidComputeFloatImageDefault
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3]; << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "threshold:" << threshold;
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]), cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
...@@ -143,12 +159,12 @@ class SigmoidComputeFloatImageDefault ...@@ -143,12 +159,12 @@ class SigmoidComputeFloatImageDefault
} }
private: private:
std::string kernel_func_name_{"sigmoid"}; std::string kernel_func_name_{"relu6"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"}; std::string build_options_{"-DCL_DTYPE_half -DRELU6"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class SigmoidComputeFP16ImageDefault class SigmoidComputeImageDefault
: public KernelLite<TARGET(kOpenCL), : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
...@@ -162,19 +178,19 @@ class SigmoidComputeFP16ImageDefault ...@@ -162,19 +178,19 @@ class SigmoidComputeFP16ImageDefault
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = auto* x_buf =
param.X->data<int16_t, param.X->data<uint16_t,
cl::Image2D>(); // use int16_t represents half float cl::Image2D>(); // use uint16_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = auto* out_buf =
param.Out->mutable_data<int16_t, cl::Image2D>( // use int16_t param.Out->mutable_data<uint16_t, cl::Image2D>( // use uint16_t
// represents half float // represents half float
image_shape["width"], image_shape["width"],
image_shape["height"]); image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only const auto& y_dims = param.Out->dims(); // useless: check dim only
...@@ -227,40 +243,47 @@ class SigmoidComputeFP16ImageDefault ...@@ -227,40 +243,47 @@ class SigmoidComputeFP16ImageDefault
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(sigmoid, // Relu
// kOpenCL, REGISTER_LITE_KERNEL(relu,
// kFloat, kOpenCL,
// kNCHW, kFP16,
// paddle::lite::kernels::opencl::SigmoidCompute, kImageDefault,
// def) paddle::lite::kernels::opencl::ReluComputeImageDefault,
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))}) ImageDefault)
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(
sigmoid,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFloatImageDefault,
ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Relu6
REGISTER_LITE_KERNEL(relu6,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::Relu6ComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL( // Sigmoid
sigmoid, REGISTER_LITE_KERNEL(sigmoid,
kOpenCL, kOpenCL,
kFP16, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFP16ImageDefault, paddle::lite::kernels::opencl::SigmoidComputeImageDefault,
ImageDefault) ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Concat kernel operating on OpenCL cl::Buffer data (float, NCHW layout).
//
// NOTE(review): the original declared the KernelLite template parameters as
// <kFP16, kImageDefault>, although the kernel reads/writes float cl::Buffer
// data, builds with -DCL_DTYPE_float, and is registered under
// (kFloat, kNCHW). The template parameters are corrected to match the
// registration and the actual data type.
class ConcatCompute : public KernelLite<TARGET(kOpenCL),
                                        PRECISION(kFloat),
                                        DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ConcatParam;

  // Compiles the proper kernel ("concat2" for exactly two inputs,
  // "concat_mul" otherwise) and caches the products of the dimensions
  // before (pre_size_) and after (post_size_) the concat axis.
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    concat_param_ = param_.get_mutable<param_t>();
    if (concat_param_->x.size() == 2) {
      kernel_func_name_ = "concat2";
    } else {
      kernel_func_name_ = "concat_mul";
    }
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/concat_kernel.cl", build_options_);

    auto axis = concat_param_->axis;
    auto inputs = concat_param_->x;
    auto out_dims = concat_param_->output->dims();
    auto* axis_tensor = concat_param_->axis_tensor;
    if (axis_tensor != nullptr) {
      // TODO(review): read the runtime axis from axis_tensor once a host
      // copy of its data is available.
      // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
      // axis = axis_tensor_data[0];
    }
    auto in_dims = inputs[0]->dims();
    axis_size_ = out_dims[axis];
    axis_ = axis;
    for (int i = 0; i < axis; i++) {
      pre_size_ *= in_dims[i];
    }
    for (int i = axis + 1; i < in_dims.size(); i++) {
      post_size_ *= in_dims[i];
    }
    // Every input must match the first input's shape on all non-concat axes.
    for (int i = 1; i < inputs.size(); i++) {
      auto dims = inputs[i]->dims();
      if (in_dims.size() != dims.size()) {
        printf("input shape must be same \n");
        return;
      }
      // fix: loop index renamed to `j`; it previously shadowed the outer `i`.
      for (int j = 0; j < dims.size(); j++) {
        if (j != axis) {
          if (in_dims[j] != dims[j]) {
            printf("input shape must be same \n");
            return;
          }
        }
      }
    }
  }

  // Launches either the two-input fast path or one enqueue per input,
  // writing every input's slice into the shared output buffer.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    // fix: removed unused `image_shape`/`x_dims`/`y_dims` locals — this is
    // the buffer kernel and never touches image dimensions.
    auto* out_buf =
        param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;

    auto inputs = param.x;
    int arg_idx = 0;
    auto global_work_size = cl::NDRange{axis_size_};
    int total = axis_size_ * post_size_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    if (inputs.size() == 2) {
      // Fast path: both inputs handled by a single "concat2" launch.
      auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
      auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
      auto axis0 = inputs[0]->dims()[axis_];
      int total0 = axis0 * post_size_;
      int total1 = (axis_size_ - axis0) * post_size_;
      cl_int status = kernel.setArg(arg_idx, *x_buf0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *x_buf1);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, axis_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, pre_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, post_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total1);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_wait_list()->emplace(out_buf, event_);
    } else {
      // Generic path: one "concat_mul" launch per input, each writing its
      // slice starting at `start` along the concat axis.
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
        arg_idx = 0;
        int size = inputs[i]->dims()[axis_];
        auto* x_buf = inputs[i]->data<float, cl::Buffer>();
        global_work_size = cl::NDRange{static_cast<size_t>(size)};
        int total0 = size * post_size_;
        cl_int status = kernel.setArg(arg_idx, *x_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, static_cast<int>(size));
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, pre_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, post_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, start);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, total);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, total0);
        CL_CHECK_FATAL(status);
        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            global_work_size,
            cl::NullRange,
            nullptr,
            event_.get());
        CL_CHECK_FATAL(status);
        context.cl_wait_list()->emplace(out_buf, event_);
        start += size;
      }
    }
  }

  // fix: marked `const override` — the base class declares a virtual
  // `doc() const`, so the original non-const version hid it instead of
  // overriding it.
  std::string doc() const override { return "Concat using cl::Buffer, kFloat"; }

  int axis_size_ = 1;   // size of the output along the concat axis
  int post_size_ = 1;   // product of dims after the concat axis
  int pre_size_ = 1;    // product of dims before the concat axis
  int axis_ = 1;        // concat axis resolved in PrepareForRun
  param_t* concat_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the buffer-based concat kernel under (kOpenCL, kFloat, kNCHW).
typedef paddle::lite::kernels::opencl::ConcatCompute Concat_buffer;
REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    // Optional tensor holding the concat axis (currently not consumed by
    // PrepareForRun — the attribute `axis` is used instead).
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Host-side reference implementation of 2-input concat along `axis`.
//
// fix: the original only copied the in0 slice of each output section and
// never consumed `in1`, so the second input was dropped from the reference
// result entirely. The missing else-branch is added below.
template <typename dtype>
void concat2_compute_ref(const dtype *in0,
                         const dtype *in1,
                         const int axis,
                         const DDim in0_dim,
                         const DDim in1_dim,
                         const DDim out_dim,
                         dtype *out_data) {
  // pre_size: product of dims before the axis; post_size: after it.
  int pre_size = 1;
  int post_size = 1;
  for (int i = 0; i < axis; i++) {
    pre_size *= in0_dim[i];
  }
  for (int i = axis + 1; i < in0_dim.size(); i++) {
    post_size *= in0_dim[i];
  }
  int axis_size = out_dim[axis];
  for (int i = 0; i < pre_size; i++) {
    for (int j = 0; j < axis_size; j++) {
      if (j < in0_dim[axis]) {
        // First part of each output section comes from in0.
        memcpy(out_data, in0, sizeof(dtype) * post_size);
        in0 += post_size;
      } else {
        // Remainder of the section comes from in1.
        memcpy(out_data, in1, sizeof(dtype) * post_size);
        in1 += post_size;
      }
      out_data += post_size;
    }
  }
}
// Host-side reference implementation of N-input concat along `axis`.
// `ins_data`/`ins_dim` are taken by value so the local pointer copies can be
// advanced freely. (NOTE(review): `std::vector<const DDim>` is a dubious
// element type, but the signature is kept to stay compatible with callers.)
//
// fix: the original never advanced ins_data[j] between `pre_size`
// iterations, so for pre_size > 1 every output section repeated the first
// slice of each input.
template <typename dtype>
void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
                            std::vector<const DDim> ins_dim,
                            int axis,
                            const DDim out_dim,
                            dtype *out_data) {
  // pre_size: product of dims before the axis; post_size: after it.
  int pre_size = 1;
  int post_size = 1;
  for (int i = 0; i < axis; i++) {
    pre_size *= ins_dim[0][i];
  }
  for (int i = axis + 1; i < ins_dim[0].size(); i++) {
    post_size *= ins_dim[0][i];
  }
  for (int i = 0; i < pre_size; i++) {
    for (int j = 0; j < ins_data.size(); j++) {
      int size = post_size * ins_dim[j][axis];
      memcpy(out_data, ins_data[j], sizeof(dtype) * size);
      ins_data[j] += size;  // advance this input for the next pre-section
      out_data += size;
    }
  }
}
// End-to-end check of the OpenCL buffer-based concat kernel: three inputs
// are concatenated along axis 1 and the device result is compared against
// concat_mul_compute_ref on the host. (Note: batch is 1, so pre_size == 1
// for the reference implementation.)
TEST(opencl_concat_buffer, compute) {
  // prepare data
  const DDim x0_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x1_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x2_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{1, 6, 3, 4});
  lite::Tensor x0, x1, x2, out, out_ref;
  x0.Resize(x0_dim);
  x1.Resize(x1_dim);
  x2.Resize(x2_dim);
  out.Resize(out_dim);
  out_ref.Resize(out_dim);
  auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  // Map each device buffer so the host can fill it with random values; the
  // mapped pointers stay valid until the Unmap calls at the end.
  auto *mapped_x0 = static_cast<float *>(
      TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
  auto *mapped_x1 = static_cast<float *>(
      TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
  auto *mapped_x2 = static_cast<float *>(
      TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
  for (int i = 0; i < x0_dim.production(); i++) {
    mapped_x0[i] = dist(engine);
  }
  for (int i = 0; i < x1_dim.production(); i++) {
    mapped_x1[i] = dist(engine);
  }
  for (int i = 0; i < x2_dim.production(); i++) {
    mapped_x2[i] = dist(engine);
  }
  // set param and kernel, then run
  operators::ConcatParam param;
  std::vector<lite::Tensor *> ins;
  ins.push_back(&x0);
  ins.push_back(&x1);
  ins.push_back(&x2);
  auto axis = 1;
  param.x = ins;
  param.output = &out;
  param.axis = axis;
  // Host-side views of the same inputs, consumed by the reference impl.
  std::vector<const float *> ins_data;
  std::vector<const DDim> ins_dim;
  ins_data.push_back(mapped_x0);
  ins_data.push_back(mapped_x1);
  ins_data.push_back(mapped_x2);
  ins_dim.push_back(x0_dim);
  ins_dim.push_back(x1_dim);
  ins_dim.push_back(x2_dim);
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  // Look up the buffer-precision concat kernel registered for OpenCL.
  auto kernels = KernelRegistry::Global().Create(
      "concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  // The kernel gets its own context that shares the OpenCL runtime state.
  std::unique_ptr<KernelContext> concat_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(concat_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(concat_context));
  kernel->Launch();
  // Block on the event associated with the output buffer before reading it.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.output->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }
  // run compute ref and check
  auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
  concat_mul_compute_ref<float>(ins_data, ins_dim, axis, out_dim, out_ref_data);
  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
  for (int i = 0; i < out_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x0_data, mapped_x0);
  TargetWrapperCL::Unmap(x1_data, mapped_x1);
  TargetWrapperCL::Unmap(x2_data, mapped_x2);
}
} // namespace lite
} // namespace paddle
// Force linkage of the buffer-based concat kernel registration used above.
// concat buffer
USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/concat_compute.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Image-layout specialization: compiles the proper kernel ("concat2" for
// exactly two inputs, "concat_mul" otherwise) and caches pre/post sizes
// around the concat axis.
template <>
void ConcatCompute<PRECISION(kFloat),
                   DATALAYOUT(kImageDefault)>::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  concat_param_ = param_.get_mutable<param_t>();
  if (concat_param_->x.size() == 2) {
    kernel_func_name_ = "concat2";
  } else {
    kernel_func_name_ = "concat_mul";
  }
  context.cl_context()->AddKernel(
      kernel_func_name_, "image/concat_kernel.cl", build_options_);
  auto axis = concat_param_->axis;
  auto inputs = concat_param_->x;
  auto out_dims = concat_param_->output->dims();
  auto* axis_tensor = concat_param_->axis_tensor;
  if (axis_tensor != nullptr) {
    // TODO(review): read the runtime axis from axis_tensor once a host copy
    // of its data is available.
    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
    // axis = axis_tensor_data[0];
  }
  auto in_dims = inputs[0]->dims();
  axis_size_ = out_dims[axis];
  axis_ = axis;
  // pre_size_: product of dims before the axis; post_size_: after it.
  for (int i = 0; i < axis; i++) {
    pre_size_ *= in_dims[i];
  }
  for (int i = axis + 1; i < in_dims.size(); i++) {
    post_size_ *= in_dims[i];
  }
  // Every input must match the first input's shape on all non-concat axes.
  for (int i = 1; i < inputs.size(); i++) {
    auto dims = inputs[i]->dims();
    // auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
    if (in_dims.size() != dims.size()) {
      printf("input shape must be same \n");
      return;
    }
    // fix: loop index renamed to `j`; it previously shadowed the outer `i`.
    for (int j = 0; j < dims.size(); j++) {
      if (j != axis) {
        if (in_dims[j] != dims[j]) {
          printf("input shape must be same \n");
          return;
        }
      }
    }
  }
}
// Image-layout specialization: launches the two-input fast path or one
// enqueue per input, writing each input's slice into the shared output
// image. Assumes 4-D dims (the VLOG/switch below index [0..3]).
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::Run() {
  auto& param = *param_.get_mutable<param_t>();
  const auto& x_dims = param.output->dims();
  auto image_shape = InitImageDimInfoWith(x_dims);
  auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
      image_shape["width"], image_shape["height"]);
  const auto& y_dims = param.output->dims();  // useless: check dim only
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;
  auto inputs = param.x;
  int arg_idx = 0;
  // fix: `width` was initialized from `inputs[0]->dims()[-1]`, an
  // out-of-range index read. The switch below assigns the real value for
  // every supported axis, so a neutral 0 initializer is used instead.
  int width = 0;
  auto global_work_size =
      cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                  static_cast<cl::size_type>(image_shape["height"])};
  VLOG(4) << TargetToStr(param.output->target());
  VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
          << image_shape["height"];
  VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
          << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
  VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
          << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  int flag = 1;  // cxw
  // Select the kernel's `width` argument according to the concat axis.
  switch (axis_) {
    case 0:
      width = x_dims[2];  // n
      flag = 0;
      break;
    case 1:
      width = x_dims[3];  // c
      break;
    case 2:
      width = x_dims[0];  // h
      flag = 0;
      break;
    case 3:
    case -1:
      width = x_dims[1];  // w
      break;
    default:
      printf("this axis: %d does not support \n", axis_);
  }
  if (inputs.size() == 2) {
    // Fast path: both inputs handled by a single "concat2" launch.
    auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
    auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
    cl_int status = kernel.setArg(arg_idx, *x_buf0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *x_buf1);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status =
        kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, flag);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, width);
    CL_CHECK_FATAL(status);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    context.cl_context()->GetCommandQueue().finish();
  } else {
    // Generic path: one "concat_mul" launch per input, each writing its
    // slice starting at `start` along the concat axis.
    auto start = 0;
    for (int i = 0; i < inputs.size(); i++) {
      arg_idx = 0;
      auto* x_buf = inputs[i]->data<float, cl::Image2D>();
      cl_int status = kernel.setArg(arg_idx, *x_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, axis_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, start);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, flag);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, width);
      CL_CHECK_FATAL(status);  // fix: removed duplicated CL_CHECK_FATAL call
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_context()->GetCommandQueue().finish();
      start += inputs[i]->dims()[axis_];
    }
  }
}
// Human-readable kernel description for the image-layout specialization,
// used by debugging/profiling output.
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::doc() {
  return "Concat using cl::Image, kFloat";
}
// Buffer/NCHW specialization: compiles the proper kernel ("concat2" for
// exactly two inputs, "concat_mul" otherwise) and caches pre/post sizes
// around the concat axis.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  concat_param_ = param_.get_mutable<param_t>();
  if (concat_param_->x.size() == 2) {
    kernel_func_name_ = "concat2";
  } else {
    kernel_func_name_ = "concat_mul";
  }
  context.cl_context()->AddKernel(
      kernel_func_name_, "buffer/concat_kernel.cl", build_options_);
  auto axis = concat_param_->axis;
  auto inputs = concat_param_->x;
  auto out_dims = concat_param_->output->dims();
  auto* axis_tensor = concat_param_->axis_tensor;
  if (axis_tensor != nullptr) {
    // TODO(review): read the runtime axis from axis_tensor once a host copy
    // of its data is available.
    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
    // axis = axis_tensor_data[0];
  }
  auto in_dims = inputs[0]->dims();
  axis_size_ = out_dims[axis];
  axis_ = axis;
  // pre_size_: product of dims before the axis; post_size_: after it.
  for (int i = 0; i < axis; i++) {
    pre_size_ *= in_dims[i];
  }
  for (int i = axis + 1; i < in_dims.size(); i++) {
    post_size_ *= in_dims[i];
  }
  // Every input must match the first input's shape on all non-concat axes.
  for (int i = 1; i < inputs.size(); i++) {
    auto dims = inputs[i]->dims();
    if (in_dims.size() != dims.size()) {
      printf("input shape must be same \n");
      return;
    }
    // fix: loop index renamed to `j`; it previously shadowed the outer `i`.
    for (int j = 0; j < dims.size(); j++) {
      if (j != axis) {
        if (in_dims[j] != dims[j]) {
          printf("input shape must be same \n");
          return;
        }
      }
    }
  }
}
// Enqueues the buffer-based concat kernel. Two code paths:
//  * exactly two inputs -> one "concat2" launch handling both;
//  * N inputs           -> one "concat_mul" launch per input, offset by
//                          `start` along the concat axis.
// The setArg sequence below must match the parameter order declared in
// buffer/concat_kernel.cl — do not reorder.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::Run() {
  auto& param = *param_.get_mutable<param_t>();
  const auto& x_dims = param.output->dims();
  // NOTE(review): image_shape is computed but unused on this buffer path.
  auto image_shape = InitImageDimInfoWith(x_dims);
  auto* out_buf =
      param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  const auto& y_dims = param.output->dims();  // useless: check dim only
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  // Kernel was registered in PrepareForRun under the name+build-options key.
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;
  auto inputs = param.x;
  int arg_idx = 0;
  // One work-item per slice along the concat axis.
  auto global_work_size = cl::NDRange{axis_size_};
  int total = axis_size_ * post_size_;
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  if (inputs.size() == 2) {
    // Fast path: both inputs consumed by a single "concat2" launch.
    auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
    auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
    auto axis0 = inputs[0]->dims()[axis_];
    int total0 = axis0 * post_size_;
    int total1 = (axis_size_ - axis0) * post_size_;
    cl_int status = kernel.setArg(arg_idx, *x_buf0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *x_buf1);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, axis_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, pre_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, post_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total1);
    CL_CHECK_FATAL(status);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Async completion: downstream readers of out_buf wait on this event.
    context.cl_wait_list()->emplace(out_buf, event_);
  } else {
    // Generic path: one "concat_mul" launch per input; `start` is the running
    // write offset along the concat axis.
    auto start = 0;
    for (int i = 0; i < inputs.size(); i++) {
      arg_idx = 0;
      int size = inputs[i]->dims()[axis_];
      auto* x_buf = inputs[i]->data<float, cl::Buffer>();
      global_work_size = cl::NDRange{static_cast<size_t>(size)};
      int total0 = size * post_size_;
      cl_int status = kernel.setArg(arg_idx, *x_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, static_cast<int>(size));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, pre_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, post_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, start);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total0);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      // NOTE(review): the same event_ is re-registered every iteration, so
      // only the last launch's event is awaited — confirm this is intended.
      context.cl_wait_list()->emplace(out_buf, event_);
      start += size;
    }
  }
}
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::doc() {
  // Short human-readable description of this kernel variant (used by the
  // framework for logging/diagnostics).
  static const std::string kDocString = "Concat using cl::Buffer, kFloat";
  return kDocString;
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Aliases for the kernel registrations below.
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
                                                     DATALAYOUT(kNCHW)>
    Concat_buffer;
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
                                                     DATALAYOUT(kImageDefault)>
    Concat_image;
// fp32 image2d concat kernel (registration alias "ImageDefault").
// NOTE(review): AxisTensor is bound as kInt32/kImageDefault but PrepareForRun
// never reads it — confirm the binding (and its layout) is correct.
REGISTER_LITE_KERNEL(
    concat, kOpenCL, kFloat, kImageDefault, Concat_image, ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
// Buffer-path (kNCHW) registration kept for reference; currently disabled.
// REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
//     .BindInput("X",
//                {LiteType::GetTensorTy(TARGET(kOpenCL),
//                                       PRECISION(kFloat),
//                                       DATALAYOUT(kNCHW))})
//     .BindInput("AxisTensor",
//                {LiteType::GetTensorTy(TARGET(kOpenCL),
//                                       PRECISION(kInt32),
//                                       DATALAYOUT(kNCHW))})
//     .BindOutput("Out",
//                 {LiteType::GetTensorTy(TARGET(kOpenCL),
//                                        PRECISION(kFloat),
//                                        DATALAYOUT(kNCHW))})
//     .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// FP16 concat kernel operating on cl::Image2D tensors (ImageDefault layout).
// Host-side FP16 image data is handled as raw half bits, hence uint16_t.
class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
                                             PRECISION(kFP16),
                                             DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ConcatParam;

  // Selects the OpenCL kernel ("concat2" for exactly two inputs, the generic
  // "concat_mul" otherwise) and caches the concat geometry derived from the
  // first input's shape.
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    concat_param_ = param_.get_mutable<param_t>();
    if (concat_param_->x.size() == 2) {
      kernel_func_name_ = "concat2";
    } else {
      kernel_func_name_ = "concat_mul";
    }
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/concat_kernel.cl", build_options_);
    auto axis = concat_param_->axis;
    auto inputs = concat_param_->x;
    auto out_dims = concat_param_->output->dims();
    auto* axis_tensor = concat_param_->axis_tensor;
    if (axis_tensor != nullptr) {
      // TODO(review): a runtime AxisTensor is currently ignored and the
      // `axis` attribute is always used — confirm this is intended.
      // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
      // axis = axis_tensor_data[0];
    }
    auto in_dims = inputs[0]->dims();
    axis_size_ = out_dims[axis];
    axis_ = axis;
    // pre_size_/post_size_ members default to 1, so accumulation is valid.
    for (int i = 0; i < axis; i++) {
      pre_size_ *= in_dims[i];
    }
    for (int i = axis + 1; i < in_dims.size(); i++) {
      post_size_ *= in_dims[i];
    }
    // Validate that every input matches input 0 on all non-axis dimensions.
    for (int i = 1; i < inputs.size(); i++) {
      auto dims = inputs[i]->dims();
      if (in_dims.size() != dims.size()) {
        printf("input shape must be same \n");
        return;
      }
      // Fix: the original inner loop reused `i`, shadowing the input index.
      for (int j = 0; j < dims.size(); j++) {
        if (j != axis) {
          if (in_dims[j] != dims[j]) {
            printf("input shape must be same \n");
            return;
          }
        }
      }
    }
  }

  // Enqueues the image-based concat kernel: a single "concat2" launch for two
  // inputs, or one "concat_mul" launch per input otherwise. The setArg order
  // must match image/concat_kernel.cl.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.output->dims();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf = param.output->mutable_data<uint16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.output->dims();  // useless: check dim only
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto inputs = param.x;
    int arg_idx = 0;
    // `width` is re-assigned per-axis in the switch below; initialize it from
    // the innermost dimension of the first input. (Fix: the original read
    // dims()[-1], which indexes DDim out of range.)
    int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    VLOG(4) << TargetToStr(param.output->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    int flag = 1;  // cxw
    switch (axis_) {
      case 0:
        width = x_dims[2];  // n
        flag = 0;
        break;
      case 1:
        width = x_dims[3];  // c
        break;
      case 2:
        width = x_dims[0];  // h
        flag = 0;
        break;
      case 3:
      case -1:
        width = x_dims[1];  // w
        break;
      default:
        printf("this axis: %d does not support \n", axis_);
    }
    if (inputs.size() == 2) {
      // Fast path: both inputs consumed by a single "concat2" launch.
      auto* x_buf0 = inputs[0]->data<uint16_t, cl::Image2D>();
      auto* x_buf1 = inputs[1]->data<uint16_t, cl::Image2D>();
      cl_int status = kernel.setArg(arg_idx, *x_buf0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *x_buf1);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status =
          kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, flag);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, width);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_context()->GetCommandQueue().finish();
    } else {
      // Generic path: one "concat_mul" launch per input; `start` is the
      // running write offset along the concat axis.
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
        arg_idx = 0;
        auto* x_buf = inputs[i]->data<uint16_t, cl::Image2D>();
        cl_int status = kernel.setArg(arg_idx, *x_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, axis_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, start);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, flag);
        CL_CHECK_FATAL(status);
        // Fix: removed a duplicated CL_CHECK_FATAL(status) after this setArg.
        status = kernel.setArg(++arg_idx, width);
        CL_CHECK_FATAL(status);
        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            global_work_size,
            cl::NullRange,
            nullptr,
            event_.get());
        CL_CHECK_FATAL(status);
        context.cl_context()->GetCommandQueue().finish();
        start += inputs[i]->dims()[axis_];
      }
    }
  }

  std::string doc() { return "Concat using cl::Image, kFP16"; }

  // Concat geometry cached by PrepareForRun.
  int axis_size_ = 1;
  int post_size_ = 1;
  int pre_size_ = 1;
  int axis_ = 1;
  param_t* concat_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
typedef paddle::lite::kernels::opencl::ConcatComputeImage Concat_image;
// fp16 image2d concat kernel (registration alias "ImageDefault").
// NOTE(review): AxisTensor is bound as kInt32/kImageDefault but PrepareForRun
// never reads it — confirm the binding (and its layout) is correct.
REGISTER_LITE_KERNEL(
    concat, kOpenCL, kFP16, kImageDefault, Concat_image, ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -73,106 +76,10 @@ void concat_mul_compute_ref(std::vector<const dtype *> ins_data, ...@@ -73,106 +76,10 @@ void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
} }
} }
} }
#if 0 // concat_buffer
// Disabled buffer-path concat test: fills three 1x2x3x4 inputs with random
// data, runs the kFloat/kNCHW OpenCL concat kernel along axis 1 and compares
// the result against the CPU reference (concat_mul_compute_ref).
TEST(opencl_concat_buffer, compute) {
  // prepare data
  const DDim x0_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x1_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x2_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{1, 6, 3, 4});
  lite::Tensor x0, x1, x2, out, out_ref;
  x0.Resize(x0_dim);
  x1.Resize(x1_dim);
  x2.Resize(x2_dim);
  out.Resize(out_dim);
  out_ref.Resize(out_dim);
  auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  // Map device buffers so the host can write random inputs directly.
  auto *mapped_x0 = static_cast<float *>(
      TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
  auto *mapped_x1 = static_cast<float *>(
      TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
  auto *mapped_x2 = static_cast<float *>(
      TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
  for (int i = 0; i < x0_dim.production(); i++) {
    mapped_x0[i] = dist(engine);
  }
  for (int i = 0; i < x1_dim.production(); i++) {
    mapped_x1[i] = dist(engine);
  }
  for (int i = 0; i < x2_dim.production(); i++) {
    mapped_x2[i] = dist(engine);
  }
  // set param and kernel, then run
  operators::ConcatParam param;
  std::vector<lite::Tensor *> ins;
  ins.push_back(&x0);
  ins.push_back(&x1);
  ins.push_back(&x2);
  auto axis = 1;
  param.x = ins;
  param.output = &out;
  param.axis = axis;
  std::vector<const float *> ins_data;
  // NOTE(review): std::vector<const DDim> (const element type) is ill-formed;
  // if this test is re-enabled it should be std::vector<DDim>.
  std::vector<const DDim> ins_dim;
  ins_data.push_back(mapped_x0);
  ins_data.push_back(mapped_x1);
  ins_data.push_back(mapped_x2);
  ins_dim.push_back(x0_dim);
  ins_dim.push_back(x1_dim);
  ins_dim.push_back(x2_dim);
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  auto kernels = KernelRegistry::Global().Create(
      "concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> concat_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(concat_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(concat_context));
  kernel->Launch();
  // Synchronize on the event registered in the wait list by the kernel.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.output->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }
  // run compute ref and check
  auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
  concat_mul_compute_ref<float>(ins_data, ins_dim, axis, out_dim, out_ref_data);
  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
  for (int i = 0; i < out_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x0_data, mapped_x0);
  TargetWrapperCL::Unmap(x1_data, mapped_x1);
  TargetWrapperCL::Unmap(x2_data, mapped_x2);
}
#endif // concat_buffer
// #define LOOP_TEST // #define LOOP_TEST
// #define PRINT_RESULT // #define PRINT_RESULT
TEST(concat_image2d_fp32, compute) { TEST(concat_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> "
"layout(img2buf) " "layout(img2buf) "
"-> host"; "-> host";
...@@ -209,7 +116,7 @@ TEST(concat_image2d_fp32, compute) { ...@@ -209,7 +116,7 @@ TEST(concat_image2d_fp32, compute) {
auto concat_img_kernels = auto concat_img_kernels =
KernelRegistry::Global().Create("concat", KernelRegistry::Global().Create("concat",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels1.empty()); ASSERT_FALSE(buf_to_img_kernels1.empty());
...@@ -284,14 +191,18 @@ TEST(concat_image2d_fp32, compute) { ...@@ -284,14 +191,18 @@ TEST(concat_image2d_fp32, compute) {
for (int i = 0; i < out_dim.production(); ++i) { for (int i = 0; i < out_dim.production(); ++i) {
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *concat_in_data0 = concat_in0.mutable_data<float, cl::Image2D>( auto *concat_in_data0 =
concat_image2d_shape_in0["width"], concat_in0.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape_in0["height"]); concat_image2d_shape_in0["width"],
auto *concat_in_data1 = concat_in1.mutable_data<float, cl::Image2D>( concat_image2d_shape_in0["height"]);
concat_image2d_shape_in1["width"], auto *concat_in_data1 =
concat_image2d_shape_in1["height"]); concat_in1.mutable_data<uint16_t, cl::Image2D>(
auto *concat_out_data = concat_out.mutable_data<float, cl::Image2D>( concat_image2d_shape_in1["width"],
concat_image2d_shape["width"], concat_image2d_shape["height"]); concat_image2d_shape_in1["height"]);
auto *concat_out_data =
concat_out.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape["width"],
concat_image2d_shape["height"]);
// set context and kernel args // set context and kernel args
LOG(INFO) << "set context and kernel args"; LOG(INFO) << "set context and kernel args";
...@@ -347,22 +258,35 @@ TEST(concat_image2d_fp32, compute) { ...@@ -347,22 +258,35 @@ TEST(concat_image2d_fp32, compute) {
#ifdef PRINT_RESULT #ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < out_dim.production(); ++eidx) { for (int eidx = 0; eidx < out_dim.production(); ++eidx) {
std::cout << mapped_x0[eidx] << ", " << mapped_x1[eidx] << " -> " std::cout << "x0[" << eidx << "]:" << mapped_x0[eidx] << ",\t x1["
<< mapped_y[eidx] << std::endl; << eidx << "]:" << mapped_x1[eidx] << " -> y[" << eidx
<< "]:" << mapped_y[eidx] << "\t, y_ref[" << eidx
<< "]:" << y_data_ref[eidx] << ",\t IS_DIFF_PASSED:"
<< IS_DIFF_PASSED(
y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF)
<< std::endl;
} }
#endif // PRINT_RESULT #endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < out_dim.production(); eidx++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6); auto abs_diff = abs(y_data_ref[i] - mapped_y[i]);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) { auto relative_diff =
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]);
<< " / " << x0_dim.production() << ", y_data_ref[" EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y[" (abs_diff <= FP16_MAX_DIFF),
<< eidx << "]:" << mapped_y[eidx]; true);
if ((relative_diff > FP16_MAX_DIFF) &&
(abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " mapped_y[" << i
<< "]:" << mapped_y[i] << " y_data_ref[" << i
<< "]:" << y_data_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
break; break;
} }
} }
// free // free
LOG(INFO) << "free: unmap x, y"; LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data0, mapped_x0); TargetWrapperCL::Unmap(x_data0, mapped_x0);
...@@ -382,9 +306,9 @@ TEST(concat_image2d_fp32, compute) { ...@@ -382,9 +306,9 @@ TEST(concat_image2d_fp32, compute) {
} // namespace paddle } // namespace paddle
// concat buffer // concat buffer
// USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def); // USE_LITE_KERNEL(concat, kOpenCL, kFP16, kNCHW, def);
// concat image2d fp32 // concat image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(concat, kOpenCL, kFloat, kImageDefault, ImageDefault); USE_LITE_KERNEL(concat, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/opencl/conv_compute.h" #include "lite/kernels/opencl/conv_buffer_compute.h"
#include <sstream> #include <sstream>
...@@ -1431,50 +1431,14 @@ void ConvImageCompute::Run() { (this->*impl_)(); } ...@@ -1431,50 +1431,14 @@ void ConvImageCompute::Run() { (this->*impl_)(); }
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(conv2d,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::ConvCompute,
// def)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(conv2d, REGISTER_LITE_KERNEL(conv2d,
kOpenCL, kOpenCL,
kFloat, kFloat,
kImageDefault, kNCHW,
paddle::lite::kernels::opencl::ConvImageCompute, paddle::lite::kernels::opencl::ConvCompute,
image2d) def)
.BindInput("Input", .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
{LiteType::GetTensorTy(TARGET(kOpenCL), .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
PRECISION(kFloat), .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
DATALAYOUT(kImageDefault))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ConvImageCompute,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -58,34 +58,6 @@ class ConvCompute ...@@ -58,34 +58,6 @@ class ConvCompute
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override;
void Run() override;
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3();
void DepthwiseConv2d();
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
};
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -167,7 +167,6 @@ void PrintData(std::string name, ...@@ -167,7 +167,6 @@ void PrintData(std::string name,
} }
// buffer // buffer
#if 0
// #define PRINT_RESULT // #define PRINT_RESULT
#define LOOP_TEST #define LOOP_TEST
TEST(conv2d, compute_conv2d_1x1) { TEST(conv2d, compute_conv2d_1x1) {
...@@ -625,9 +624,8 @@ TEST(conv2d, compute_conv2d_gemm) { ...@@ -625,9 +624,8 @@ TEST(conv2d, compute_conv2d_gemm) {
} // batch_size } // batch_size
#endif #endif
} }
#endif
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def); USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def);
此差异已折叠。
...@@ -11,41 +11,50 @@ ...@@ -11,41 +11,50 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
template <PrecisionType Ptype, DataLayoutType layout> class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
class ConcatCompute : public KernelLite<TARGET(kOpenCL), Ptype, layout> { PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ConcatParam; using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override; void PrepareForRun() override;
void Run() override; void Run() override;
std::string doc(); // override; private:
void Conv2d1x1();
// protected: void Conv2d3x3();
// void UpdateParams(); void Conv2d5x5();
void Conv2d7x7();
int axis_size_ = 1; void DepthwiseConv2d3x3s1();
int post_size_ = 1; void DepthwiseConv2d3x3();
int pre_size_ = 1; void DepthwiseConv2d();
int axis_ = 1;
param_t* concat_param_{nullptr}; kernel_t impl_;
std::string kernel_func_name_{}; std::vector<std::string> kernel_func_names_{};
std::string build_options_{"-DCL_DTYPE_float"}; std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
}; };
} // namespace opencl } // namespace opencl
......
...@@ -105,6 +105,7 @@ int ConvOutputSize(int input_size, ...@@ -105,6 +105,7 @@ int ConvOutputSize(int input_size,
return output_size; return output_size;
} }
// #define LOOP_TEST
TEST(depthwise_conv2d_basic, compute) { TEST(depthwise_conv2d_basic, compute) {
// conv infos // conv infos
// const int ksize = 1; // const int ksize = 1;
...@@ -144,7 +145,7 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -144,7 +145,7 @@ TEST(depthwise_conv2d_basic, compute) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("depthwise_conv2d", KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -252,14 +253,14 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -252,14 +253,14 @@ TEST(depthwise_conv2d_basic, compute) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
VLOG(4) << "set mapped input ..."; VLOG(4) << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * input_image_height * std::vector<uint16_t> x_image_v(input_image_width *
4); // 4 : RGBA input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(bias_image_width * bias_image_height * std::vector<uint16_t> bias_image_v(
4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * out_image_height * std::vector<uint16_t> out_image_v(out_image_width * out_image_height *
4); // 4 : RGBA 4); // 4 : RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim); input_v.data(), x_image_v.data(), input_dim);
...@@ -269,9 +270,9 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -269,9 +270,9 @@ TEST(depthwise_conv2d_basic, compute) {
nw_convertor.NCHWToImage( nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim); filter_v.data(), filter_image_v.data(), filter_dim);
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
auto* filter_image2d = filter.mutable_data<float, cl::Image2D>( auto* filter_image2d = filter.mutable_data<uint16_t, cl::Image2D>(
filter_image_width, filter_image_height, filter_image_v.data()); filter_image_width, filter_image_height, filter_image_v.data());
if (bias_flag) { if (bias_flag) {
...@@ -284,7 +285,7 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -284,7 +285,7 @@ TEST(depthwise_conv2d_basic, compute) {
CLImageConverterFolder folder_convertor; CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage( folder_convertor.NCHWToImage(
bias_v.data(), bias_image_v.data(), bias_dim); bias_v.data(), bias_image_v.data(), bias_dim);
auto* bias_data = bias.mutable_data<float, cl::Image2D>( auto* bias_data = bias.mutable_data<uint16_t, cl::Image2D>(
bias_image_width, bias_image_height, bias_image_v.data()); bias_image_width, bias_image_height, bias_image_v.data());
} }
...@@ -300,11 +301,11 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -300,11 +301,11 @@ TEST(depthwise_conv2d_basic, compute) {
VLOG(4) << "kernel launch ..."; VLOG(4) << "kernel launch ...";
kernel->Launch(); kernel->Launch();
VLOG(4) << "mutable output ..."; VLOG(4) << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -318,7 +319,7 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -318,7 +319,7 @@ TEST(depthwise_conv2d_basic, compute) {
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -387,7 +388,7 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -387,7 +388,7 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create("depthwise_conv2d", auto kernels = KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -433,11 +434,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -433,11 +434,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
default_converter->InitImageDimInfoWith(input.dims()); default_converter->InitImageDimInfoWith(input.dims());
LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " " LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " "
<< input_image_shape[1]; << input_image_shape[1];
std::vector<float> input_image_data(input_image_shape.production() * std::vector<uint16_t> input_image_data(input_image_shape.production() *
4); // 4 : RGBA 4); // 4 : RGBA
default_converter->NCHWToImage( default_converter->NCHWToImage(
input_v.data(), input_image_data.data(), input.dims()); input_v.data(), input_image_data.data(), input.dims());
auto* input_image = input.mutable_data<int16_t, cl::Image2D>( auto* input_image = input.mutable_data<uint16_t, cl::Image2D>(
input_image_shape[0], input_image_shape[1], input_image_data.data()); input_image_shape[0], input_image_shape[1], input_image_data.data());
LOG(INFO) << "prepare kernel"; LOG(INFO) << "prepare kernel";
...@@ -446,11 +447,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -446,11 +447,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims()); DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims());
LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " " LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " "
<< filter_image_shape[1]; << filter_image_shape[1];
std::vector<float> filter_image_data(filter_image_shape.production() * std::vector<uint16_t> filter_image_data(filter_image_shape.production() *
4); // 4 : RGBA 4); // 4 : RGBA
nw_converter->NCHWToImage( nw_converter->NCHWToImage(
filter_v.data(), filter_image_data.data(), filter.dims()); filter_v.data(), filter_image_data.data(), filter.dims());
auto* filter_image = filter.mutable_data<int16_t, cl::Image2D>( auto* filter_image = filter.mutable_data<uint16_t, cl::Image2D>(
filter_image_shape[0], filter_image_shape[1], filter_image_data.data()); filter_image_shape[0], filter_image_shape[1], filter_image_data.data());
LOG(INFO) << "launch"; LOG(INFO) << "launch";
...@@ -459,13 +460,13 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -459,13 +460,13 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
default_converter->InitImageDimInfoWith(output.dims()); default_converter->InitImageDimInfoWith(output.dims());
LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " " LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " "
<< output_image_shape[1]; << output_image_shape[1];
auto* output_image = output.mutable_data<int16_t, cl::Image2D>( auto* output_image = output.mutable_data<uint16_t, cl::Image2D>(
output_image_shape[0], output_image_shape[1]); output_image_shape[0], output_image_shape[1]);
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<int16_t, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
...@@ -490,7 +491,8 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -490,7 +491,8 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
float* output_image_data = new float[output_image_shape.production() * 4]; uint16_t* output_image_data =
new uint16_t[output_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(output_image_data, TargetWrapperCL::ImgcpySync(output_image_data,
output_image, output_image,
output_image_shape[0], output_image_shape[0],
...@@ -512,4 +514,4 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -512,4 +514,4 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFP16, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/elementwise_add_buffer_compute.h"
#include <memory>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// One-time setup before the first Run(): cache the operator parameters,
// derive the broadcast geometry, and compile the buffer-based
// elementwise_add OpenCL kernel into the context's program cache.
void ElementwiseAddCompute::PrepareForRun() {
  ele_param_ = param_.get_mutable<param_t>();
  UpdateParams();  // fills batch_/channels_/num_ from X/Y dims
  auto& ocl_ctx = ctx_->As<OpenCLContext>();
  ocl_ctx.cl_context()->AddKernel(
      kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
}
void ElementwiseAddCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x_buf = ele_param_->X->template data<float, cl::Buffer>();
auto* y_buf = ele_param_->Y->template data<float, cl::Buffer>();
auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(ele_param_->X->target());
VLOG(4) << TargetToStr(ele_param_->Y->target());
VLOG(4) << TargetToStr(ele_param_->Out->target());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)batch_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)channels_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)num_);
CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{channels_, batch_};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
}
void ElementwiseAddCompute::UpdateParams() {
auto axis = ele_param_->axis;
const auto& x_dims = ele_param_->X->dims();
const auto& y_dims = ele_param_->Y->dims();
const auto& out_dims = ele_param_->Out->dims();
if (axis < 0) {
axis = static_cast<int>(x_dims.size() - y_dims.size());
}
for (int i = 0; i < axis; ++i) {
batch_ *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels_ *= y_dims[i];
}
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// Registers the buffer-based elementwise_add kernel for the OpenCL
// target (kFloat precision, NCHW layout). All three tensors must
// already live on the kOpenCL target as cl::Buffer data.
REGISTER_LITE_KERNEL(
    elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
...@@ -49,28 +49,6 @@ class ElementwiseAddCompute ...@@ -49,28 +49,6 @@ class ElementwiseAddCompute
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ElementwiseParam;
void PrepareForRun() override;
void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFloat";
}
protected:
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{" -DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/opencl/elementwise_add_compute.h" #include "lite/kernels/opencl/elementwise_add_image_compute.h"
#include <memory> #include <memory>
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
...@@ -23,80 +23,6 @@ namespace lite { ...@@ -23,80 +23,6 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
/* Buffer */
#if 0
void ElementwiseAddCompute::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
ele_param_ = param_.get_mutable<param_t>();
UpdateParams();
}
void ElementwiseAddCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x_buf = ele_param_->X->template data<float, cl::Buffer>();
auto* y_buf = ele_param_->Y->template data<float, cl::Buffer>();
auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(ele_param_->X->target());
VLOG(4) << TargetToStr(ele_param_->Y->target());
VLOG(4) << TargetToStr(ele_param_->Out->target());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)batch_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)channels_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)num_);
CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{channels_, batch_};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
}
void ElementwiseAddCompute::UpdateParams() {
auto axis = ele_param_->axis;
const auto& x_dims = ele_param_->X->dims();
const auto& y_dims = ele_param_->Y->dims();
const auto& out_dims = ele_param_->Out->dims();
if (axis < 0) {
axis = static_cast<int>(x_dims.size() - y_dims.size());
}
for (int i = 0; i < axis; ++i) {
batch_ *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels_ *= y_dims[i];
}
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
}
#endif
/* Image2D */
void ElementwiseAddImageCompute::PrepareForRun() { void ElementwiseAddImageCompute::PrepareForRun() {
ele_param_ = param_.get_mutable<param_t>(); ele_param_ = param_.get_mutable<param_t>();
auto* x = ele_param_->X; auto* x = ele_param_->X;
...@@ -152,10 +78,10 @@ void ElementwiseAddImageCompute::Run() { ...@@ -152,10 +78,10 @@ void ElementwiseAddImageCompute::Run() {
default_convertor.InitImageDimInfoWith(out->dims()); // w, h default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims()); auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<float, cl::Image2D>(); auto* x_img = x->data<uint16_t, cl::Image2D>();
auto* y_img = y->data<float, cl::Image2D>(); auto* y_img = y->data<uint16_t, cl::Image2D>();
auto* out_img = auto* out_img = out->mutable_data<uint16_t, cl::Image2D>(out_img_shape[0],
out->mutable_data<float, cl::Image2D>(out_img_shape[0], out_img_shape[1]); out_img_shape[1]);
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
...@@ -220,14 +146,7 @@ void ElementwiseAddImageCompute::Run() { ...@@ -220,14 +146,7 @@ void ElementwiseAddImageCompute::Run() {
namespace ocl = paddle::lite::kernels::opencl; namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL( // TODO(ysh329): May need fix.
// elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
// TODO(ysh329): Not fix.
// "Y" may from constant value like conv bias (kARM, need do cl_image_converter // "Y" may from constant value like conv bias (kARM, need do cl_image_converter
// on CPU); // on CPU);
// may from anther branch like "X" (kOpenCL, nothing to do). // may from anther branch like "X" (kOpenCL, nothing to do).
...@@ -235,20 +154,20 @@ namespace ocl = paddle::lite::kernels::opencl; ...@@ -235,20 +154,20 @@ namespace ocl = paddle::lite::kernels::opencl;
// set target of "Y" as kOpenCL temporarily. // set target of "Y" as kOpenCL temporarily.
REGISTER_LITE_KERNEL(elementwise_add, REGISTER_LITE_KERNEL(elementwise_add,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
ocl::ElementwiseAddImageCompute, ocl::ElementwiseAddImageCompute,
def) def)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindInput("Y", .BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -25,25 +24,25 @@ namespace lite { ...@@ -25,25 +24,25 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class ElementwiseMulFloatImageCompute class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL), : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ElementwiseParam; using param_t = operators::ElementwiseParam;
std::string doc() const override {
return "ElementwiseMul using cl::Image2D(ImageDefault/RGBA), kFP32";
}
void PrepareForRun() override; void PrepareForRun() override;
void Run() override; void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFP16";
}
protected: protected:
param_t* ele_param_{nullptr}; param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_mul"}; std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
此差异已折叠。
...@@ -111,7 +111,7 @@ void elementwise_compute_ref(const dtype *x_data, ...@@ -111,7 +111,7 @@ void elementwise_compute_ref(const dtype *x_data,
} }
// #define PRINT_RESULT // #define PRINT_RESULT
TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { TEST(elementwise_mul_image, compute) {
LOG(INFO) LOG(INFO)
<< "main steps of test: host -> layout(buf2img on cpu) -> elemul(img) -> " << "main steps of test: host -> layout(buf2img on cpu) -> elemul(img) -> "
"layout(img2buf on cpu) " "layout(img2buf on cpu) "
...@@ -151,9 +151,10 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -151,9 +151,10 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h
auto x_img_w = x_img_shape[0]; auto x_img_w = x_img_shape[0];
auto x_img_h = x_img_shape[1]; auto x_img_h = x_img_shape[1];
std::vector<float> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA std::vector<uint16_t> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA
default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim); default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
elemul_x.mutable_data<float, cl::Image2D>(x_img_w, x_img_h, x_img_v.data()); elemul_x.mutable_data<uint16_t, cl::Image2D>(
x_img_w, x_img_h, x_img_v.data());
// y // y
std::vector<float> y_v(y_dim.production()); std::vector<float> y_v(y_dim.production());
...@@ -161,19 +162,21 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -161,19 +162,21 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h
auto y_img_w = y_img_shape[0]; auto y_img_w = y_img_shape[0];
auto y_img_h = y_img_shape[1]; auto y_img_h = y_img_shape[1];
std::vector<float> y_img_v(y_img_shape[0] * y_img_shape[1] * 4); // 4: RGBA std::vector<uint16_t> y_img_v(y_img_shape[0] * y_img_shape[1] *
4); // 4: RGBA
default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim); default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
elemul_y.mutable_data<float, cl::Image2D>(y_img_w, y_img_h, y_img_v.data()); elemul_y.mutable_data<uint16_t, cl::Image2D>(
y_img_w, y_img_h, y_img_v.data());
// out // out
auto out_img_shape = auto out_img_shape =
default_convertor.InitImageDimInfoWith(out_dim); // w, h default_convertor.InitImageDimInfoWith(out_dim); // w, h
auto out_img_w = out_img_shape[0]; auto out_img_w = out_img_shape[0];
auto out_img_h = out_img_shape[1]; auto out_img_h = out_img_shape[1];
elemul_out.mutable_data<float, cl::Image2D>(out_img_w, out_img_h); elemul_out.mutable_data<uint16_t, cl::Image2D>(out_img_w, out_img_h);
std::vector<float> out_img_v(out_img_w * out_img_h * 4); std::vector<uint16_t> out_img_v(out_img_w * out_img_h * 4);
fill_data<float>( fill_data<uint16_t>(
out_img_v.data(), out_img_v.size(), 0); // fill with zero value out_img_v.data(), out_img_v.size(), 0); // fill with zero value
std::vector<float> out_v(out_dim.production()); std::vector<float> out_v(out_dim.production());
...@@ -189,7 +192,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -189,7 +192,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto elemul_img_kernels = auto elemul_img_kernels =
KernelRegistry::Global().Create("elementwise_mul", KernelRegistry::Global().Create("elementwise_mul",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(elemul_img_kernels.empty()); ASSERT_FALSE(elemul_img_kernels.empty());
...@@ -215,7 +218,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -215,7 +218,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(out_img_v.data(), TargetWrapperCL::ImgcpySync(out_img_v.data(),
elemul_out.data<float, cl::Image2D>(), elemul_out.data<uint16_t, cl::Image2D>(),
out_img_w, out_img_w,
out_img_h, out_img_h,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -266,4 +269,4 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -266,4 +269,4 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFloat, kImageDefault, def); USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFP16, kImageDefault, def);
...@@ -102,7 +102,7 @@ class MulCompute ...@@ -102,7 +102,7 @@ class MulCompute
private: private:
int m_, n_, k_; int m_, n_, k_;
std::string kernel_func_name_{"mat_mul"}; std::string kernel_func_name_{"mat_mul"};
std::string build_options_{"-DCL_DTYPE=float"}; std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册