提交 b629d3a5 作者: dolphin8

Merge branch 'opencl' of https://github.com/PaddlePaddle/paddle-mobile into opencl

...@@ -487,13 +487,13 @@ static const uint8_t shifttable[512] = { ...@@ -487,13 +487,13 @@ static const uint8_t shifttable[512] = {
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};
half_t float2half(float f) { half_t Float2Half(float f) {
uint32_t v = *reinterpret_cast<uint32_t *>(&f); uint32_t v = *reinterpret_cast<uint32_t *>(&f);
return basetable[(v >> 23) & 0x1ff] + return basetable[(v >> 23) & 0x1ff] +
((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]);
} }
float half2float(half_t h) { float Half2Float(half_t h) {
uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] +
exponenttable[h >> 10]; exponenttable[h >> 10];
return *reinterpret_cast<float *>(&v); return *reinterpret_cast<float *>(&v);
...@@ -501,12 +501,12 @@ float half2float(half_t h) { ...@@ -501,12 +501,12 @@ float half2float(half_t h) {
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) {
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
h_array[i] = float2half(f_array[i]); h_array[i] = Float2Half(f_array[i]);
} }
} }
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
f_array[i] = float2half(h_array[i]); f_array[i] = Half2Float(h_array[i]);
} }
} }
...@@ -17,9 +17,9 @@ limitations under the License. */ ...@@ -17,9 +17,9 @@ limitations under the License. */
typedef uint16_t half_t; typedef uint16_t half_t;
half_t float2half(float f); half_t Float2Half(float f);
float half2float(half_t h); float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
......
...@@ -52,7 +52,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor, ...@@ -52,7 +52,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
for (int h = 0; h < H; h++) { for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) { for (int w = 0; w < W; w++) {
*p = half2float(imageData[i2]); *p = Half2Float(imageData[i2]);
i2 += 4; i2 += 4;
p++; p++;
} }
...@@ -106,7 +106,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, ...@@ -106,7 +106,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
for (int h = 0; h < H; h++) { for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) { for (int w = 0; w < W; w++) {
imageData[i2] = float2half(*p); imageData[i2] = Float2Half(*p);
i2 += 4; i2 += 4;
p++; p++;
} }
......
...@@ -177,7 +177,7 @@ class CLImage { ...@@ -177,7 +177,7 @@ class CLImage {
} }
assert(i2 < width * height * 4); assert(i2 < width * height * 4);
imageData[i2] = float2half(*p); imageData[i2] = Float2Half(*p);
i2 += 4; i2 += 4;
p++; p++;
// count++; // count++;
...@@ -219,7 +219,6 @@ class CLImage { ...@@ -219,7 +219,6 @@ class CLImage {
&err); &err);
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
// TODO(HaiPeng): error handling
CL_CHECK_ERRORS(err); CL_CHECK_ERRORS(err);
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
} }
......
...@@ -29,23 +29,39 @@ template <> ...@@ -29,23 +29,39 @@ template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) { void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0); auto kernel = this->cl_helper_.KernelAt(0);
cl_int status; cl_int status;
DLOG << " feed 0";
auto output = param.Out(); auto output = param.Out();
DLOG << " feed 1";
const Tensor *input = param.InputX(); const Tensor *input = param.InputX();
DLOG << " feed 2";
const float *input_data = nullptr; const float *input_data = nullptr;
DLOG << " feed 3";
input_data = input->data<float>(); input_data = input->data<float>();
DLOG << " feed 4";
cl_mem cl_image = output->GetCLImage(); cl_mem cl_image = output->GetCLImage();
DLOG << " feed 5";
int height = output->dims()[2]; int height = output->dims()[2];
int width = output->dims()[3]; int width = output->dims()[3];
DLOG << output->dims(); DLOG << output->dims();
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_image); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &width); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &height); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &height);
CL_CHECK_ERRORS(status);
size_t global_work_size[2] = {height, width}; size_t global_work_size[2] = {height, width};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
global_work_size, NULL, 0, NULL, NULL); NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} }
template class FeedKernel<GPU_CL, float>; template class FeedKernel<GPU_CL, float>;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请注册