提交 7fb2e727 编写于 作者: Y yangfei

imp some function

上级 af0be6e0
...@@ -36,13 +36,13 @@ class CLTensor : TensorBase { ...@@ -36,13 +36,13 @@ class CLTensor : TensorBase {
return *this; return *this;
} }
// template<typename T> template <typename T>
// inline T *mutable_with_data(void *data) { inline T *mutable_with_data(void *data) {
// int64_t size = numel() * sizeof(float); int64_t size = numel() * sizeof(float);
// holder_.reset(new PlaceholderImpl(size, data, typeid(T))); holder_.reset(new PlaceholderImpl(size, data, typeid(T), context_));
// return reinterpret_cast<T *>(reinterpret_cast<void *>( return reinterpret_cast<T *>(
// reinterpret_cast<uintptr_t>(holder_->ptr()))); reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(holder_->ptr())));
// } }
inline void *mutable_data(std::type_index type) { inline void *mutable_data(std::type_index type) {
if (holder_ != nullptr) { if (holder_ != nullptr) {
...@@ -51,7 +51,7 @@ class CLTensor : TensorBase { ...@@ -51,7 +51,7 @@ class CLTensor : TensorBase {
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
int64_t size = numel() * SizeOfType(type); int64_t size = numel() * SizeOfType(type);
if (holder_ == nullptr || holder_->size() < size + offset_) { if (holder_ == nullptr || holder_->size() < size + offset_) {
holder_.reset(new PlaceholderImpl(size, type)); holder_.reset(new PlaceholderImpl(size, type, context_));
offset_ = 0; offset_ = 0;
} }
return reinterpret_cast<void *>( return reinterpret_cast<void *>(
......
__kernel void feed(__global float* in, __write_only image2d_t outputImage,int h,int w) #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w)
{ {
int j = get_global_id(0); int j = get_global_id(0);
int i = get_global_id(1); int i = get_global_id(1);
float4 pixel; half4 pixel;
pixel.x = in[(i * w + j)]; pixel.x = convert_half(in[(i * w + j)]);
pixel.y = in[h * w + (i * w + j)]; pixel.y = convert_half(in[h * w + (i * w + j)]);
pixel.z = in[2 * h * w + (i * w + j)]; pixel.z = convert_half(in[2 * h * w + (i * w + j)]);
pixel.w = 0; pixel.w = 0.0;
int2 coords; int2 coords;
coords.x = j; coords.x = j;
coords.y = i; coords.y = i;
write_imagef(outputImage,coords,pixel); write_imageh(outputImage,coords,pixel);
} }
...@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "operators/kernel/feed_kernel.h" #include "operators/kernel/feed_kernel.h"
#include "common/log.h" #include "framework/cl/cl_tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -29,39 +28,46 @@ template <> ...@@ -29,39 +28,46 @@ template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) { void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0); auto kernel = this->cl_helper_.KernelAt(0);
cl_int status; cl_int status;
DLOG << " feed 0";
auto output = param.Out(); auto output = param.Out();
DLOG << " feed 1";
const Tensor *input = param.InputX(); const Tensor *input = param.InputX();
DLOG << " feed 2"; const float *input_data = input->data<float>();
const float *input_data = nullptr; int numel = input->numel();
DLOG << " feed 3"; DLOG << "numel = " << numel;
input_data = input->data<float>();
DLOG << " feed 4";
cl_mem cl_image = output->GetCLImage(); cl_mem cl_image = output->GetCLImage();
DLOG << " feed 5";
int height = output->dims()[2]; int height = output->dims()[2];
int width = output->dims()[3]; int width = output->dims()[3];
DLOG << output->dims(); DLOG << output->dims();
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data); CLTensor input_cl_tensor(this->cl_helper_.CLContext());
CL_CHECK_ERRORS(status); input_cl_tensor.Resize(input->dims());
cl_mem *inputBuffer =
input_cl_tensor.mutable_with_data<cl_mem>((void *)input_data);
DLOG << "yangfei";
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_image); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&cl_image);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &width);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&width);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &height); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&height);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
size_t global_work_size[2] = {height, width}; size_t global_work_size[2] = {height, width};
DLOG << "yangfei";
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL); NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
int len = 4 * 224 * 224;
half *out = new half[len];
DLOG << "yangfei";
cl_command_queue commandQueue = this->cl_helper_.CLCommandQueue();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {height, width, 1};
clEnqueueReadImage(commandQueue, cl_image, CL_TRUE, origin, region, 0, 0, out,
0, NULL, NULL);
DLOG << "yangfei";
for (int i = 0; i < 100; i++) DLOG << out[i];
} }
template class FeedKernel<GPU_CL, float>; template class FeedKernel<GPU_CL, float>;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册