Commit 5945175f authored by Jiaying Zhao, committed by xiebaiyuan

fix memcpy size in opencl fetch kernel (#1630)

Parent b05774ca
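The essence of the change, as a minimal sketch with illustrative names (HostTensor and CopyFromClBuffer are not paddle-mobile APIs): the destination tensor's memory_size() reflects the size of its backing allocation, which can be larger than the bytes that actually hold valid elements, so the host-side copy now uses sizeof(float) * numel() instead.

```cpp
// Hypothetical sketch of the memcpy-size fix; names are illustrative only.
#include <cstddef>
#include <cstring>
#include <vector>

struct HostTensor {
  std::vector<float> storage;  // backing allocation, may be padded
  size_t numel = 0;            // number of valid elements
  size_t memory_size() const { return storage.size() * sizeof(float); }
  float *data() { return storage.data(); }
};

void CopyFromClBuffer(HostTensor *out, const float *mapped_cl_buffer) {
  // Before: memcpy(out->data(), mapped_cl_buffer, out->memory_size());
  //         which can copy more bytes than the source buffer holds.
  // After:  copy exactly the bytes that hold valid elements.
  std::memcpy(out->data(), mapped_cl_buffer, sizeof(float) * out->numel);
}
```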
......@@ -26,7 +26,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
class CLTensor : TensorBase {
class CLTensor : public TensorBase {
public:
CLTensor(cl_context context, cl_command_queue command_queue)
: context_(context), command_queue_(command_queue) {}
......
......@@ -31,8 +31,6 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
cl_int status;
param.Out()->InitEmptyImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue(), param.Out()->dims());
auto output = param.Out();
const Tensor *input = &param.InputX()->at(col);
// DLOG << *input;
......
......@@ -14,19 +14,13 @@ limitations under the License. */
#include "operators/kernel/fetch_kernel.h"
#include "framework/cl/cl_tensor.h"
// #include "common/common.h"
// #include <iostream>
namespace paddle_mobile {
namespace operators {
template <>
bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
// if (param->InputX()->dims().size() <= 2) {
// this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
// } else {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
// }
return true;
}
......@@ -40,24 +34,27 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
auto *out = &param.Out()->at(col);
out->Resize(param.InputX()->dims());
out->mutable_data<float>();
const auto &dim = param.InputX()->dims();
DLOG << "fetch kernel out dims = " << out->dims();
DLOG << "fetch kernel out memory size = " << out->memory_size();
auto dim = param.InputX()->dims();
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < dim.size(); ++j) {
new_dims[4 - dim.size() + j] = dim[j];
}
size_t C, in_height, in_width;
size_t in_ch, in_height, in_width;
C = new_dims[1];
in_ch = new_dims[1];
in_height = new_dims[2];
// if (dim.size() <= 2) {
// in_width = param.InputX()->ImageWidth();
// } else {
in_width = new_dims[3];
// }
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * in_ch;
CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
framework::CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
out_cl_tensor.Resize(out->dims());
cl_mem outBuffer = out_cl_tensor.mutable_data<float>();
......@@ -66,35 +63,28 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
// if (dim.size() > 2) {
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
int out_c = new_dims[1];
clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
clSetKernelArg(kernel, 5, sizeof(int), &size_block);
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
clSetKernelArg(kernel, 7, sizeof(int), &out_c);
// }
clSetKernelArg(kernel, 7, sizeof(int), &in_ch);
// cl_event wait_event = param.InputX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
// auto time1 = paddle_mobile::time();
// printf(" before finish \n");
// clFlush(this->cl_helper_.CLCommandQueue());
clFinish(this->cl_helper_.CLCommandQueue());
// printf(" after finish \n");
// auto time2 = paddle_mobile::time();
//
//
// std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2)
// << "ms" << std::endl;
DLOG << "fetch kernel out dims = " << out->dims();
DLOG << "fetch kernel out memory size = " << out->memory_size();
memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
DLOG << "fetch kernel out_cl_tensor dims = " << out_cl_tensor.dims();
DLOG << "fetch kernel out_cl_tensor memery size = "
<< out_cl_tensor.memory_size();
memcpy(out->data<float>(), out_cl_tensor.Data<float>(),
sizeof(float) * out->numel());
}
template class FetchKernel<GPU_CL, float>;
......
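For reference, the new_dims loop in Compute above right-aligns a lower-rank shape into an NCHW quadruple, padding the leading dimensions with 1. A standalone sketch of that mapping (the function name is illustrative, not part of the project):

```cpp
#include <cstddef>
#include <vector>

// Right-align an up-to-4D shape into {N, C, H, W}, padding leading dims with 1,
// mirroring the new_dims loop in FetchKernel<GPU_CL, float>::Compute.
std::vector<size_t> PadToNCHW(const std::vector<size_t> &dim) {
  std::vector<size_t> new_dims = {1, 1, 1, 1};
  for (size_t j = 0; j < dim.size(); ++j) {
    new_dims[4 - dim.size() + j] = dim[j];
  }
  return new_dims;  // e.g. {3, 224, 224} -> {1, 3, 224, 224}
}
```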
......@@ -21,12 +21,14 @@ int main() {
paddle_mobile::PaddleMobileConfigInternal config;
config.load_when_predict = true;
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
// paddle_mobile.SetThreadNum(4);
auto time1 = paddle_mobile::time();
#ifdef PADDLE_MOBILE_CL
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
paddle_mobile.SetCLPath("/data/local/tmp/bin");
#else
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
#endif
// paddle_mobile.SetThreadNum(4);
auto isok = paddle_mobile.Load(std::string(g_super) + "/model",
std::string(g_super) + "/params", true, false,
......@@ -131,12 +133,12 @@ int main() {
auto time5 = paddle_mobile::time();
vec_result4 = paddle_mobile.Predict(input4, dims4);
auto time6 = paddle_mobile::time();
std::cout << "224*224 predict cost :第" << i << ": "
std::cout << "300*300 predict cost :第" << i << ": "
<< paddle_mobile::time_diff(time5, time6) << "ms" << std::endl;
}
auto time4 = paddle_mobile::time();
std::cout << "224*224 predict cost :"
std::cout << "300*300 predict cost :"
<< paddle_mobile::time_diff(time3, time4) / max << "ms"
<< std::endl;
// biggest = std::max_element(std::begin(vec_result4),
......