Commit 5945175f authored by Jiaying Zhao, committed by xiebaiyuan

fix memcpy size in opencl fetch kernel (#1630)

Parent b05774ca
@@ -26,7 +26,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace framework {
-class CLTensor : TensorBase {
+class CLTensor : public TensorBase {
  public:
  CLTensor(cl_context context, cl_command_queue command_queue)
      : context_(context), command_queue_(command_queue) {}
...
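The first hunk fixes the base-class access specifier: with the `class` keyword, C++ inheritance defaults to private, which would hide TensorBase's public interface from users of CLTensor and forbid conversions to the base type. A minimal standalone sketch of the difference (TensorBaseLike is a stand-in, not the real TensorBase):

// Why `: public TensorBase` matters: with `class`, inheritance defaults to
// private, hiding the base's public members and base-pointer conversions.
#include <iostream>

class TensorBaseLike {
 public:
  int memory_size() const { return 64; }
};

class PrivateTensor : TensorBaseLike {};        // same as : private TensorBaseLike
class PublicTensor : public TensorBaseLike {};

int main() {
  PublicTensor t;
  std::cout << t.memory_size() << "\n";  // OK: base's public API stays public
  // PrivateTensor p;
  // p.memory_size();         // error: 'memory_size' is inaccessible
  // TensorBaseLike *b = &p;  // error: conversion to inaccessible base
  return 0;
}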
@@ -31,8 +31,6 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   auto kernel = this->cl_helper_.KernelAt(0);
   auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
   cl_int status;
-  param.Out()->InitEmptyImage(cl_helper_.CLContext(),
-                              cl_helper_.CLCommandQueue(), param.Out()->dims());
   auto output = param.Out();
   const Tensor *input = &param.InputX()->at(col);
   // DLOG << *input;
...
@@ -14,19 +14,13 @@ limitations under the License. */
 #include "operators/kernel/fetch_kernel.h"
 #include "framework/cl/cl_tensor.h"
-// #include "common/common.h"
-// #include <iostream>
 namespace paddle_mobile {
 namespace operators {
 template <>
 bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
-  // if (param->InputX()->dims().size() <= 2) {
-  //   this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
-  // } else {
   this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
-  // }
   return true;
 }
@@ -40,25 +34,28 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
   auto *out = &param.Out()->at(col);
   out->Resize(param.InputX()->dims());
   out->mutable_data<float>();
-  const auto &dim = param.InputX()->dims();
+  DLOG << "fetch kernel out dims = " << out->dims();
+  DLOG << "fetch kernel out memory size = " << out->memory_size();
+  auto dim = param.InputX()->dims();
   size_t new_dims[] = {1, 1, 1, 1};
   for (int j = 0; j < dim.size(); ++j) {
     new_dims[4 - dim.size() + j] = dim[j];
   }
-  size_t C, in_height, in_width;
-  C = new_dims[1];
+  size_t in_ch, in_height, in_width;
+  in_ch = new_dims[1];
   in_height = new_dims[2];
-  // if (dim.size() <= 2) {
-  //   in_width = param.InputX()->ImageWidth();
-  // } else {
   in_width = new_dims[3];
-  // }
+  int size_ch = in_height * in_width;
+  int size_block = size_ch * 4;
+  int size_batch = size_ch * in_ch;
-  CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
-                         this->cl_helper_.CLCommandQueue());
+  framework::CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
+                                    this->cl_helper_.CLCommandQueue());
   out_cl_tensor.Resize(out->dims());
   cl_mem outBuffer = out_cl_tensor.mutable_data<float>();
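The `new_dims` loop above right-aligns the input shape into an NCHW quadruple, so tensors with fewer than four dimensions get leading 1s. A standalone illustration of that padding (using `std::vector<size_t>` as a stand-in for the framework's DDim type):

// Shapes with fewer than four dimensions are right-aligned into {N, C, H, W};
// missing leading dimensions default to 1.
#include <cstdio>
#include <vector>

int main() {
  std::vector<size_t> dim = {3, 224};  // e.g. a 2-D tensor
  size_t new_dims[] = {1, 1, 1, 1};
  for (size_t j = 0; j < dim.size(); ++j) {
    new_dims[4 - dim.size() + j] = dim[j];
  }
  // Prints: N=1 C=1 H=3 W=224
  std::printf("N=%zu C=%zu H=%zu W=%zu\n", new_dims[0], new_dims[1],
              new_dims[2], new_dims[3]);
  return 0;
}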
@@ -66,35 +63,28 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
   clSetKernelArg(kernel, 1, sizeof(int), &in_width);
   clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
   clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
-  // if (dim.size() > 2) {
-  int size_ch = in_height * in_width;
-  int size_block = size_ch * 4;
-  int size_batch = size_ch * C;
-  int out_c = new_dims[1];
   clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
   clSetKernelArg(kernel, 5, sizeof(int), &size_block);
   clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
-  clSetKernelArg(kernel, 7, sizeof(int), &out_c);
-  // }
+  clSetKernelArg(kernel, 7, sizeof(int), &in_ch);
   // cl_event wait_event = param.InpdutX()->GetClEvent();
   clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                          default_work_size.data(), NULL, 0, NULL, NULL);
-  // auto time1 = paddle_mobile::time();
   // printf(" before finish \n");
   // clFlsh(this->cl_helper_.CLCommandQueue());
   clFinish(this->cl_helper_.CLCommandQueue());
   // printf(" after finish \n");
-  // auto time2 = paddle_mobile::time();
-  //
-  //
-  // std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2)
-  // << "ms" << std::endl;
-  memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
+  DLOG << "fetch kernel out dims = " << out->dims();
+  DLOG << "fetch kernel out memory size = " << out->memory_size();
+  DLOG << "fetch kernel out_cl_tensor dims = " << out_cl_tensor.dims();
+  DLOG << "fetch kernel out_cl_tensor memery size = "
+       << out_cl_tensor.memory_size();
+  memcpy(out->data<float>(), out_cl_tensor.Data<float>(),
+         sizeof(float) * out->numel());
 }
 template class FetchKernel<GPU_CL, float>;
...
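The memcpy change above is the fix the commit title refers to: `out->memory_size()` is the byte size of `out`'s allocation, which can exceed the logical payload `sizeof(float) * out->numel()` (allocations may be padded or aligned), so using it as the copy length can read past the end of `out_cl_tensor`'s buffer. A minimal sketch of the two size notions, with hypothetical numbers and plain C++ stand-ins for the tensor types:

// Hypothetical illustration of the bug class fixed here: the destination's
// allocated byte size is not a safe memcpy length; the logical payload
// size, sizeof(float) * numel, is.
#include <cstring>
#include <vector>

int main() {
  const size_t numel = 6;               // logical element count (e.g. 2 x 3)
  std::vector<float> src(numel, 1.0f);  // staging buffer read back from GPU
  std::vector<float> dst(8);            // allocation padded to 8 floats
  const size_t allocated_bytes = dst.size() * sizeof(float);  // 32 bytes
  const size_t payload_bytes = numel * sizeof(float);         // 24 bytes

  // std::memcpy(dst.data(), src.data(), allocated_bytes);  // reads past src!
  std::memcpy(dst.data(), src.data(), payload_bytes);  // copies exactly numel
  return 0;
}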
@@ -21,12 +21,14 @@ int main() {
   paddle_mobile::PaddleMobileConfigInternal config;
   config.load_when_predict = true;
-  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
-  // paddle_mobile.SetThreadNum(4);
   auto time1 = paddle_mobile::time();
 #ifdef PADDLE_MOBILE_CL
+  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
   paddle_mobile.SetCLPath("/data/local/tmp/bin");
+#else
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
 #endif
+  // paddle_mobile.SetThreadNum(4);
   auto isok = paddle_mobile.Load(std::string(g_super) + "/model",
                                  std::string(g_super) + "/params", true, false,
@@ -131,12 +133,12 @@ int main() {
       auto time5 = paddle_mobile::time();
       vec_result4 = paddle_mobile.Predict(input4, dims4);
       auto time6 = paddle_mobile::time();
-      std::cout << "224*224 predict cost :第" << i << ": "
+      std::cout << "300*300 predict cost :第" << i << ": "
                 << paddle_mobile::time_diff(time5, time6) << "ms" << std::endl;
     }
     auto time4 = paddle_mobile::time();
-    std::cout << "224*224 predict cost :"
+    std::cout << "300*300 predict cost :"
               << paddle_mobile::time_diff(time3, time4) / max << "ms"
               << std::endl;
     // biggest = std::max_element(std::begin(vec_result4),
...