提交 462f4649 编写于 作者: Y Yanzhan Yang 提交者: zp7

add cl status check test=develop (#1956)

上级 31ee212a
......@@ -38,21 +38,31 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context,
auto input_image = cl_image->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(int), &in_height);
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_image);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &in_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &in_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
CL_CHECK_ERRORS(status);
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
clSetKernelArg(kernel, 5, sizeof(int), &size_block);
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
clSetKernelArg(kernel, 7, sizeof(int), &C);
status = clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(int), &size_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &C);
CL_CHECK_ERRORS(status);
size_t global_work_size[3] = {(new_dims[1] + 3) / 4, new_dims[3],
new_dims[0] * new_dims[2]};
clEnqueueNDRangeKernel(commandQueue, kernel, 3, NULL, global_work_size, NULL,
0, NULL, NULL);
status = clEnqueueNDRangeKernel(commandQueue, kernel, 3, NULL,
global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
memcpy(tensor->data<float>(), out_cl_tensor.Data<float>(),
tensor->memory_size());
}
......
......@@ -87,18 +87,20 @@ void BatchNormKernel<GPU_CL, float>::Compute(
DLOG << out_width;
DLOG << *param.OutputY();
cl_int status;
clSetKernelArg(kernel, 0, sizeof(cl_int), &out_width);
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &out_width);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &new_scale);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_bias);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &out);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
template class BatchNormKernel<GPU_CL, float>;
......
......@@ -59,23 +59,31 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
out_cl_tensor.Resize(out->dims());
cl_mem outBuffer = out_cl_tensor.mutable_data<float>();
clSetKernelArg(kernel, 0, sizeof(int), &in_height);
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
clSetKernelArg(kernel, 5, sizeof(int), &size_block);
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
clSetKernelArg(kernel, 7, sizeof(int), &in_ch);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &in_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &in_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(int), &size_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &in_ch);
CL_CHECK_ERRORS(status);
// cl_event wait_event = param.InpdutX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
// printf(" before finish \n");
// clFlsh(this->cl_helper_.CLCommandQueue());
clFinish(this->cl_helper_.CLCommandQueue());
// printf(" after finish \n");
DLOG << "fetch kernel out dims = " << out->dims();
DLOG << "fetch kernel out memory size = " << out->memory_size();
......
......@@ -76,24 +76,26 @@ void InstanceNormKernel<GPU_CL, float>::Compute(
<< " " << local_work_size[2];
cl_int status;
clSetKernelArg(kernel, 0, sizeof(cl_int), &w);
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &w);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 1, sizeof(cl_int), &h);
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &h);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
status = clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL);
}
template class InstanceNormKernel<GPU_CL, float>;
......
......@@ -57,23 +57,38 @@ void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {
const int ksize_h = ksize[0];
const int ksize_w = ksize[1];
clSetKernelArg(kernel, 0, sizeof(cl_int), &in_height);
clSetKernelArg(kernel, 1, sizeof(cl_int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_int), &out_height);
clSetKernelArg(kernel, 3, sizeof(cl_int), &out_width);
clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top);
clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_left);
clSetKernelArg(kernel, 6, sizeof(cl_int), &stride_h);
clSetKernelArg(kernel, 7, sizeof(cl_int), &stride_w);
clSetKernelArg(kernel, 8, sizeof(cl_int), &ksize_h);
clSetKernelArg(kernel, 9, sizeof(cl_int), &ksize_w);
clSetKernelArg(kernel, 10, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 11, sizeof(cl_mem), &out);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &in_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &in_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_left);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_int), &stride_h);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_int), &stride_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(cl_int), &ksize_h);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(cl_int), &ksize_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class PoolKernel<GPU_CL, float>;
......
......@@ -43,8 +43,11 @@ void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
auto outputImage = output->GetCLImage();
// auto tImage =
// const_cast<ReluParam<GPU_CL>&>(param).getMidImage().GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage);
// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage);
// clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage);
......@@ -54,8 +57,9 @@ void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
work_size, NULL, 0, NULL, NULL);
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p1, 3,
// NULL,
// work_size, NULL, 0, NULL, NULL);
......
......@@ -36,14 +36,22 @@ void ScaleKernel<GPU_CL, float>::Compute(const ScaleParam<GPU_CL>& param) {
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
int out_width = (output->dims().size() == 4) ? output->dims()[3] : 1;
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
clSetKernelArg(kernel, 2, sizeof(float), &scale);
clSetKernelArg(kernel, 3, sizeof(float), &bias);
clSetKernelArg(kernel, 4, sizeof(int), &out_width);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(float), &scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(float), &bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &out_width);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class ScaleKernel<GPU_CL, float>;
......
......@@ -32,12 +32,16 @@ void TanhKernel<GPU_CL, float>::Compute(const TanhParam<GPU_CL>& param) {
auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
work_size, NULL, 0, NULL, NULL);
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class TanhKernel<GPU_CL, float>;
......
......@@ -482,73 +482,74 @@ def check_mobile_results(args, fuse, mem_opt):
pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
pp_yellow("paddle mobile results are : ", 1)
pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
if not fuse and not mem_opt:
error_index = None
error_values1 = None
error_values2 = None
checked_names = []
fetch_names = []
for fetch in fetches:
fetch_names.append(fetch.name)
for index in op_cache:
op_output_var_name, op = op_cache[index]
if mem_opt:
found_in_fetch = False
for fetch in fetches:
if op_output_var_name == fetch.name:
found_in_fetch = True
break
if not found_in_fetch:
if not fuse and not mem_opt:
pp_yellow("checking individual ops : ", 1)
error_index = None
error_values1 = None
error_values2 = None
checked_names = []
fetch_names = []
for fetch in fetches:
fetch_names.append(fetch.name)
for index in op_cache:
op_output_var_name, op = op_cache[index]
if mem_opt:
found_in_fetch = False
for fetch in fetches:
if op_output_var_name == fetch.name:
found_in_fetch = True
break
if not found_in_fetch:
continue
if not op_output_var_name in output_var_cache:
continue
if not op_output_var_name in output_var_cache:
continue
if not op_output_var_name in mobile_var_cache:
continue
if fuse or mem_opt:
if op_output_var_name not in fetch_names:
if not op_output_var_name in mobile_var_cache:
continue
values1 = output_var_cache[op_output_var_name]
values2 = mobile_var_cache[op_output_var_name]
shape = get_var_shape(op_output_var_name) if check_shape else []
if len(values1) + len(shape) != len(values2):
error_index = index
for i in range(len(shape)):
v1 = shape[i]
v2 = values2[i]
if v1 != v2:
if fuse or mem_opt:
if op_output_var_name not in fetch_names:
continue
values1 = output_var_cache[op_output_var_name]
values2 = mobile_var_cache[op_output_var_name]
shape = get_var_shape(op_output_var_name) if check_shape else []
if len(values1) + len(shape) != len(values2):
error_index = index
break
if error_index == None:
for i in range(len(values1)):
v1 = values1[i]
v2 = values2[len(shape) + i]
if abs(v1 - v2) > diff_threshold:
for i in range(len(shape)):
v1 = shape[i]
v2 = values2[i]
if v1 != v2:
error_index = index
break
checked_names.append(op_output_var_name)
if error_index != None:
error_values1 = values1
error_values2 = values2
break
if error_index == None:
for name in fetch_names:
if name not in checked_names:
error_index = -1
if error_index == None:
for i in range(len(values1)):
v1 = values1[i]
v2 = values2[len(shape) + i]
if abs(v1 - v2) > diff_threshold:
error_index = index
break
checked_names.append(op_output_var_name)
if error_index != None:
error_values1 = values1
error_values2 = values2
break
if error_index == None:
pp_green("outputs are all correct", 1)
elif error_index == -1:
pp_red("outputs are missing")
else:
error_values1 = np.array(error_values1)
error_values2 = np.array(error_values2)
# pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
pp_red("corresponding fluid op is {}th op, op's type is {}, wrong var name is {}".format(
error_index,op_cache[error_index][1].type,op_output_var_name), 1)
pp_red("fluid results are : ", 1)
pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
pp_yellow("paddle mobile results are : ", 1)
pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
if error_index == None:
for name in fetch_names:
if name not in checked_names:
error_index = -1
break
if error_index == None:
pp_green("outputs are all correct", 1)
elif error_index == -1:
pp_red("outputs are missing")
else:
error_values1 = np.array(error_values1)
error_values2 = np.array(error_values2)
# pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
pp_red("corresponding fluid op is {}th op, op's type is {}, wrong var name is {}".format(
error_index,op_cache[error_index][1].type,op_output_var_name), 1)
pp_red("fluid results are : ", 1)
pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
pp_yellow("paddle mobile results are : ", 1)
pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
# print(output_var_cache)
# print(mobile_var_cache)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册