提交 641601f7 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!4139 add image2d format for concat op

Merge pull request !4139 from pengyongrong/concat_debug_pr
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void Concat(__global float *input0, __global float *input1, __global float *output, const int4 input_shape0, #define FLT4 float4
const int4 input_shape1, const int4 output_shape, const int axis) { __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
uint oh = get_global_id(0); __kernel void Concat(__write_only image2d_t output_image2d, __read_only image2d_t input0_image2d,
uint ow = get_global_id(1); __read_only image2d_t input1_image2d, int2 shared_int0, int4 shared_out) {
uint oc = get_global_id(2); int X = get_global_id(0); // H
uint index_output; int Y = get_global_id(1); // W
uint input_idx; int S = 0;
if ((oh >= output_shape.y || oh < 0) || (ow >= output_shape.z || ow < 0) || (oc >= output_shape.w || oc < 0)) { if (X >= shared_out.y || Y >= shared_out.z) return;
return; for (int i = 0; i < shared_int0.x; i++) {
} FLT4 result0 = read_imagef(input0_image2d, smp_none, (int2)((Y)*shared_int0.x + (i), (X)));
if (axis == 3) { write_imagef(output_image2d, (int2)((Y)*shared_out.w + (S), (X)), result0);
index_output = oh * output_shape.z * output_shape.w + ow * output_shape.w + oc; S++;
if (oc < input_shape0.w) {
input_idx = (input_shape0.z * oh + ow) * input_shape0.w + oc;
output[index_output] = input0[input_idx];
} else if ((input_shape0.w <= oc) && oc < (input_shape0.w + input_shape1.w)) {
input_idx = (input_shape1.z * oh + ow) * input_shape1.w + (oc - input_shape0.w);
output[index_output] = input1[input_idx];
} else {
output[index_output] = 0;
} }
for (int i = 0; i < shared_int0.y; i++) {
FLT4 result1 = read_imagef(input1_image2d, smp_none, (int2)((Y)*shared_int0.y + (i), (X)));
write_imagef(output_image2d, (int2)((Y)*shared_out.w + (S), (X)), result1);
S++;
} }
} }
__kernel void Concat3input(__global float *input0, __global float *input1, __global float *input2, __kernel void Concat3input(__write_only image2d_t output_image2d, __read_only image2d_t input0_image2d,
__global float *output, const int4 input_shape0, const int4 input_shape1, __read_only image2d_t input1_image2d, __read_only image2d_t input2_image2d, int3 shared_int0,
const int4 input_shape2, const int4 output_shape, const int axis) { int4 shared_out) {
uint oh = get_global_id(0); int X = get_global_id(0); // H
uint ow = get_global_id(1); int Y = get_global_id(1); // W
uint oc = get_global_id(2); int S = 0;
uint index_output; if (X >= shared_out.y || Y >= shared_out.z) return;
uint input_idx; for (int i = 0; i < shared_int0.x; i++) {
if ((oh >= output_shape.y || oh < 0) || (ow >= output_shape.z || ow < 0) || (oc >= output_shape.w || oc < 0)) { FLT4 result0 = read_imagef(input0_image2d, smp_none, (int2)((Y)*shared_int0.x + (i), (X)));
return; write_imagef(output_image2d, (int2)((Y)*shared_out.w + (S), (X)), result0);
} S++;
index_output = oh * output_shape.z * output_shape.w + ow * output_shape.w + oc;
if (oc < (input_shape0.w + input_shape1.w)) {
if (oc < input_shape0.w) {
input_idx = (input_shape0.z * oh + ow) * input_shape0.w + oc;
output[index_output] = input0[input_idx];
} else {
input_idx = (input_shape1.z * oh + ow) * input_shape1.w + (oc - input_shape0.w);
output[index_output] = input1[input_idx];
} }
} else { for (int i = 0; i < shared_int0.y; i++) {
if ((input_shape0.w + input_shape1.w + input_shape2.w) <= oc) { FLT4 result1 = read_imagef(input1_image2d, smp_none, (int2)((Y)*shared_int0.y + (i), (X)));
output[index_output] = 0; write_imagef(output_image2d, (int2)((Y)*shared_out.w + (S), (X)), result1);
} else { S++;
input_idx = (input_shape2.z * oh + ow) * input_shape2.w + (oc - input_shape0.w - input_shape1.w);
output[index_output] = input2[input_idx];
} }
for (int i = 0; i < shared_int0.z; i++) {
FLT4 result2 = read_imagef(input2_image2d, smp_none, (int2)((Y)*shared_int0.z + (i), (X)));
write_imagef(output_image2d, (int2)((Y)*shared_out.w + (S), (X)), result2);
S++;
} }
} }
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#include <cstring>
#include <string> #include <string>
#include <algorithm> #include <algorithm>
#include <set> #include <set>
...@@ -27,6 +28,26 @@ using mindspore::schema::PrimitiveType_Concat; ...@@ -27,6 +28,26 @@ using mindspore::schema::PrimitiveType_Concat;
namespace mindspore::kernel { namespace mindspore::kernel {
// Reports the Image2D layout used for this kernel's output tensor.
//
// On return *img_size holds exactly three entries: {image width, image height,
// channel data type}. The output channel count is rounded up to whole groups
// of C4NUM; when the first input is NHWC4 those groups are folded into the
// image width, otherwise they are folded into the image height.
// `idx` is not consulted by this implementation.
// Always returns 1.
int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  const size_t channel_slices = UP_DIV(outputs_[0]->Channel(), C4NUM);
  const bool slices_along_width = inputs_[0]->GetFormat() == schema::Format_NHWC4;

  size_t image_width = outputs_[0]->Width();
  size_t image_height = outputs_[0]->Height();
  if (slices_along_width) {
    image_width = image_width * channel_slices;
  } else {
    image_height = image_height * channel_slices;
  }

  // Channel data type follows the build-time precision switch.
#ifdef ENABLE_FP16
  const size_t pixel_dtype = CL_HALF_FLOAT;
#else
  const size_t pixel_dtype = CL_FLOAT;
#endif

  // assign() replaces any previous contents, so no separate clear() is needed.
  img_size->assign({image_width, image_height, pixel_dtype});
  return 1;
}
int ConcatOpenCLKernel::Init() { int ConcatOpenCLKernel::Init() {
if (inputs_[0]->shape().size() != 4) { if (inputs_[0]->shape().size() != 4) {
MS_LOG(ERROR) << "only support dim=4"; MS_LOG(ERROR) << "only support dim=4";
...@@ -132,72 +153,45 @@ int ConcatOpenCLKernel::Run() { ...@@ -132,72 +153,45 @@ int ConcatOpenCLKernel::Run() {
} }
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
std::vector<size_t> local; MS_LOG(INFO) << " judge the numbers of input vector";
std::vector<size_t> global;
if (inputs_.size() == 2) {
auto input0_shape = inputs_[0]->shape(); auto input0_shape = inputs_[0]->shape();
auto input1_shape = inputs_[1]->shape(); auto input1_shape = inputs_[1]->shape();
auto input2_shape = inputs_[2]->shape();
auto output_shape = outputs_[0]->shape(); auto output_shape = outputs_[0]->shape();
cl_int4 input0_shape_ = {input0_shape[0], input0_shape[1], input0_shape[2], input0_shape[3]}; cl_int2 input0_shape2_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4)}; // change
cl_int4 input1_shape_ = {input1_shape[0], input1_shape[1], input1_shape[2], input1_shape[3]}; cl_int3 input0_shape3_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4),
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], output_shape[3]}; DivideRoundUp(input2_shape[3], 4)};
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], DivideRoundUp(output_shape[3], 4)};
uint32_t OH = output_shape[0] * output_shape[1]; // N*H uint32_t OH = output_shape[0] * output_shape[1]; // N*H
uint32_t OW = output_shape[2]; uint32_t OW = output_shape[2];
uint32_t OC = output_shape[3];
global = {OH, OW, OC}; // HWC std::vector<size_t> local = {1, 1};
ConcatGetWorkGroup(global, &local, 384); std::vector<size_t> global = {OH, OW};
std::cout << "local size=:" << std::endl; // ConcatGetWorkGroup(global, &local, 512);
for (int i = 0; i < local.size(); i++) {
std::cout << local[i] << " ";
}
std::cout << std::endl;
int arg_cn = 0; int arg_cn = 0;
if (inputs_.size() == 2) {
MS_LOG(INFO) << " SetKernelArg";
ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape2_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input1_shape_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_); } else if (inputs_.size() == 3) {
} MS_LOG(INFO) << " SetKernelArg";
if (inputs_.size() == 3) { ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
auto input0_shape = inputs_[0]->shape();
auto input1_shape = inputs_[1]->shape();
auto input2_shape = inputs_[2]->shape();
auto output_shape = outputs_[0]->shape();
cl_int4 input0_shape_ = {input0_shape[0], input0_shape[1], input0_shape[2], input0_shape[3]};
cl_int4 input1_shape_ = {input1_shape[0], input1_shape[1], input1_shape[2], input1_shape[3]};
cl_int4 input2_shape_ = {input2_shape[0], input2_shape[1], input2_shape[2], input2_shape[3]};
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], output_shape[3]};
uint32_t OH = output_shape[0] * output_shape[1]; // N*H
uint32_t OW = output_shape[2];
uint32_t OC = output_shape[3];
global = {OH, OW, OC}; // HWC
ConcatGetWorkGroup(global, &local, 384);
std::cout << "local size=:" << std::endl;
for (int i = 0; i < local.size(); i++) {
std::cout << local[i] << " ";
}
std::cout << std::endl;
int arg_cn = 0;
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[2]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[2]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape3_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input1_shape_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input2_shape_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
} }
ocl_runtime->RunKernel(kernel_, global, local, nullptr); ocl_runtime->RunKernel(kernel_, global, local, nullptr);
return 0; return 0;
} } // namespace mindspore::kernel
kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const std::vector<lite::tensor::Tensor *> &outputs,
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
* limitations under the License. * limitations under the License.
*/ */
#ifndef MINDSPORE_LITE_SRC_BACKEND_OPENCL_Concat_H_ #ifndef MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONCAT_H_
#define MINDSPORE_LITE_SRC_BACKEND_OPENCL_Concat_H_ #define MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONCAT_H_
#include <vector> #include <vector>
#include "ir/anf.h" #include "ir/anf.h"
...@@ -25,11 +25,11 @@ ...@@ -25,11 +25,11 @@
namespace mindspore::kernel { namespace mindspore::kernel {
class ConcatOpenCLKernel : public LiteKernel { class ConcatOpenCLKernel : public OpenCLKernel {
public: public:
explicit ConcatOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, explicit ConcatOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs) const std::vector<lite::tensor::Tensor *> &outputs)
: LiteKernel(parameter, inputs, outputs) {} : OpenCLKernel(parameter, inputs, outputs) {}
~ConcatOpenCLKernel() override{}; ~ConcatOpenCLKernel() override{};
...@@ -40,6 +40,7 @@ class ConcatOpenCLKernel : public LiteKernel { ...@@ -40,6 +40,7 @@ class ConcatOpenCLKernel : public LiteKernel {
int Run_axis0(); int Run_axis0();
int Run() override; int Run() override;
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
private: private:
cl::Kernel kernel_; cl::Kernel kernel_;
......
...@@ -69,8 +69,8 @@ void *OpenCLAllocator::Malloc(size_t size) { ...@@ -69,8 +69,8 @@ void *OpenCLAllocator::Malloc(size_t size) {
host_ptr = clSVMAlloc((*ocl_runtime->Context())(), flags, size, 0); host_ptr = clSVMAlloc((*ocl_runtime->Context())(), flags, size, 0);
} else { } else {
cl_int ret = CL_SUCCESS; cl_int ret = CL_SUCCESS;
cl::Buffer *buffer = cl::Buffer *buffer = new cl::Buffer(*ocl_runtime->Context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
new cl::Buffer(*ocl_runtime->Context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret); size, NULL, &ret);
if (ret != CL_SUCCESS) { if (ret != CL_SUCCESS) {
MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")"; MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";
UnLock(); UnLock();
...@@ -125,8 +125,8 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t>& img_size) ...@@ -125,8 +125,8 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t>& img_size)
cl_int ret = CL_SUCCESS; cl_int ret = CL_SUCCESS;
// CL_HALF_FLOAT, CL_FLOAT // CL_HALF_FLOAT, CL_FLOAT
cl::ImageFormat image_format(CL_RGBA, img_size[2]); cl::ImageFormat image_format(CL_RGBA, img_size[2]);
cl::Image2D *buffer = new cl::Image2D(*ocl_runtime->Context(), CL_MEM_READ_WRITE, cl::Image2D *buffer = new cl::Image2D(*ocl_runtime->Context(), CL_MEM_READ_WRITE, image_format,
image_format, img_size[0], img_size[1], 0, nullptr, &ret); img_size[0], img_size[1], 0, nullptr, &ret);
if (ret != CL_SUCCESS) { if (ret != CL_SUCCESS) {
MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")"; MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")";
UnLock(); UnLock();
...@@ -164,20 +164,26 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v ...@@ -164,20 +164,26 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v
auto iter = free_list_.lower_bound(size); auto iter = free_list_.lower_bound(size);
if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) {
auto mem_buf = iter->second; auto mem_buf = iter->second;
bool is_match{mem_buf->img_size.size() == img_size.size()};
for (int i = 0; i < img_size.size() && is_match; ++i) {
is_match = img_size[i] == mem_buf->img_size[i];
}
if (is_match) {
free_list_.erase(iter); free_list_.erase(iter);
allocated_list_[mem_buf->host_ptr_] = mem_buf; allocated_list_[mem_buf->host_ptr_] = mem_buf;
UnLock(); UnLock();
MS_LOG(DEBUG) << "Malloc Image2D from free list. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_ MS_LOG(DEBUG) << "Malloc Image2D from free list. size: " << mem_buf->size_
<< ", device addr: " << mem_buf->device_ptr_; << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_;
return mem_buf->host_ptr_; return mem_buf->host_ptr_;
} }
}
void *host_ptr = nullptr; void *host_ptr = nullptr;
void *device_ptr = nullptr; void *device_ptr = nullptr;
cl_int ret = CL_SUCCESS; cl_int ret = CL_SUCCESS;
// CL_HALF_FLOAT, CL_FLOAT // CL_HALF_FLOAT, CL_FLOAT
cl::ImageFormat image_format(CL_RGBA, img_size[2]); cl::ImageFormat image_format(CL_RGBA, img_size[2]);
cl::Image2D *buffer = new cl::Image2D(*ocl_runtime->Context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_format, cl::Image2D *buffer = new cl::Image2D(*ocl_runtime->Context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
img_size[0], img_size[1], 0, data, &ret); image_format, img_size[0], img_size[1], 0, data, &ret);
if (ret != CL_SUCCESS) { if (ret != CL_SUCCESS) {
MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")"; MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")";
UnLock(); UnLock();
...@@ -372,4 +378,3 @@ int OpenCLAllocator::GetImageSize(void *host_ptr, std::vector<size_t>* img_size) ...@@ -372,4 +378,3 @@ int OpenCLAllocator::GetImageSize(void *host_ptr, std::vector<size_t>* img_size)
} }
} // namespace mindspore::lite::opencl } // namespace mindspore::lite::opencl
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h"
int DivideRoundUp(int n, int div) { int DivideRoundUp(int n, int div) {
int q = n / div; int q = n / div;
return n % div == 0 ? q : q + 1; return n % div == 0 ? q : q + 1;
...@@ -77,15 +76,26 @@ void ConcatComputeByCPU_3input_dim4_axis3(float *input0, float *input1, float *i ...@@ -77,15 +76,26 @@ void ConcatComputeByCPU_3input_dim4_axis3(float *input0, float *input1, float *i
postion = i * output_shape[1] * output_shape[2] * output_shape[3] + j * output_shape[2] * output_shape[3] + postion = i * output_shape[1] * output_shape[2] * output_shape[3] + j * output_shape[2] * output_shape[3] +
k * output_shape[3]; k * output_shape[3];
for (int w = 0; w < output_shape[3]; w++) { for (int w = 0; w < output_shape[3]; w++) {
if (w < input_shape0[3] + input_shape1[3]) { if (w < input_shape0[3]) {
output[postion++] = (w < input_shape0[3]) ? input0[index0++] : input1[index1++]; int align = DivideRoundUp(input_shape0[3], 4) * 4;
index0 = i * input_shape0[1] * input_shape0[2] * align + j * input_shape0[2] * align + k * align + w;
output[postion++] = input0[index0];
} else if (w >= input_shape0[3] && w < (input_shape0[3] + input_shape1[3])) {
int align = DivideRoundUp(input_shape1[3], 4) * 4;
index1 = i * input_shape1[1] * input_shape1[2] * align + j * input_shape1[2] * align + k * align + w -
input_shape0[3];
output[postion++] = input1[index1];
} else if ((input_shape0[3] + input_shape1[3]) <= w && } else if ((input_shape0[3] + input_shape1[3]) <= w &&
w < (input_shape0[3] + input_shape1[3] + input_shape2[3])) { w < (input_shape0[3] + input_shape1[3] + input_shape2[3])) {
output[postion++] = input2[index2++]; int align = DivideRoundUp(input_shape2[3], 4) * 4;
index2 = i * input_shape2[1] * input_shape2[2] * align + j * input_shape2[2] * align + k * align + w -
input_shape0[3] - input_shape1[3];
output[postion++] = input2[index2];
} else { } else {
for (int ind = input_shape0[3] + input_shape1[3]; ind < output_shape[3]; ind++) { for (int ind = input_shape0[3] + input_shape1[3] + input_shape2[3]; ind < output_shape[3]; ind++) {
output[postion++] = 0; output[postion++] = 0;
} }
break;
} }
} }
} }
...@@ -96,18 +106,31 @@ void ConcatComputeByCPU_3input_dim4_axis3(float *input0, float *input1, float *i ...@@ -96,18 +106,31 @@ void ConcatComputeByCPU_3input_dim4_axis3(float *input0, float *input1, float *i
namespace mindspore { namespace mindspore {
class TestConcatOpenCL : public mindspore::Common { class TestConcatOpenCL : public mindspore::Common {
public: public:
TestConcatOpenCL(){} TestConcatOpenCL() {}
}; };
// Element-wise comparison of two buffers for use inside a gtest test body.
//
// For each of the first `size` elements, asserts that the absolute difference
// between output_data[i] and correct_data[i] does not exceed err_bound.
// ASSERT_LE is used, so the first failing element ends the comparison.
//
// output_data   buffer produced by the code under test
// correct_data  reference buffer
// size          number of elements to compare; zero or negative compares nothing
// err_bound     maximum tolerated absolute difference per element
template <typename T>
void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) {
  // The index is signed to match `size`: an unsigned index would convert a
  // negative size into a huge bound and walk far past the end of both buffers.
  for (int i = 0; i < size; ++i) {
    // Named `diff` rather than `abs` so the library function is not shadowed.
    const T diff = fabs(output_data[i] - correct_data[i]);
    ASSERT_LE(diff, err_bound);
  }
}
TEST_F(TestConcatOpenCL, ConcatFp32_2input_dim4_axis3) { TEST_F(TestConcatOpenCL, ConcatFp32_2input_dim4_axis3) {
MS_LOG(INFO) << "begin test"; MS_LOG(INFO) << "begin test";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init(); ocl_runtime->Init();
auto allocator = ocl_runtime->GetAllocator();
MS_LOG(INFO) << "init tensors"; MS_LOG(INFO) << "init tensors";
constexpr int INPUT_NUM = 3; constexpr int INPUT_NUM = 2;
std::array<std::vector<int>, INPUT_NUM> input_shapes = { // std::array<std::vector<int>, INPUT_NUM> input_shapes = {
std::vector<int>{1, 240, 240, 16}, std::vector<int>{1, 240, 240, 16}, std::vector<int>{1, 240, 240, 64}}; // std::vector<int>{1, 120, 120, 16}, std::vector<int>{1, 120, 120, 16},std::vector<int>{1, 120, 120, 96}};
std::vector<int> output_shape = {1, 240, 240, 96}; std::array<std::vector<int>, INPUT_NUM> input_shapes = {std::vector<int>{1, 32, 512, 48},
std::vector<int>{1, 32, 512, 48}};
std::vector<int> output_shape = {1, 32, 512, 96};
output_shape[3] = DivideRoundUp(output_shape[3], 4) * 4; output_shape[3] = DivideRoundUp(output_shape[3], 4) * 4;
auto data_type = kNumberTypeFloat32; auto data_type = kNumberTypeFloat32;
auto tensor_type = schema::NodeType_ValueNode; auto tensor_type = schema::NodeType_ValueNode;
...@@ -118,32 +141,30 @@ TEST_F(TestConcatOpenCL, ConcatFp32_2input_dim4_axis3) { ...@@ -118,32 +141,30 @@ TEST_F(TestConcatOpenCL, ConcatFp32_2input_dim4_axis3) {
auto *output_tensor = new lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type); auto *output_tensor = new lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
std::vector<lite::tensor::Tensor *> outputs{output_tensor}; std::vector<lite::tensor::Tensor *> outputs{output_tensor};
std::cout << "input_shapes size=: " << input_shapes.size() << std::endl; std::cout << "input_shapes size=: " << input_shapes.size() << std::endl;
MS_LOG(INFO) << "initialize tensors";
std::cout << "initialize tensors";
auto param = new ConcatParameter(); auto param = new ConcatParameter();
param->axis_ = 3; param->axis_ = 3;
auto *concat_kernel = new kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); auto *concat_kernel = new kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
concat_kernel->Init(); concat_kernel->Init();
MS_LOG(INFO) << "initialize sub_graph"; MS_LOG(INFO) << "initialize sub_graph";
std::vector<kernel::LiteKernel *> kernels{concat_kernel}; std::vector<kernel::LiteKernel *> kernels{concat_kernel};
auto *sub_graph = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); auto *sub_graph = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
// to do allocate memory for inputs and outputs
for (auto &input_tensor : inputs) {
input_tensor->MallocData(allocator);
}
sub_graph->Init(); sub_graph->Init();
unsigned int seed = 123;
MS_LOG(INFO) << "initialize input data"; MS_LOG(INFO) << "initialize input data";
srand(time(NULL));
for (auto &input_tensor : inputs) { for (auto &input_tensor : inputs) {
auto input_data = reinterpret_cast<float *>(input_tensor->Data()); auto input_data = reinterpret_cast<float *>(input_tensor->Data());
static unsigned int seed = 123;
for (int i = 0; i < input_tensor->ElementsNum(); ++i) { for (int i = 0; i < input_tensor->ElementsNum(); ++i) {
input_data[i] = static_cast<float>(rand_r(&seed) % 10 + 1); input_data[i] = static_cast<float>(rand_r(&seed) % 10 + 1);
} }
printf("\n");
} }
MS_LOG(INFO) << "==================output data================"; // compute the result for CPU
sub_graph->Run();
auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->Data());
printf("\n");
auto *input_data0 = reinterpret_cast<float *>(inputs[0]->Data()); auto *input_data0 = reinterpret_cast<float *>(inputs[0]->Data());
auto *input_data1 = reinterpret_cast<float *>(inputs[1]->Data()); auto *input_data1 = reinterpret_cast<float *>(inputs[1]->Data());
std::vector<float> output_data_cpu(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]); std::vector<float> output_data_cpu(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]);
...@@ -156,8 +177,10 @@ TEST_F(TestConcatOpenCL, ConcatFp32_2input_dim4_axis3) { ...@@ -156,8 +177,10 @@ TEST_F(TestConcatOpenCL, ConcatFp32_2input_dim4_axis3) {
ConcatComputeByCPU_3input_dim4_axis3(input_data0, input_data1, input_data2, output_data_cpu.data(), input_shapes[0], ConcatComputeByCPU_3input_dim4_axis3(input_data0, input_data1, input_data2, output_data_cpu.data(), input_shapes[0],
input_shapes[1], input_shapes[2], output_shape, param->axis_); input_shapes[1], input_shapes[2], output_shape, param->axis_);
} }
printf("\n");
CompareOutputData(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001); std::cout << "==================output data================" << std::endl;
MS_LOG(INFO) << "Testconcat passed"; sub_graph->Run();
auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->Data());
CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001);
} }
} // namespace mindspore } // namespace mindspore
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册