提交 b4b76b61 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!4727 [MS][LITE][GPU]softmax support Buffer output

Merge pull request !4727 from chenzupeng/master-lite
......@@ -48,3 +48,57 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t
slices_count++;
} while (slices_count < slices_x32);
}
__kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *output, const float4 mask, const int slices,
const int slices_x32) {
int tid = get_local_id(0);
float sum = 0.0f;
for (size_t i = tid; i < slices - 1; i += 32) {
float4 src = read_imagef(input, smp_none, (int2)(i, 0));
sum += dot((float4)(1.0f), exp(src));
}
if ((slices - 1) % 32 == tid) {
float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0));
sum += dot(mask, exp(src));
}
__local float4 tmp[8];
__local float *tmpx1 = (__local float *)tmp;
tmpx1[tid] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) {
sum = dot((float4)(1.0f), tmp[0]);
sum += dot((float4)(1.0f), tmp[1]);
sum += dot((float4)(1.0f), tmp[2]);
sum += dot((float4)(1.0f), tmp[3]);
sum += dot((float4)(1.0f), tmp[4]);
sum += dot((float4)(1.0f), tmp[5]);
sum += dot((float4)(1.0f), tmp[6]);
sum += dot((float4)(1.0f), tmp[7]);
tmpx1[0] = 1.0f / sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = tmpx1[0];
for (size_t i = tid; i < slices - 1; i += 32) {
float4 result = read_imagef(input, smp_none, (int2)(i, 0));
result = exp(result) * sum;
output[i] = result;
}
if ((slices - 1) % 32 == tid) {
float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0));
result = exp(result) * sum;
__global float4 *remain_ptr4 = output;
remain_ptr4 += slices - 1;
__global float *remain_ptr = (__global float *)remain_ptr4;
remain_ptr[0] = result.x;
if (mask.y > 0.f) {
remain_ptr[1] = result.y;
}
if (mask.z > 0.f) {
remain_ptr[2] = result.z;
}
if (mask.w > 0.f) {
remain_ptr[3] = result.w;
}
}
}
......@@ -62,7 +62,7 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
int h = shapex[1];
int w = shapex[2];
int c = shapex[3];
im_dst_x = UP_DIV(w * c, C4NUM);
im_dst_x = w * UP_DIV(c, C4NUM);
im_dst_y = h;
#ifdef ENABLE_FP16
size_t img_dtype = CL_HALF_FLOAT;
......
......@@ -90,7 +90,8 @@ int SoftmaxOpenCLKernel::Init() {
std::string program_name = "SoftMax";
std::string source = softmax_source_fp32;
runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
// framework not set this param yet! just use default.
parameter_->axis_ = 1;
if (in_tensors_[0]->shape().size() == 4 && parameter_->axis_ == 3) {
// support 4d tensor
onexone_flag_ = false;
......@@ -106,7 +107,10 @@ int SoftmaxOpenCLKernel::Init() {
#ifdef PROGRAM_WITH_IL
runtime_->CreateKernelFromIL(kernel_(), kernel_name);
#else
if (mem_type_ == MEM_TYPE::BUF) {
if (!is_image_out_) {
out_mem_type_ = OpenCLMemType::BUF;
}
if (out_mem_type_ == OpenCLMemType::BUF) {
kernel_name += "_BUF";
program_name += "_BUF";
} else {
......@@ -119,6 +123,10 @@ int SoftmaxOpenCLKernel::Init() {
#endif
ori_format_ = out_tensors_[0]->GetFormat();
out_tensors_[0]->SetFormat(schema::Format_NHWC4);
if (!is_image_out_) {
ori_format_ = schema::Format_NC;
out_tensors_[0]->SetFormat(schema::Format_NC);
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return lite::RET_OK;
}
......
......@@ -41,17 +41,17 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
int InitGlobalSize();
int SetWorkGroupSize1x1();
int SetWorkGroupSize();
std::vector<float> GetMaskForLastChannel(int channels);
std::vector<float> GetMaskForLastChannel(int channels);
private:
cl::Kernel kernel_;
SoftmaxParameter *parameter_;
lite::opencl::OpenCLRuntime *runtime_;
enum class MEM_TYPE { BUF, IMG } mem_type_{MEM_TYPE::IMG};
bool onexone_flag_{false};
std::vector<size_t> local_size_;
std::vector<size_t> global_size_;
bool is_image_out_{false};
};
} // namespace mindspore::kernel
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册