// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "lite/kernels/cuda/softmax_compute.h" #include #include #include #include #include #include namespace paddle { namespace lite { namespace kernels { namespace cuda { using Tensor = lite::Tensor; using DDim = lite::DDim; template static void softmax_compute_ref(const operators::SoftmaxParam& param) { const dtype* x_data = param.x->mutable_data(); dtype* output_data = param.output->mutable_data(); DDim x_dims = param.x->dims(); ASSERT_EQ(x_dims.data(), param.output->dims().data()); auto x_rank = x_dims.size(); int axis = param.axis; if (axis < 0) { axis += x_rank; } int axis_size = x_dims[axis]; int outer_num = x_dims.Slice(0, axis).production(); int inner_num = x_dims.Slice(axis + 1, x_rank).production(); int compute_size = outer_num * inner_num; for (int i = 0; i < compute_size; i++) { int idx_inner = i % inner_num; int idx_outer = (i / inner_num) * axis_size; int start = idx_outer * inner_num + idx_inner; int offset; offset = start; dtype max_data = std::numeric_limits::lowest(); for (int j = 0; j < axis_size; j++) { max_data = x_data[offset] > max_data ? x_data[offset] : max_data; offset += inner_num; } offset = start; dtype sum_data = (dtype)0; for (int j = 0; j < axis_size; j++) { output_data[offset] = exp(x_data[offset] - max_data); sum_data += output_data[offset]; offset += inner_num; } offset = start; for (int j = 0; j < axis_size; j++) { output_data[offset] /= sum_data; offset += inner_num; } } } TEST(softmax_cuda, compute) { std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); cudaStream_t stream; cudaStreamCreate(&stream); context.SetExecStream(stream); SoftmaxCompute softmax; operators::SoftmaxParam param; softmax.SetContext(std::move(ctx)); lite::Tensor x; lite::Tensor x_cpu; lite::Tensor output; lite::Tensor output_cpu; lite::Tensor output_ref; for (auto n : {1, 3}) { for (auto c : {1, 4}) { for (auto h : {5, 1, 112}) { for (auto w : {1, 6, 112}) { for (auto axis : {-2, -1, 0, 1, 2}) { x.Resize({n, c, h, w}); x_cpu.Resize({n, c, h, w}); output.Resize({n, c, h, w}); output_cpu.Resize({n, c, h, w}); output_ref.Resize({n, c, h, w}); auto* x_cpu_data = x_cpu.mutable_data(); auto* output_data = output.mutable_data(TARGET(kCUDA)); auto* output_cpu_data = output_ref.mutable_data(); auto* output_ref_data = output_ref.mutable_data(); for (int i = 0; i < x.dims().production(); i++) { x_cpu_data[i] = i; } x.Assign(x_cpu_data, x_cpu.dims()); param.x = &x; param.axis = axis; param.output = &output; softmax.SetParam(param); softmax.Launch(); param.x = &x_cpu; param.output = &output_ref; softmax_compute_ref(param); cudaDeviceSynchronize(); CopySync(output_cpu_data, output_data, sizeof(float) * output.numel(), IoDirection::DtoH); for (int i = 0; i < output.dims().production(); i++) { EXPECT_NEAR(output_cpu_data[i], output_ref_data[i], 1e-5); } } } } } } } } // namespace cuda } // namespace kernels } // namespace lite } // namespace paddle