diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index 3e92a69c54072faed5cf3cadd7f84221873285a3..86203657aeab36ab3083bee75be420705e1a78b7 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -47,6 +47,7 @@ add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_ma add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) add_kernel(var_conv_2d_compute_cuda CUDA extra SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(topk_pooling_compute_cuda CUDA extra SRCS topk_pooling_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(assign_value_compute_cuda CUDA extra SRCS assign_value_compute.cu DEPS ${lite_kernel_deps}) # unit test lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) @@ -85,4 +86,5 @@ if(LITE_BUILD_EXTRA) nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) #nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda) nv_test(topk_pooling_compute_cuda_test SRCS topk_pooling_compute_test.cc DEPS topk_pooling_compute_cuda) + nv_test(assign_value_compute_cuda_test SRCS assign_value_compute_test.cc DEPS assign_value_compute_cuda) endif() diff --git a/lite/kernels/cuda/assign_value_compute.cu b/lite/kernels/cuda/assign_value_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..89f2937f10399361951c3c8deb47e3700f93e288 --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.cu @@ -0,0 +1,76 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/kernels/cuda/assign_value_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void TensorFromVector(const std::vector& src, + lite::Tensor* dst, + cudaStream_t* stream) { + auto* src_ptr = static_cast(src.data()); + auto* dst_ptr = static_cast(dst->mutable_data(TARGET(kCUDA))); + auto size = src.size() * sizeof(T); + TargetWrapperCuda::MemcpyAsync( + dst_ptr, src_ptr, size, IoDirection::HtoD, *stream); +} + +void AssignValueCompute::Run() { + auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + int dtype = param.dtype; + std::vector fp32_values = param.fp32_values; + std::vector int32_values = param.int32_values; + std::vector int64_values = param.int64_values; + std::vector bool_values = param.bool_values; + auto* out = param.Out; + + if (dtype == static_cast(lite::core::FluidType::INT32)) { + TensorFromVector(int32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::FP32)) { + TensorFromVector(fp32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::INT64)) { + TensorFromVector(int64_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::BOOL)) { + TensorFromVector(bool_values, out, &stream); + } else { + LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype; + } + return; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(assign_value, + kCUDA, + kAny, + kNCHW, + paddle::lite::kernels::cuda::AssignValueCompute, + def) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/cuda/assign_value_compute.h b/lite/kernels/cuda/assign_value_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c334e36d8061437881a4ea67d960f87b7ffb3ceb --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.h @@ -0,0 +1,34 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueCompute : public KernelLite { + public: + using param_t = operators::AssignValueParam; + + void Run() override; + virtual ~AssignValueCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/assign_value_compute_test.cc b/lite/kernels/cuda/assign_value_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d0eb0cc9da34b7ad9408c9bad84d6085d5b710a3 --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/assign_value_compute.h" + +#include + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueTest : public ::testing::Test { + protected: + AssignValueTest() : dtype(5), shape({1}) { + int num = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + fp32_values.resize(num); + int32_values.resize(num); + int64_values.resize(num); + bool_values.resize(num); + for (int i = 0; i < num; ++i) { + fp32_values[i] = i + 5; + int32_values[i] = i; + int64_values[i] = i; + bool_values[i] = i; + } + std::vector out_shape(shape.size(), 0); + for (size_t i = 0; i < shape.size(); ++i) out_shape[i] = shape[i]; + Out_ref.Resize(lite::DDim(out_shape)); + Out_gpu.Resize(Out_ref.dims()); + Out_cpu.Resize(Out_ref.dims()); + + cpu_base(&Out_ref); + + device_init(); + } + + void device_init() { + ctx.reset(new KernelContext); + cudaStreamCreate(&stream); + param.shape = shape; + param.dtype = dtype; + param.fp32_values = fp32_values; + param.int32_values = int32_values; + param.int64_values = int64_values; + param.bool_values = bool_values; + param.Out = &Out_gpu; + } + + void float_data_init() {} + + void half_data_init() {} + + void cpu_base(lite::Tensor* Out) { + if (dtype == static_cast(lite::core::FluidType::INT32)) { + for (size_t i = 0; i < int32_values.size(); ++i) { + Out->mutable_data()[i] = int32_values[i]; + } + } else if (dtype == static_cast(lite::core::FluidType::FP32)) { + for (size_t i = 0; i < fp32_values.size(); ++i) { + Out->mutable_data()[i] = fp32_values[i]; + } + } else if (dtype == static_cast(lite::core::FluidType::INT64)) { + for (size_t i = 0; i < int64_values.size(); ++i) { + Out->mutable_data()[i] = int64_values[i]; + } + } else if (dtype == static_cast(lite::core::FluidType::BOOL)) { + for (size_t i = 0; i < bool_values.size(); ++i) { + Out->mutable_data()[i] = bool_values[i]; + } + } else { + LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype; + } + } + + int dtype; + std::vector shape; + std::vector fp32_values; + std::vector int32_values; + std::vector int64_values; + std::vector bool_values; + + lite::Tensor Out_ref; + lite::Tensor Out_gpu; + lite::Tensor Out_cpu; + + operators::AssignValueParam param; + std::unique_ptr ctx; + cudaStream_t stream; +}; + +TEST_F(AssignValueTest, fp32) { + float_data_init(); + auto& context = ctx->As(); + context.SetExecStream(stream); + AssignValueCompute kernel; + kernel.SetParam(param); + kernel.SetContext(std::move(ctx)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(Out_cpu.mutable_data(), + Out_gpu.data(), + sizeof(float) * Out_gpu.numel(), + IoDirection::DtoH); + for (int i = 0; i < Out_gpu.numel(); ++i) { + EXPECT_NEAR(Out_cpu.data()[i], Out_ref.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/operators/assign_value_op.cc b/lite/operators/assign_value_op.cc index ff5b55735f7b58aa2eaa2274574336dadd8061e6..f6f8cb7e3c8958693dd7234b7a21b29b769aa96c 100644 --- a/lite/operators/assign_value_op.cc +++ b/lite/operators/assign_value_op.cc @@ -26,12 +26,15 @@ bool AssignValueOpLite::CheckShape() const { auto shape = param_.shape; auto int32_values = param_.int32_values; auto fp32_values = param_.fp32_values; + auto int64_values = param_.int64_values; + auto bool_values = param_.bool_values; size_t shape_num = 1; - for (int i = 0; i < shape.size(); i++) { + for (size_t i = 0; i < shape.size(); i++) { shape_num *= shape[i]; } - CHECK_OR_FALSE(shape_num == int32_values.size() || - shape_num == fp32_values.size()); + CHECK_OR_FALSE( + shape_num == int32_values.size() || shape_num == fp32_values.size() || + shape_num == int64_values.size() || shape_num == bool_values.size()); return true; } @@ -47,9 +50,18 @@ bool AssignValueOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { param_.shape = op_desc.GetAttr>("shape"); param_.dtype = op_desc.GetAttr("dtype"); - param_.fp32_values = op_desc.GetAttr>("fp32_values"); - param_.int32_values = op_desc.GetAttr>("int32_values"); - + if (op_desc.HasAttr("fp32_values")) { + param_.fp32_values = op_desc.GetAttr>("fp32_values"); + } + if (op_desc.HasAttr("int32_values")) { + param_.int32_values = op_desc.GetAttr>("int32_values"); + } + if (op_desc.HasAttr("int64_values")) { + param_.int64_values = op_desc.GetAttr>("int64_values"); + } + if (op_desc.HasAttr("bool_values")) { + param_.bool_values = op_desc.GetAttr>("bool_values"); + } auto out = op_desc.Output("Out").front(); param_.Out = scope->FindVar(out)->GetMutable(); return true; diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 9c97ce4ed27a3a04d9eee6ef00980f450725a04d..9a72834ff1052b9bbd92e933dff90163c7e85475 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -1338,6 +1338,8 @@ struct AssignValueParam : ParamBase { int dtype{}; std::vector fp32_values{}; std::vector int32_values{}; + std::vector int64_values{}; + std::vector bool_values{}; lite::Tensor* Out{}; };