// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "lite/kernels/cuda/search_seq_fc_compute.h" #include #include #include #include namespace paddle { namespace lite { namespace kernels { namespace cuda { template void search_seq_fc_compute_ref(const operators::SearchSeqFcParam& param) { auto x = param.x; auto w = param.w; auto b = param.b; auto out = param.out; auto out_size = param.out_size; const auto x_dims = x->dims(); const auto w_dims = w->dims(); const auto& x_lod = x->lod(); CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; const auto& x_lod_0 = x_lod[0]; CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) << "The Input(X)'s lod info mismatches the actual tensor shape."; CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; int M = x_dims[0]; int K = x_dims[1]; int N = w_dims[0]; auto x_data = x->data(); auto w_data = w->data(); auto out_data = out->mutable_data(); #pragma omp parallel for for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { auto sum = static_cast(0); for (int l = 0; l < K; l++) { T xv = x_data[i * K + l]; T wv = w_data[j * K + l]; sum += xv * wv; } out_data[i * N + j] = sum; } } if (b != nullptr) { auto b_dims = b->dims(); CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; auto b_data = b->data(); for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { out_data[i * N + j] += b_data[j]; } } } } TEST(search_seq_fc_compute, normal) { Env::Init(); for (auto x_lod_0 : {std::vector({0, 1, 3}), std::vector({0, 3, 4, 5})}) { for (auto feature_size : {2, 9}) { for (auto out_size : {3, 5}) { for (auto has_bias : {true, false}) { // infer x_dims, w_dims, b_dims and out_dims DDim x_dims({static_cast(x_lod_0.back()), feature_size}); DDim w_dims({out_size, feature_size}); DDim b_dims({has_bias ? out_size : 0}); DDim out_dims({static_cast(x_lod_0.back()), out_size}); LoD x_lod; x_lod.push_back(x_lod_0); LoD out_lod; out_lod.push_back(x_lod_0); // prepare input&output tensors Tensor x_dev, x_host, w_dev, w_host, b_dev, b_host, out_dev, out_host, out_ref; x_host.Resize(x_dims); w_host.Resize(w_dims); b_host.Resize(b_dims); out_host.Resize(out_dims); x_dev.Resize(x_dims); w_dev.Resize(w_dims); b_dev.Resize(b_dims); out_dev.Resize(out_dims); out_ref.Resize(out_dims); x_host.set_lod(x_lod); out_host.set_lod(out_lod); x_dev.set_lod(x_lod); out_dev.set_lod(out_lod); out_ref.set_lod(out_lod); auto out_dev_data = out_dev.mutable_data(TARGET(kCUDA)); auto x_host_data = x_host.mutable_data(); auto w_host_data = w_host.mutable_data(); auto out_host_data = out_host.mutable_data(); auto out_ref_data = out_ref.mutable_data(); for (int i = 0; i < x_host.dims().production(); i++) { x_host_data[i] = i * 0.125f; } for (int i = 0; i < w_host.dims().production(); i++) { w_host_data[i] = i * 0.5f; } x_dev.Assign(x_host_data, x_host.dims()); w_dev.Assign(w_host_data, w_host.dims()); // prepare cuda context, initialize param, and run kernel operators::SearchSeqFcParam param; param.x = &x_dev; param.w = &w_dev; param.out = &out_dev; param.out_size = out_size; if (has_bias) { auto b_host_data = b_host.mutable_data(); for (int i = 0; i < b_host.dims().production(); i++) { b_host_data[i] = i * 0.5f; } b_dev.Assign(b_host_data, b_host.dims()); param.b = &b_dev; } std::unique_ptr ctx(new KernelContext); auto& cuda_ctx = ctx->As(); cuda_ctx.InitOnce(); int dev_id = TargetWrapper::GetCurDevice(); cuda_ctx.Init(dev_id); SearchSeqFcCompute search_seq_fc; search_seq_fc.SetParam(param); search_seq_fc.SetContext(std::move(ctx)); search_seq_fc.Launch(); cudaDeviceSynchronize(); CopySync(out_host_data, out_dev_data, sizeof(float) * out_dev.dims().production(), IoDirection::DtoH); // run reference param.x = &x_host; param.w = &w_host; param.out = &out_ref; if (has_bias) { param.b = &b_host; } search_seq_fc_compute_ref(param); // verify result for (int i = 0; i < out_ref.dims().production(); i++) { EXPECT_NEAR(out_host_data[i], out_ref_data[i], 1e-5); } } } } } } } // namespace cuda } // namespace kernels } // namespace lite } // namespace paddle