diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 9b4843f714f11484860056711fd223edc8a5d037..192a6414260ce06048b8c765402d89882cabc51b 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include <glog/logging.h> #include <gtest/gtest.h> #include <memory> +#include <thread> #include "paddle/contrib/inference/paddle_inference_api.h" - namespace paddle { namespace demo { @@ -61,13 +61,67 @@ void Main(bool use_gpu) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) { LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i]; } + // TODO(Superjomn): this is should be free automatically + free(outputs[0].data.data); + } +} + +void MainThreads(int num_threads, bool use_gpu) { + // Multi-threads only support on CPU + // 0. Create PaddlePredictor with a config. + NativeConfig config; + config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto main_predictor = + CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); + + std::vector<std::thread> threads; + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // 1. clone a predictor which shares the same parameters + auto predictor = main_predictor->Clone(); + constexpr int num_batches = 3; + for (int batch_id = 0; batch_id < num_batches; ++batch_id) { + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleBuf buf{.data = data, .length = sizeof(data)}; + PaddleTensor tensor{.name = "", + .shape = std::vector<int>({4, 1}), + .data = buf, + .dtype = PaddleDType::INT64}; + std::vector<PaddleTensor> inputs(4, tensor); + std::vector<PaddleTensor> outputs; + // 3. Run + CHECK(predictor->Run(inputs, &outputs)); + + // 4. Get output. + ASSERT_EQ(outputs.size(), 1UL); + LOG(INFO) << "TID: " << tid << ", " + << "output buffer size: " << outputs.front().data.length; + const size_t num_elements = outputs.front().data.length / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i]; + } + free(outputs[0].data.data); + } + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); } } TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); } #ifdef PADDLE_WITH_CUDA TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); } #endif } // namespace demo diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 1f960677163988be6f4c502738861bf86588f406..4b6cb7b051d1ad2c63e895017c7faf1245c22612 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include <glog/logging.h> #include <gtest/gtest.h> +#include <thread> + #include "gflags/gflags.h" #include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -45,14 +47,19 @@ NativeConfig GetConfig() { config.model_dir = FLAGS_dirname + "word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; +#ifdef PADDLE_WITH_CUDA config.use_gpu = true; +#else + config.use_gpu = false; +#endif config.device = 0; return config; } -TEST(paddle_inference_api_impl, word2vec) { +void MainWord2Vec(bool use_gpu) { NativeConfig config = GetConfig(); auto predictor = CreatePaddlePredictor<NativeConfig>(config); + config.use_gpu = use_gpu; framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoD lod{{0, 1}}; @@ -100,11 +107,12 @@ TEST(paddle_inference_api_impl, word2vec) { free(outputs[0].data.data); } -TEST(paddle_inference_api_impl, image_classification) { +void MainImageClassification(bool use_gpu) { int batch_size = 2; bool use_mkldnn = false; bool repeat = false; NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; config.model_dir = FLAGS_dirname + "image_classification_resnet.inference.model"; @@ -149,4 +157,143 @@ TEST(paddle_inference_api_impl, image_classification) { free(data); } +void MainThreadsWord2Vec(bool use_gpu) { + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + auto main_predictor = CreatePaddlePredictor<NativeConfig>(config); + + // prepare inputs data and reference results + constexpr int num_jobs = 3; + std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs); + std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs); + std::vector<framework::LoDTensor> refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // each job has 4 words + jobs[i].resize(4); + for (size_t j = 0; j < 4; ++j) { + framework::LoD lod{{0, 1}}; + int64_t dict_size = 2073; // The size of dictionary + SetupLoDTensor(&jobs[i][j], lod, static_cast<int64_t>(0), dict_size - 1); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j])); + } + + // get reference result of each job + std::vector<paddle::framework::LoDTensor*> ref_feeds; + std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]); + for (auto& word : jobs[i]) { + ref_feeds.push_back(&word); + } + TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector<std::thread> threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector<PaddleTensor> local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs range + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length; + float* data = static_cast<float*>(local_outputs[0].data.data); + for (size_t j = 0; j < len / sizeof(float); ++j) { + ASSERT_LT(data[j], 1.0); + ASSERT_GT(data[j], -1.0); + } + + // check outputs correctness + float* ref_data = refs[tid].data<float>(); + EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float))); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], 1e-3); + } + free(data); + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +void MainThreadsImageClassification(bool use_gpu) { + constexpr int num_jobs = 4; // each job run 1 batch + constexpr int batch_size = 1; + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + config.model_dir = + FLAGS_dirname + "image_classification_resnet.inference.model"; + + auto main_predictor = CreatePaddlePredictor<NativeConfig>(config); + std::vector<framework::LoDTensor> jobs(num_jobs); + std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs); + std::vector<framework::LoDTensor> refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // prepare inputs + std::vector<std::vector<int64_t>> feed_target_shapes = + GetFeedTargetShapes(config.model_dir, /*is_combined*/ false); + feed_target_shapes[0][0] = batch_size; + framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); + SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i])); + + // get reference result of each job + std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]); + std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]); + TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector<std::thread> threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector<PaddleTensor> local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs correctness + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length; + float* data = static_cast<float*>(local_outputs[0].data.data); + float* ref_data = refs[tid].data<float>(); + EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], 1e-3); + } + free(data); + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); } +TEST(inference_api_native, word2vec_cpu_threads) { + MainThreadsWord2Vec(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu) { + MainThreadsImageClassification(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu_threads) { + MainThreadsImageClassification(false /*use_gpu*/); +} + +#ifdef PADDLE_WITH_CUDA +TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); } +TEST(inference_api_native, word2vec_gpu_threads) { + MainThreadsWord2Vec(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu) { + MainThreadsImageClassification(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu_threads) { + MainThreadsImageClassification(true /*use_gpu*/); +} + +#endif + } // namespace paddle