From eb3050fa9a7ccccf616836e73bf58ef2f25d3e7d Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Sat, 20 Feb 2021 13:47:28 +0800
Subject: [PATCH] [ROCM] update fluid inference for rocm (part1), test=develop
 (#31018)

---
 paddle/fluid/inference/api/analysis_config.cc          | 12 ++++++------
 paddle/fluid/inference/api/analysis_predictor.cc       |  6 +++---
 .../fluid/inference/api/analysis_predictor_tester.cc   |  4 ++--
 paddle/fluid/inference/api/api_impl.cc                 |  2 +-
 paddle/fluid/inference/api/api_impl_tester.cc          |  2 +-
 paddle/fluid/inference/api/demo_ci/vis_demo.cc         |  2 +-
 .../fluid/inference/api/details/zero_copy_tensor.cc    |  9 ++++++---
 paddle/fluid/inference/api/paddle_pass_builder.cc      |  3 +++
 paddle/fluid/inference/lite/engine.cc                  |  2 +-
 paddle/fluid/inference/lite/tensor_utils.cc            |  2 +-
 paddle/fluid/inference/lite/test_engine_lite.cc        | 10 +++++-----
 paddle/fluid/inference/lite/test_tensor_utils.cc       |  6 +++---
 .../inference/tests/api/analyzer_ernie_tester.cc       |  2 +-
 .../fluid/inference/tests/api/lite_mul_model_test.cc   |  2 +-
 paddle/fluid/inference/tests/test_helper.h             |  2 +-
 15 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 7eb1bb1a24e..0622fb27d9e 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -18,7 +18,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 DECLARE_uint64(initial_gpu_memory_in_mb);
 #endif
 
@@ -71,7 +71,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path,
 }
 void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
                                   int device_id) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   use_gpu_ = true;
   memory_pool_init_size_mb_ = memory_pool_init_size_mb;
   FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_;
@@ -214,7 +214,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 }
 
 void AnalysisConfig::EnableCUDNN() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   use_cudnn_ = use_gpu_;
 #else
   LOG(ERROR) << "Please compile with CUDA first to use cuDNN";
@@ -288,7 +288,7 @@ void AnalysisConfig::EnableTensorRtEngine(
     int workspace_size, int max_batch_size, int min_subgraph_size,
     AnalysisConfig::Precision precision_mode, bool use_static,
     bool use_calib_mode) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!use_gpu()) {
     LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
     return;
   }
@@ -384,7 +384,7 @@ void AnalysisConfig::Update() {
     }
   }
   if (use_gpu() && use_cudnn_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (!enable_ir_optim_) {
       LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled.";
     } else {
@@ -526,7 +526,7 @@ void AnalysisConfig::SetCpuMathLibraryNumThreads(
 }
 
 float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // Get the GPU memory details and calculate the fraction of memory for the
   // GPU memory pool.
   size_t gpu_total, gpu_available;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 81c68a65576..215335bf8c6 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -107,7 +107,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
     PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), false,
                       platform::errors::InvalidArgument(
                           "Only one choice can be made between CPU and XPU."));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto *dev_ctx =
         static_cast(pool.Get(place));
@@ -192,7 +192,7 @@ bool AnalysisPredictor::PrepareScope(
       paddle::framework::InitDevices();
       scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) {
         delete scope;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         for (int dev_id = 0; dev_id < paddle::platform::GetCUDADeviceCount();
              ++dev_id) {
           memory::Release(platform::CUDAPlace(dev_id));
@@ -244,7 +244,7 @@ bool AnalysisPredictor::CreateExecutor() {
                       platform::errors::InvalidArgument(
                           "Only one choice can be made between CPU and XPU."));
     place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (config_.thread_local_stream_enabled()) {
       auto *ctx = static_cast(
           platform::DeviceContextPool::Instance().Get(place_));
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index f6c66c2b003..464db9d4d3e 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -63,7 +63,7 @@ TEST(AnalysisPredictor, analysis_on) {
   AnalysisConfig config;
   config.SetModel(FLAGS_dirname);
   config.SwitchIrOptim(true);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   config.EnableUseGpu(100, 0);
 #else
   config.DisableGpu();
@@ -486,7 +486,7 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
 }
 #endif
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
   AnalysisConfig config;
   config.SetModel(FLAGS_dirname);
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 74885ca5ece..6930b3bd2e9 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -242,7 +242,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs,
                         platform::is_xpu_place(place_), false,
                         platform::errors::InvalidArgument(
                             "Only one choice can be made between CPU and XPU."));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       platform::DeviceContextPool &pool =
           platform::DeviceContextPool::Instance();
       auto *dev_ctx =
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index 00efbb528ae..e3fad1fec06 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -297,7 +297,7 @@ TEST(inference_api_native, image_classification_xpu) {
 }
 #endif
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(inference_api_native, word2vec_gpu) {
   MainWord2Vec(paddle::PaddlePlace::kGPU);
 }
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index 293c90c2028..0b3257da92c 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "utils.h" // NOLINT
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 DECLARE_double(fraction_of_gpu_memory_to_use);
 #endif
 DEFINE_string(modeldir, "", "Directory of the inference model.");
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index a364135aa75..0ed7476bb61 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -116,7 +116,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
     auto *t_data = tensor->mutable_data(platform::CPUPlace());
     std::memcpy(static_cast(t_data), data, ele_size);
   } else if (place_ == PaddlePlace::kGPU) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     platform::CUDAPlace gpu_place(device_);
     auto *t_data = tensor->mutable_data(gpu_place);
@@ -155,15 +155,18 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
   if (platform::is_cpu_place(t_place)) {
     std::memcpy(static_cast(data), t_data, ele_num * sizeof(T));
   } else if (place_ == PaddlePlace::kGPU) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
     auto *dev_ctx =
         static_cast(pool.Get(gpu_place));
     memory::Copy(platform::CPUPlace(), static_cast(data), gpu_place, t_data,
                  ele_num * sizeof(T), dev_ctx->stream());
-
+#ifdef PADDLE_WITH_HIP
+    hipStreamSynchronize(dev_ctx->stream());
+#else
     cudaStreamSynchronize(dev_ctx->stream());
+#endif
 #else
     PADDLE_THROW(platform::errors::Unavailable(
         "Not compile with CUDA, should not reach here."));
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index e5c4f3ee4b0..4d40334cbc0 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -16,6 +16,9 @@
 #ifdef PADDLE_WITH_CUDA
 #include 
 #endif
+#ifdef PADDLE_WITH_HIP
+#include 
+#endif
 #include 
 #include 
 
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index 478ef892ebd..59a786e46c9 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #define LITE_WITH_CUDA 1
 #endif
 
diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc
index 25d046f511c..cbc947ea643 100644
--- a/paddle/fluid/inference/lite/tensor_utils.cc
+++ b/paddle/fluid/inference/lite/tensor_utils.cc
@@ -123,7 +123,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
   if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) {
     memory::Copy(cpu_place, dst_data, cpu_place, src_data, size);
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (platform::is_cpu_place(dst_place) &&
         platform::is_gpu_place(src_place)) {
       PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc
index 8e65fa2fbe3..080622899eb 100644
--- a/paddle/fluid/inference/lite/test_engine_lite.cc
+++ b/paddle/fluid/inference/lite/test_engine_lite.cc
@@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) {
   *block_->add_ops() = *fetch->Proto();
 
   framework::Scope scope;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace place;
   platform::CUDADeviceContext ctx(place);
 #else
@@ -102,11 +102,11 @@ TEST(EngineManager, engine) {
   const std::string unique_key("engine_0");
   config.model_from_memory = true;
   config.valid_places = {
-#ifdef PADDLE_WITH_CUDA
-     paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
 #endif
-     paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
-     paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
+      paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
   };
 
   LOG(INFO) << "Create EngineManager";
diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc
index a792fb77d6a..a8ed703da95 100644
--- a/paddle/fluid/inference/lite/test_tensor_utils.cc
+++ b/paddle/fluid/inference/lite/test_tensor_utils.cc
@@ -115,7 +115,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
   // Copy to LoDTensor.
   framework::LoDTensor lod_tensor_n;
   TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(ctx.GetPlace())) {
     platform::GpuStreamSync(
         static_cast(ctx).stream());
@@ -151,7 +151,7 @@ TEST(LiteEngineOp, TensorCopyAsync) {
   auto* ctx_cpu =
       platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
   test_tensor_copy(*ctx_cpu);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto* ctx_gpu =
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0));
   test_tensor_copy(*ctx_gpu);
@@ -162,7 +162,7 @@ TEST(LiteEngineOp, TensorShare) {
   auto* ctx_cpu =
       platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
   test_tensor_share(*ctx_cpu);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto* ctx_gpu =
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0));
   test_tensor_share(*ctx_gpu);
diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc
index 87c8d783160..0c2a140023e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc
@@ -163,7 +163,7 @@ TEST(Analyzer_ernie, profile_mkldnn) { profile(true, false); }
 #endif
 
 // Check the model by gpu
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(Analyzer_ernie, profile_gpu) { profile(false, true); }
 #endif
 
diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
index 2c5f1583dce..6d4bb70df6f 100644
--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -118,7 +118,7 @@ TEST(AnalysisPredictor, lite_xpu) {
 }
 #endif
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(AnalysisPredictor, thread_local_stream) {
   const size_t thread_num = 5;
   std::vector threads(thread_num);
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 1f6c8213523..fc2c6a030a6 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -168,7 +168,7 @@ void TestInference(const std::string& dirname,
   if (paddle::platform::is_cpu_place(place)) {
     state = paddle::platform::ProfilerState::kCPU;
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     state = paddle::platform::ProfilerState::kAll;
     // The default device_id of paddle::platform::CUDAPlace is 0.
     // Users can get the device_id using:
-- 
GitLab
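
The recurring change in this patch is to widen each GPU-only preprocessor guard from "#ifdef PADDLE_WITH_CUDA" to "#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)" so the same code paths build under ROCm, and, where a CUDA runtime call is made directly (as in zero_copy_tensor.cc), to dispatch to the HIP equivalent. A minimal sketch of that guard-and-dispatch pattern follows. It is not part of the patch; the SyncGpuStream helper and gpuStream_t alias are illustrative names invented here, not Paddle APIs.

// Illustrative sketch only -- not from the patch. It shows the pattern the
// patch applies: one guard that holds for either CUDA or ROCm builds, plus
// an inner #ifdef to pick the matching runtime call.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;   // hypothetical alias for illustration
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;  // hypothetical alias for illustration
#endif

// Block the host until all work queued on the stream has finished,
// regardless of whether the build targets CUDA or ROCm.
inline void SyncGpuStream(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  hipStreamSynchronize(stream);
#else
  cudaStreamSynchronize(stream);
#endif
}

#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP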