Unverified · Commit eb3050fa · Authored by Qi Li · Committed by GitHub

[ROCM] update fluid inference for rocm (part1), test=develop (#31018)

Parent: 6df1ca54
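Summary: this patch widens the inference code's CUDA-only preprocessor guards so the same sources also compile for AMD ROCm builds, where PADDLE_WITH_HIP is defined instead of PADDLE_WITH_CUDA. GPU-only branches now use `#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)`, while calls that actually differ between the two runtimes stay behind narrower per-backend guards. A minimal sketch of the pattern (the helper name is illustrative, not part of the patch):

// Sketch only: shared GPU code path with backend-specific calls nested inside.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

void WarmUpGpu() {  // hypothetical helper for illustration
#ifdef PADDLE_WITH_HIP
  hipDeviceSynchronize();   // ROCm runtime
#else
  cudaDeviceSynchronize();  // CUDA runtime
#endif
}
#endif  // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)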
@@ -18,7 +18,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 DECLARE_uint64(initial_gpu_memory_in_mb);
 #endif
@@ -71,7 +71,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path,
 }
 void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
                                   int device_id) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   use_gpu_ = true;
   memory_pool_init_size_mb_ = memory_pool_init_size_mb;
   FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_;
@@ -214,7 +214,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 }
 void AnalysisConfig::EnableCUDNN() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   use_cudnn_ = use_gpu_;
 #else
   LOG(ERROR) << "Please compile with CUDA first to use cuDNN";
@@ -288,7 +288,7 @@ void AnalysisConfig::EnableTensorRtEngine(
     int workspace_size, int max_batch_size, int min_subgraph_size,
     AnalysisConfig::Precision precision_mode, bool use_static,
     bool use_calib_mode) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!use_gpu()) {
     LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
     return;
@@ -384,7 +384,7 @@ void AnalysisConfig::Update() {
     }
   }
   if (use_gpu() && use_cudnn_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (!enable_ir_optim_) {
       LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled.";
     } else {
@@ -526,7 +526,7 @@ void AnalysisConfig::SetCpuMathLibraryNumThreads(
 }
 float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // Get the GPU memory details and calculate the fraction of memory for the
   // GPU memory pool.
   size_t gpu_total, gpu_available;
......
@@ -107,7 +107,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
   PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), false,
                     platform::errors::InvalidArgument(
                         "Only one choice can be made between CPU and XPU."));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *dev_ctx =
       static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
@@ -192,7 +192,7 @@ bool AnalysisPredictor::PrepareScope(
   paddle::framework::InitDevices();
   scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) {
     delete scope;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     for (int dev_id = 0; dev_id < paddle::platform::GetCUDADeviceCount();
          ++dev_id) {
       memory::Release(platform::CUDAPlace(dev_id));
@@ -244,7 +244,7 @@ bool AnalysisPredictor::CreateExecutor() {
                       platform::errors::InvalidArgument(
                           "Only one choice can be made between CPU and XPU."));
     place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (config_.thread_local_stream_enabled()) {
       auto *ctx = static_cast<platform::CUDADeviceContext *>(
           platform::DeviceContextPool::Instance().Get(place_));
......
@@ -63,7 +63,7 @@ TEST(AnalysisPredictor, analysis_on) {
   AnalysisConfig config;
   config.SetModel(FLAGS_dirname);
   config.SwitchIrOptim(true);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   config.EnableUseGpu(100, 0);
 #else
   config.DisableGpu();
@@ -486,7 +486,7 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
 }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
   AnalysisConfig config;
   config.SetModel(FLAGS_dirname);
......
@@ -242,7 +242,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                       platform::is_xpu_place(place_), false,
                       platform::errors::InvalidArgument(
                           "Only one choice can be made between CPU and XPU."));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     platform::DeviceContextPool &pool =
         platform::DeviceContextPool::Instance();
     auto *dev_ctx =
......
@@ -297,7 +297,7 @@ TEST(inference_api_native, image_classification_xpu) {
 }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(inference_api_native, word2vec_gpu) {
   MainWord2Vec(paddle::PaddlePlace::kGPU);
 }
......
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "utils.h"  // NOLINT
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 DECLARE_double(fraction_of_gpu_memory_to_use);
 #endif
 DEFINE_string(modeldir, "", "Directory of the inference model.");
......
@@ -116,7 +116,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
     auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
     std::memcpy(static_cast<void *>(t_data), data, ele_size);
   } else if (place_ == PaddlePlace::kGPU) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     platform::CUDAPlace gpu_place(device_);
     auto *t_data = tensor->mutable_data<T>(gpu_place);
@@ -155,15 +155,18 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
   if (platform::is_cpu_place(t_place)) {
     std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
   } else if (place_ == PaddlePlace::kGPU) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
     auto *dev_ctx =
         static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
     memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
                  t_data, ele_num * sizeof(T), dev_ctx->stream());
+#ifdef PADDLE_WITH_HIP
+    hipStreamSynchronize(dev_ctx->stream());
+#else
     cudaStreamSynchronize(dev_ctx->stream());
+#endif
 #else
     PADDLE_THROW(platform::errors::Unavailable(
         "Not compile with CUDA, should not reach here."));
......
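The copy_to_cpu change above synchronizes the copy stream with whichever runtime the binary was built against: hipStreamSynchronize on ROCm, cudaStreamSynchronize on CUDA. A standalone sketch of that branch, assuming a raw stream handle is available (the helper name is an assumption, not Paddle's API; Paddle's own platform::GpuStreamSync serves a similar purpose elsewhere in this patch):

// Sketch only: block until work queued on a GPU stream has finished,
// dispatching to the HIP or CUDA runtime at compile time.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpu_stream_t = hipStream_t;
inline void SyncStream(gpu_stream_t stream) { hipStreamSynchronize(stream); }
#else
#include <cuda_runtime.h>
using gpu_stream_t = cudaStream_t;
inline void SyncStream(gpu_stream_t stream) { cudaStreamSynchronize(stream); }
#endif
#endif  // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)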
@@ -16,6 +16,9 @@
 #ifdef PADDLE_WITH_CUDA
 #include <cudnn.h>
 #endif
+#ifdef PADDLE_WITH_HIP
+#include <miopen/miopen.h>
+#endif
 #include <glog/logging.h>
 #include <sstream>
......
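The include hunk above pulls in MIOpen, ROCm's counterpart to cuDNN, alongside the existing cuDNN include. A hedged sketch of how such a build might report which DNN library it was compiled against, assuming the standard version macros from cudnn.h and miopen/version.h (the helper is illustrative, not part of the patch):

// Sketch only: name and version of the DNN library selected at build time.
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <miopen/version.h>
#endif
#include <sstream>
#include <string>

std::string DnnLibraryVersion() {  // hypothetical helper
  std::ostringstream os;
#if defined(PADDLE_WITH_CUDA)
  os << "cuDNN " << CUDNN_MAJOR << "." << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL;
#elif defined(PADDLE_WITH_HIP)
  os << "MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "."
     << MIOPEN_VERSION_PATCH;
#else
  os << "no GPU DNN library";
#endif
  return os.str();
}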
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #define LITE_WITH_CUDA 1
 #endif
......
@@ -123,7 +123,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
   if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) {
     memory::Copy(cpu_place, dst_data, cpu_place, src_data, size);
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (platform::is_cpu_place(dst_place) &&
         platform::is_gpu_place(src_place)) {
       PADDLE_THROW(platform::errors::Unimplemented(
......
@@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) {
   *block_->add_ops() = *fetch->Proto();
   framework::Scope scope;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace place;
   platform::CUDADeviceContext ctx(place);
 #else
@@ -102,11 +102,11 @@ TEST(EngineManager, engine) {
   const std::string unique_key("engine_0");
   config.model_from_memory = true;
   config.valid_places = {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
 #endif
       paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
       paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
   };
   LOG(INFO) << "Create EngineManager";
......
@@ -115,7 +115,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
   // Copy to LoDTensor.
   framework::LoDTensor lod_tensor_n;
   TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(ctx.GetPlace())) {
     platform::GpuStreamSync(
         static_cast<const platform::CUDADeviceContext&>(ctx).stream());
@@ -151,7 +151,7 @@ TEST(LiteEngineOp, TensorCopyAsync) {
   auto* ctx_cpu =
       platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
   test_tensor_copy(*ctx_cpu);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto* ctx_gpu =
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0));
   test_tensor_copy(*ctx_gpu);
@@ -162,7 +162,7 @@ TEST(LiteEngineOp, TensorShare) {
   auto* ctx_cpu =
       platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
   test_tensor_share(*ctx_cpu);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto* ctx_gpu =
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0));
   test_tensor_share(*ctx_gpu);
......
@@ -163,7 +163,7 @@ TEST(Analyzer_ernie, profile_mkldnn) { profile(true, false); }
 #endif
 // Check the model by gpu
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(Analyzer_ernie, profile_gpu) { profile(false, true); }
 #endif
......
@@ -118,7 +118,7 @@ TEST(AnalysisPredictor, lite_xpu) {
 }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(AnalysisPredictor, thread_local_stream) {
   const size_t thread_num = 5;
   std::vector<std::thread> threads(thread_num);
......
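The thread_local_stream test above drives the predictor from several threads; together with the CreateExecutor() hunk earlier, each thread then works on its own CUDA/HIP stream when thread-local streams are enabled on the config. A hedged usage sketch, assuming the AnalysisConfig switch is EnableGpuMultiStream() (treat that name and the include path as assumptions for your Paddle version):

// Sketch only: one predictor per thread, with thread-local GPU streams.
#include <string>
#include <thread>
#include <vector>
#include "paddle_inference_api.h"  // include path may differ per install

void RunMultiThreaded(const std::string& model_dir) {
  paddle::AnalysisConfig config;
  config.SetModel(model_dir);
  config.EnableUseGpu(100 /*MB*/, 0 /*device_id*/);
  config.EnableGpuMultiStream();  // assumed switch behind thread_local_stream_enabled()
  const size_t thread_num = 5;
  std::vector<std::thread> threads;
  for (size_t i = 0; i < thread_num; ++i) {
    threads.emplace_back([&config] {
      auto predictor = paddle::CreatePaddlePredictor(config);
      // ... fill inputs and call predictor->Run(...) here ...
    });
  }
  for (auto& t : threads) t.join();
}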
@@ -168,7 +168,7 @@ void TestInference(const std::string& dirname,
   if (paddle::platform::is_cpu_place(place)) {
     state = paddle::platform::ProfilerState::kCPU;
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     state = paddle::platform::ProfilerState::kAll;
     // The default device_id of paddle::platform::CUDAPlace is 0.
     // Users can get the device_id using:
......