Unverified commit bf3161bd, authored by Pei Yang, committed by GitHub

fix emb_eltwise_ln gpu_id bug (#33701) (#33706)

Parent cdeffff4
@@ -421,7 +421,6 @@ void AnalysisConfig::Update() {
       pass_builder()->AppendPass(pass);
     }
   }
-  LOG(INFO) << "use_dlnne_:" << use_dlnne_ << std::endl;
   if (use_dlnne_) {
     pass_builder()->ClearPasses();
     for (const auto &pass : kDlnneSubgraphPasses) {
......
@@ -152,8 +152,8 @@ bool AnalysisPredictor::Init(
                                  : platform::ProfilerState::kCPU;
     platform::EnableProfiler(tracking_device);
   } else {
-    LOG(INFO) << "Profiler is deactivated, and no profiling report will be "
-                 "generated.";
+    VLOG(2) << "Profiler is deactivated, and no profiling report will be "
+               "generated.";
   }
   // no matter with or without MKLDNN
......
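The hunk above demotes a routine status message from LOG(INFO), which is always emitted, to VLOG(2), which prints only when glog verbosity is raised to 2 or higher (for example via GLOG_v=2 in the environment). A minimal standalone sketch of the difference, assuming standard glog behavior, which Paddle's logging macros wrap:

#include <glog/logging.h>

int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  LOG(INFO) << "always written to the INFO log";
  VLOG(2) << "written only when verbosity >= 2, e.g. GLOG_v=2 ./a.out";
  return 0;
}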
@@ -294,7 +294,7 @@ struct PD_INFER_DECL AnalysisConfig {
   /// workspace.
   /// \param max_batch_size The maximum batch size of this prediction task,
   /// better set as small as possible for less performance loss.
-  /// \param min_subgrpah_size The minimum TensorRT subgraph size needed, if a
+  /// \param min_subgraph_size The minimum TensorRT subgraph size needed, if a
   /// subgraph is smaller than this, it will not be transferred to TensorRT
   /// engine.
   /// \param precision The precision used in TensorRT.
......
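This hunk only fixes the doc-comment typo (min_subgrpah_size -> min_subgraph_size); the parameter itself belongs to AnalysisConfig::EnableTensorRtEngine. A hedged usage sketch follows; the argument order and defaults are assumed from this generation of the Paddle inference API, and the model path and include path are hypothetical:

#include "paddle/include/paddle_inference_api.h"

paddle::AnalysisConfig config;
config.SetModel("./ernie_model");              // hypothetical model directory
config.EnableUseGpu(100 /*initial MB*/, 0 /*device id*/);
// workspace_size, max_batch_size, min_subgraph_size, precision:
// subgraphs with fewer than 3 ops stay on the native executor
// instead of being offloaded to a TensorRT engine.
config.EnableTensorRtEngine(1 << 20, 1, 3,
                            paddle::AnalysisConfig::Precision::kFloat32,
                            false /*use_static*/, false /*use_calib_mode*/);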
@@ -134,7 +134,7 @@ int EmbEltwiseLayernormPluginDynamicImpl<T>::enqueue(
   int batch = id_dims.d[0];
   int seq_len = id_dims.d[1];
   int input_num = embs_.size();
+  cudaGetDevice(&device_id_);
   auto in_ptr_gpu_d =
       in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
   auto emb_ptr_gpu_d =
......
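This hunk is the gpu_id fix named in the commit title. device_id_ is a member cached earlier in the plugin's lifetime, so on a multi-GPU machine it can disagree with the device that is current on the thread executing enqueue, and mutable_data(platform::CUDAPlace(device_id_)) would then allocate on the wrong GPU. Re-querying with cudaGetDevice immediately before the allocations pins them to the device the engine is actually running on. A minimal sketch of the pattern, with the struct and buffer names hypothetical:

#include <cuda_runtime.h>

struct PluginLikeThing {
  int device_id_ = 0;  // cached at construction; may be stale at run time

  void enqueue() {
    // Refresh from the CUDA context of the calling thread: subsequent
    // allocations and copies now target the GPU running this engine.
    cudaGetDevice(&device_id_);
    void* scratch = nullptr;
    cudaMalloc(&scratch, 256);  // allocated on the just-queried device
    // ... launch kernels on this device ...
    cudaFree(scratch);
  }
};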
@@ -29,11 +29,6 @@ void run(const AnalysisConfig& config, std::vector<float>* out_data) {
   int run_batch = 1;
   const int run_seq_len = 128;
-  std::vector<int64_t> tmp_input;
-  std::vector<float> tmp_four_input;
-  tmp_input.reserve(run_batch * run_seq_len);
-  tmp_four_input.reserve(run_batch * run_seq_len);
   int64_t i0[run_seq_len] = {
       1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321,
       4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2,
......