Commit c407dfa3 authored by nhzlx

cherry-pick from feature/anakin-engine: refine paddle-anakin to new interface. #16276

Parent a25331bc
```diff
@@ -79,7 +79,7 @@ pass_library(anakin_fillconstant_elementwisemul_fuse inference)
 # be detected by our pass. The index here represents the number of structures in the
 # pattern. We use index 3 ~ 6, because these quantities of structures are
 # common in the models.
-foreach (index RANGE 3 6)
+foreach (index RANGE 2 6)
   file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
 endforeach()
```
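For reference, a sketch of what the foreach above now appends to the generated ${pass_file} (one USE_PASS declaration per supported pattern size); widening the range from 3 ~ 6 to 2 ~ 6 adds the 2-branch variant:

```cpp
// Appended to ${pass_file} by the foreach above with RANGE 2 6.
USE_PASS(transpose_flatten2_concat_fuse_pass);
USE_PASS(transpose_flatten3_concat_fuse_pass);
USE_PASS(transpose_flatten4_concat_fuse_pass);
USE_PASS(transpose_flatten5_concat_fuse_pass);
USE_PASS(transpose_flatten6_concat_fuse_pass);
```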
```diff
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
@@ -123,6 +125,7 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
 }
 template class TransposeFlattenConcatFusePass<1>;
+template class TransposeFlattenConcatFusePass<2>;
 template class TransposeFlattenConcatFusePass<3>;
 template class TransposeFlattenConcatFusePass<4>;
 template class TransposeFlattenConcatFusePass<5>;
@@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>;
 REGISTER_PASS(transpose_flatten_concat_fuse_pass,
               paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
+REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
+              paddle::framework::ir::TransposeFlattenConcatFusePass<2>);
 REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
               paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
```
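With the new <2> instantiation and its REGISTER_PASS entry, the 2-branch variant can be fetched by name like the existing ones. A hedged usage sketch (the PassRegistry lookup follows the framework's usual pattern and is not part of this diff; `graph` is assumed to be a std::unique_ptr<ir::Graph>):

```cpp
// Hypothetical call site, not part of this commit: retrieve the newly
// registered pass by name and apply it to an IR graph.
auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
    "transpose_flatten2_concat_fuse_pass");
graph = pass->Apply(std::move(graph));  // fuses 2 transpose->flatten branches
                                        // feeding one concat into a single op
```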
```diff
@@ -117,27 +117,14 @@ class AnakinOpConverter {
       }
       temp_max_input_shape[input] = input_shape;
       engine->SetInputShape(input, input_shape);
-      // engine->Graph()->RegistVar(input); // For share from data.
+      engine->Graph()->RegistVar(input); // For share from data.
     }
     engine->SetMaxInputShape(temp_max_input_shape);
     engine->Optimize();
-    // For anakin share with fluid tensor.
-    engine->AllocTmpMem();
     engine->InitGraph();
-    /*
-    for(auto& input : inputs) {
-      platform::CUDAPlace gpu_place(engine->GetDevice());
-      auto input_var = scope->Var();
-      auto input_tensor = input_var->GetMutable<framework::LoDTensor>();
-      auto input_max_shape = temp_max_input_shape[input];
-      input_tensor->Resize(framework::make_ddim(input_max_shape));
-      auto input_data = input_tensor->mutable_data<float>(gpu_place);
-      auto* anakin_input = engine->Net()->get_in(input);
-      ::anakin::saber::Tensor<::anakin::saber::NV> tmp_anakin_tensor(input_data,
-          ::anakin::saber::NV(), 0, input_max_shape);
-      anakin_input->share_from(tmp_anakin_tensor);
-    }
-    */
   }
   void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
```
```diff
@@ -97,15 +97,14 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
       anakin_input = net_->get_in(input.first);
     }
-    */
     anakin_input->reshape(fluid_input_shape);
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
                                                        fluid_input_shape);
     anakin_input->copy_from(tmp_anakin_tensor);
   }
-  cudaDeviceSynchronize();
   net_->prediction();
+  cudaDeviceSynchronize();
   for (const auto &output : outputs) {
     platform::CUDAPlace gpu_place(device_);
     auto *tensor = output.second;
```
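For orientation, a minimal, hypothetical call-site sketch for Execute as declared in this diff; the engine instance, tensors, and the names "x" and "out" are illustrative assumptions, not part of the commit:

```cpp
// Hypothetical usage; only Execute's signature is taken from this diff.
std::map<std::string, framework::LoDTensor *> in_map{{"x", &x_tensor}};
std::map<std::string, framework::LoDTensor *> out_map{{"out", &out_tensor}};
cudaStream_t stream;
cudaStreamCreate(&stream);
// Reshapes each Anakin input to the fluid shape, copies the data in,
// runs net_->prediction(), then copies the outputs back.
engine->Execute(in_map, out_map, stream);
cudaStreamDestroy(stream);
```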
```diff
@@ -84,17 +84,20 @@ class AnakinEngine {
   int GetMaxBatchSize() { return max_batch_size_; }
   void Freeze();
   void Optimize();
-  void AllocTmpMem() {
-    PADDLE_ENFORCE(net_->alloc_memory_first(*graph_),
-                   "anakin alloc temp memory first failed");
-  }
   void Save(std::string path) { graph_->save(path); }
+  bool IsInit() { return initialized_; }
   int GetDevice() { return device_; }
+  // void SaveSerializedData(std::string& data) { graph_->save_to_string(data);
+  // }
+  // void LoadSerializedData(const std::string& data) {
+  //   graph_->load_from_string(data); }
   void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
                const std::map<std::string, framework::LoDTensor *> &outputs,
                cudaStream_t stream);
  private:
+  bool initialized_{false};
   int max_batch_size_;
   std::map<std::string, std::vector<int>> max_input_shape_;
   int device_;
```
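The new IsInit()/initialized_ members appear intended to defer one-time net setup to the first prediction (see the commented-out block in the engine op further below). A hedged sketch of that guard; how initialized_ gets flipped is not shown in this diff:

```cpp
// Sketch only: the real first-run path is still disabled in this commit
// ("BUG here ... Will fix it later").
if (!engine->IsInit()) {
  // bind or share input buffers once, then build the Anakin net
  engine->InitGraph();
}
engine->Execute(inputs, outputs, stream);
```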
```diff
@@ -99,7 +99,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
 #endif
   });
-  for (int i = 6; i >= 3; i--) {
+  for (int i = 6; i >= 2; i--) {
     passes_.push_back("transpose_flatten" + std::to_string(i) +
                       "_concat_fuse_pass");
   }
```
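Together with the CMake and REGISTER_PASS changes above, the GPU pass list now also requests the 2-branch fusion. Unrolled, the loop above pushes:

```cpp
// Equivalent unrolled form of the loop above (i = 6 ... 2); the last entry,
// "transpose_flatten2_concat_fuse_pass", is new with this change.
passes_.push_back("transpose_flatten6_concat_fuse_pass");
passes_.push_back("transpose_flatten5_concat_fuse_pass");
passes_.push_back("transpose_flatten4_concat_fuse_pass");
passes_.push_back("transpose_flatten3_concat_fuse_pass");
passes_.push_back("transpose_flatten2_concat_fuse_pass");
```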
```diff
@@ -97,23 +97,7 @@ class AnakinEngineOp : public framework::OperatorBase {
       if (param_names_.count(x)) continue;
       auto &t =
           inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
-      /*
-      auto t_shape = framework::vectorize(t.dims());
-      auto *anakin_input = engine->Net()->get_in(x);
-      auto net_shape = anakin_input->shape();
-      size_t anakin_net_input_size = net_shape.count() * sizeof(float);
-      size_t fluid_input_size = t.memory_size();
-      if (fluid_input_size < anakin_net_input_size) {
-        framework::LoDTensor temp_t;
-        auto t_dims = t.dims();
-        temp_t.Resize(t_dims);
-        TensorCopySync(t, dev_place, &temp_t);
-        t.Resize(framework::make_ddim(net_shape));
-        t.mutable_data<float>(dev_place);
-        TensorCopySync(temp_t, dev_place, &t);
-      }
-      */
       inputs.insert({x, &t});
     }
@@ -136,6 +120,41 @@ class AnakinEngineOp : public framework::OperatorBase {
           inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
               .Get(engine_key_);
     }
+    // BUG here, detect that the tensor data pointer here will change sometime.
+    // Will fix it later.
+    /*
+    // For share with the tensor from fluid, We do the net init in the first net
+    precit.
+    if (!anakin_engine_->IsInit()) {
+      auto temp_max_input_shape = anakin_engine_->GetMaxInputShape();
+      anakin_engine_->AllocTmpMem();
+      for(auto& input : Inputs("Xs")) {
+        if (param_names_.count(input)) continue;
+        platform::CUDAPlace
+            gpu_place(boost::get<platform::CUDAPlace>(dev_place).device);
+        auto *input_var = scope.FindVar(input);
+        auto input_tensor = input_var->GetMutable<framework::LoDTensor>();
+        auto input_max_shape = temp_max_input_shape[input];
+        framework::LoDTensor temp_t;
+        auto t_dims = input_tensor->dims();
+        temp_t.Resize(t_dims);
+        TensorCopySync(*input_tensor, dev_place, &temp_t);
+        input_tensor->Resize(framework::make_ddim(input_max_shape));
+        input_tensor->mutable_data<float>(dev_place);
+        TensorCopySync(temp_t, dev_place, input_tensor);
+        auto* input_data = input_tensor->mutable_data<float>(gpu_place);
+        auto* anakin_input = anakin_engine_->Net()->get_in(input);
+        ::anakin::saber::Tensor<::anakin::saber::NV>
+            tmp_anakin_tensor(input_data,
+                              ::anakin::saber::NV(), 0, input_max_shape);
+        anakin_input->share_from(tmp_anakin_tensor);
+      }
+      anakin_engine_->InitGraph();
+    }
+    */
     return anakin_engine_;
   }
```