// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/inference/anakin/engine.h" #include #include #include #include #include "paddle/fluid/framework/ddim.h" using anakin::Precision; using anakin::OpRunType; using paddle::framework::LoDTensor; template using AnakinNetT = anakin::Net; template using AnakinGraphT = anakin::graph::Graph; namespace paddle { namespace inference { namespace anakin { template AnakinEngine::AnakinEngine( bool need_summary, int device, int max_batch_size, std::map> max_input_shape, std::vector program_inputs) : graph_(new AnakinGraphT()), net_(new AnakinNetT(need_summary)) { device_ = device; max_batch_size_ = max_batch_size; max_input_shape_ = max_input_shape; program_inputs_ = program_inputs; } template AnakinEngine::~AnakinEngine() {} template void AnakinEngine::SetInputShape( const std::string &name, std::vector shape) { graph_->AddOpAttr<::anakin::PTuple>(name, "input_shape", std::move(shape)); } template void AnakinEngine::InitNet() { net_->init(*graph_); } template void AnakinEngine::AddOp( const std::string &name, const std::string &type, const std::vector &inputs, const std::vector &outputs) { PADDLE_ENFORCE(graph_->AddOp(name, type, inputs, outputs), "Add operation."); } template void AnakinEngine::BindInput( const std::map &inputs) { #ifdef PADDLE_WITH_CUDA cudaDeviceSynchronize(); #endif for (const auto &input : inputs) { auto *tensor = input.second; auto *data = tensor->data(); auto fluid_input_shape = framework::vectorize2int(tensor->dims()); while (fluid_input_shape.size() < 4) { fluid_input_shape.push_back(1); } auto *anakin_input = net_->get_in(input.first); std::vector max_input_shape = max_input_shape_[input.first]; int max_shape_sum = std::accumulate(max_input_shape.begin(), max_input_shape.end(), 1, std::multiplies()); if (tensor->numel() > max_shape_sum) { PADDLE_ENFORCE(std::find(program_inputs_.begin(), program_inputs_.end(), input.first) == program_inputs_.end(), "The anakin input max shape should be greater than" " or equal to the real input shape, Please set the max " "input shape using EnableAnakinEngine"); VLOG(3) << "Anakin Net will be reset because of the inputs out of range: " << input.first; graph_->Reshape(input.first, fluid_input_shape); net_.reset(new AnakinNetT(true)); net_->init(*graph_); anakin_input = net_->get_in(input.first); } anakin_input->reshape(fluid_input_shape); ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, fluid_input_shape); anakin_input->copy_from(tmp_anakin_tensor); } } template void AnakinEngine::Execute( const std::map &inputs, const std::map &outputs) { BindInput(inputs); net_->prediction(); for (const auto &output : outputs) { platform::CPUPlace cpu_place; auto *tensor = output.second; auto *anakin_output = net_->get_out(output.first); auto *anakin_data = anakin_output->data(); auto anakin_output_shape = anakin_output->valid_shape(); tensor->Resize(framework::make_ddim(anakin_output_shape)); auto *fluid_data = tensor->mutable_data(cpu_place); memory::Copy(cpu_place, static_cast(fluid_data), cpu_place, static_cast(anakin_data), tensor->numel() * sizeof(float)); } } #ifdef PADDLE_WITH_CUDA template void AnakinEngine::Execute( const std::map &inputs, const std::map &outputs, cudaStream_t stream) { BindInput(inputs); net_->prediction(); cudaDeviceSynchronize(); for (const auto &output : outputs) { platform::CUDAPlace gpu_place(device_); auto *tensor = output.second; auto *anakin_output = net_->get_out(output.first); auto *anakin_data = anakin_output->data(); auto anakin_output_shape = anakin_output->valid_shape(); tensor->Resize(framework::make_ddim(anakin_output_shape)); auto *fluid_data = tensor->mutable_data(gpu_place); memory::Copy(gpu_place, static_cast(fluid_data), gpu_place, static_cast(anakin_data), tensor->numel() * sizeof(float), stream); } cudaDeviceSynchronize(); } #endif template void AnakinEngine::Freeze() { PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph."); } template void AnakinEngine::Optimize() { PADDLE_ENFORCE(graph_->Optimize(), "Graph optimization."); } template void AnakinEngine::RegistBlock( ::anakin::PBlock *block_p) { PADDLE_ENFORCE(graph_->RegistBlock(block_p), "Block register."); } template std::unique_ptr> AnakinEngine::Clone() { auto *engine = new AnakinEngine(); engine->net_ = std::move(net_->Clone()); return std::unique_ptr(engine); } #ifdef PADDLE_WITH_CUDA template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>; template class AnakinEngineManager<::anakin::saber::NV, ::anakin::Precision::FP32>; template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>; template class AnakinEngineManager<::anakin::saber::NV, ::anakin::Precision::INT8>; #endif template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; template class AnakinEngineManager<::anakin::saber::X86, ::anakin::Precision::FP32>; template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>; template class AnakinEngineManager<::anakin::saber::X86, ::anakin::Precision::INT8>; // template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; } // namespace anakin } // namespace inference } // namespace paddle