// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"

namespace paddle {
namespace inference {
namespace analysis {

/*
 * Memory optimization pass for inference that pre-analyzes memory usage
 * without GC.
 * Unlike training, the inference memory reuse strategy does not rely on GC,
 * because the GC overhead is too high when the batch size equals one.
 *
 * Instead, the inference memory reuse pass tries to pre-determine the tensor
 * reusing strategy so that no extra overhead is paid at runtime.
 *
 * To improve the quality of the strategy, a warm-up run is introduced:
 * - Before officially deploying the inference program, warm it up to generate
 *   a runtime cache.
 * - Run the inference program with several batches of data; it persists
 *   runtime information about the tensors' memory to disk. We call this
 *   information the memory reusing cache.
 * - With the memory reusing cache, the user can deploy the inference to a
 *   service. Before running the model, the inference program loads the memory
 *   cache, analyzes it, generates the best memory reusing strategy, and
 *   adjusts the execution of the network accordingly.
 *
 * With the warm-up and memory reusing cache design, the memory reusing
 * algorithm can analyze the real memory consumption of the tensors, even with
 * flexible LoDTensors and shape-changing operators such as sequence pooling.
 */
class MemoryOptimizePass : public AnalysisPass {
 public:
  using space_table_t = std::unordered_map<std::string, size_t>;
  using lifecycle_t = std::pair<int, int>;

  struct MemoryAllocation {
    size_t allocated;  // allocated memory in bytes.
    size_t saved;      // saved memory in bytes.
    int sort_kind;     // the kind of the corresponding sorting algorithm.

    // Get the memory saving ratio of temporary variables.
    float GetSavingRatio() const;
  };

  virtual ~MemoryOptimizePass() = default;

 protected:
  void RunImpl(Argument *argument) override;

 private:
  void CollectLifeCycle(
      std::unordered_map<std::string, lifecycle_t> *lifecycles,
      int sort_kind) const;

  void CollectVarMemorySize(
      const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
      std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
      space_table_t *space_table) const;

  // Build the reuse plan; the allocated/saved memory statistics are written
  // to *memory_allocation.
  void MakeReusePlan(
      const std::vector<std::unordered_set<std::string>> &var_clusters,
      const std::unordered_map<std::string, size_t> &var_batch_ave_size,
      const space_table_t &space_table,
      std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
      MemoryAllocation *memory_allocation) const;

  void PerformReusePlan(
      const std::unordered_map<std::string, std::string> &reuse_table,
      int sort_kind, std::unordered_set<std::string> *vars2remove) const;

 public:
  std::string repr() const override;

 private:
  mutable framework::ir::Graph *graph_{nullptr};
  mutable int max_lifecycle_{-1};
};

static std::string GetMemoryCachePath(const std::string &model_path,
                                      const std::string &prog_path) {
  auto path = model_path.empty() ? prog_path : model_path;
  return path + ".memory_cache";
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
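
// Illustrative usage sketch (not part of this header): how the warm-up /
// memory-reusing-cache flow described above can be driven from the C++
// inference API. The calls to SetModel, EnableMemoryOptim, CreatePaddlePredictor
// and PaddlePredictor::Run follow the public AnalysisConfig interface, but the
// exact flags and default arguments may differ across Paddle releases, so treat
// the details as assumptions; WarmUpAndCacheMemory is a hypothetical helper
// name used only for this example.
//
//   #include <string>
//   #include <vector>
//
//   #include "paddle/fluid/inference/api/paddle_inference_api.h"
//
//   void WarmUpAndCacheMemory(
//       const std::string &model_dir,
//       const std::vector<paddle::PaddleTensor> &warmup_batch) {
//     paddle::AnalysisConfig config;
//     config.SetModel(model_dir);   // model to be deployed
//     config.EnableMemoryOptim();   // enable this analysis pass
//     auto predictor = paddle::CreatePaddlePredictor(config);
//
//     // Warm-up: run a representative batch so the pass can persist the
//     // memory reusing cache (see GetMemoryCachePath above) next to the
//     // model on disk; later deployments load it, pick the best reuse
//     // strategy, and adjust the execution of the network before serving.
//     std::vector<paddle::PaddleTensor> outputs;
//     predictor->Run(warmup_batch, &outputs);
//   }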