diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index e5bd3c97b1f53b7b3846ab4bac166d91d20c61a4..a2b21b5fa5f3dcb275676cfc0ce8cf9fdfe00062 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -744,6 +744,12 @@ void build_op_func_list(const platform::Place& place, interpreter::LogDeviceMemoryStats(place); } + + // NOTE(Ruibiao): Release memory cache to avoid memory fragmentation in Allocator. + // It reduces memory usage by about 10% for V100 8-GPU training of the + // transformer_base_bs4096_amp_fp16 and transformer_base_bs4096_pure_fp16 + // models. + memory::Release(place); } void add_fetch(const std::vector& fetch_names,