# Copyright (c) 2019  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import types
import paddle
import paddle.fluid as fluid
import numpy as np
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import logging
from ..common import get_logger

_logger = get_logger(__name__, level=logging.INFO)


def get_distribution(program,
                     var_names,
                     executor=None,
                     reader=None,
                     feed_vars=None,
                     scope=None):
    """
    Get the distribution of the variables named in ``var_names``.

    Variables that are already persistable are read directly from the scope.
    If any requested variable is not persistable (an activation), it is
    temporarily marked persistable, the program is run on the first batch
    supplied by ``reader``, and the values are then read from the scope. The
    temporary ``persistable`` flags are always restored before returning,
    even when an error is raised.

    Args:
        program(fluid.Program): program to analyze.
        var_names(list): name of variables to analyze. When there is
            activation name in var_names, you should set executor.
        executor(fluid.Executor, optional): The executor to run program.
            Only needed when var_names contains activation names.
            Default is None.
        reader(Python Generator, fluid.io.DataLoader, optional): If you only
            want to get the distribution of weight parameters, you do not
            need to provide a reader. Otherwise, a reader must be provided.
            The reader provides calibrate data, and it returns a batch every
            time. It must be either a python generator or an iterable fluid
            dataloader. When you use a python generator, please ensure that
            its behavior is consistent with `batch_generator`.
            You can get more detail about batch_generator at
            https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/io_cn/DataLoader_cn.html#id1
        feed_vars(list): feed variables for program. When you use python
            generator reader to provide data, you should set feed_vars.
            Default is None.
        scope(fluid.Scope, optional): The scope to run program, use it to
            load variables. If scope is None, will use fluid.global_scope().

    Returns:
        dict: maps each variable name found in the scope to a numpy array
        holding that variable's values.

    Raises:
        AssertionError: if var_names is not a list, or if activations are
            requested without a valid reader / executor / feed_vars.
    """
    scope = fluid.global_scope() if scope is None else scope
    assert isinstance(var_names, list), 'var_names is a list of variable name'

    # Build the lookup set once instead of scanning the list per variable.
    wanted_names = set(var_names)
    var_changed = []
    real_names = []
    weight_only = True
    for var in program.list_vars():
        if var.name not in wanted_names:
            continue
        if not var.persistable:
            # Non-persistable variable: mark it persistable so that its value
            # can be read from the scope after the program has run.
            weight_only = False
            var.persistable = True
            var_changed.append(var)
        real_names.append(var.name)

    def update_var_dist(var_dist):
        """Copy every requested variable found in the scope into var_dist."""
        for name in real_names:
            var = scope.find_var(name)
            if var is not None:
                var_dist[name] = np.array(var.get_tensor())
            else:
                _logger.info("can't find var {} in scope.".format(name))
        return var_dist

    var_dist = {}
    try:
        if weight_only:
            var_dist = update_var_dist(var_dist)
        else:
            assert isinstance(
                reader, (types.GeneratorType, fluid.reader.DataLoaderBase)
            ), "when var_names include activations'name, reader must be either a python generator or a fluid dataloader."
            assert executor is not None, "when var_names include activations'name, executor must be set"

            if isinstance(reader, types.GeneratorType):
                assert feed_vars is not None, "When using batch_generator, feed_vars must be set"
                dataloader = fluid.io.DataLoader.from_generator(
                    feed_list=feed_vars, capacity=128, iterable=True)
                dataloader.set_batch_generator(reader, executor.place)
            elif isinstance(reader, fluid.reader.DataLoaderBase):
                dataloader = reader
            else:
                # Only reachable when assertions are disabled (python -O).
                _logger.info(
                    "When both batch_generator and data_loader is None, var_names can only include weight names"
                )
                return None

            # Only the first batch is used to sample the distributions.
            for data in dataloader:
                executor.run(program=program, feed=data)
                var_dist = update_var_dist(var_dist)
                break
    finally:
        # Always undo the temporary persistable flags, including on the
        # early return above and on any exception, so that the caller's
        # program is not left modified.
        for var in var_changed:
            var.persistable = False

    return var_dist


def pdf(var_dist, pdf_save_dir='var_dist_pdf'):
    """
    Draw a histogram for each variable in var_dist and save them to a PDF.

    One page is written per variable into ``<pdf_save_dir>/result.pdf``.
    Each histogram uses 1000 bins and is weighted by ``1 / len(values)``.

    Args:
        var_dist(dict): maps variable name to a numpy array of its values.
        pdf_save_dir(str): dirname to save pdf. The directory (including
            missing parent directories) is created if it does not exist.
            If None, nothing is drawn. Default is 'var_dist_pdf'
    """
    if pdf_save_dir is None:
        # Nothing can be saved without a target directory.
        _logger.info("pdf_save_dir is None, skip drawing histograms.")
        return

    numbers = len(var_dist)
    if not os.path.isdir(pdf_save_dir):
        # makedirs (unlike mkdir) also creates missing parent directories.
        os.makedirs(pdf_save_dir)
    pdf_path = os.path.join(pdf_save_dir, 'result.pdf')

    with PdfPages(pdf_path) as pdf_pages:
        for i, (name, arr) in enumerate(var_dist.items()):
            if i % 10 == 0:
                _logger.info("plt {}/{}".format(i, numbers))
            arr = arr.flatten()
            # Weight every sample by 1/N so the bar heights are frequencies.
            weights = np.ones_like(arr) / len(arr)
            plt.hist(arr, bins=1000, weights=weights)
            plt.xlabel(name)
            plt.ylabel("frequency")
            plt.title("Hist of variable {}".format(name))
            # No plt.show(): this module selects the non-interactive Agg
            # backend, so the figure is only written to the PDF.
            pdf_pages.savefig()
            plt.close()
    _logger.info("variables histogram have been saved as {}".format(pdf_path))