# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle.fluid as fluid import numpy as np import tempfile import os import module_desc_pb2 from collections import defaultdict from downloader import download_and_uncompress __all__ = ["Module", "ModuleConfig", "ModuleUtils"] DICT_NAME = "dict.txt" ASSETS_NAME = "assets" def mkdir(path): """ the same as the shell command mkdir -p " """ if not os.path.exists(path): os.makedirs(path) class Module(object): def __init__(self, module_url=None, module_dir=None): if module_url == None and module_dir == None: raise Exception("Module:module_url and module_dir are None!") self.module_dir = "" self.module_name = "" # donwload module if module_url is not None and module_url.startswith("http"): # if it's remote url link, then download and uncompress it self.module_name, self.module_dir = download_and_uncompress( module_url) #TODO(ZeyuChen): check url link is valid url elif module_dir is not None: # otherwise it's local path, no need to deal with it self.module_dir = module_dir # use the path name as module name by default self.module_name = module_dir.split("/")[-1] #TODO(ZeyuChen) add more check about loading module from local path # load paddle inference model place = fluid.CPUPlace() self.exe = fluid.Executor(fluid.CPUPlace()) [self.inference_program, self.feed_target_names, self.fetch_targets] = fluid.io.load_inference_model( dirname=self.module_dir, executor=self.exe) print("inference_program") print(self.inference_program) print("feed_target_names") print(self.feed_target_names) print("fetch_targets") print(self.fetch_targets) self.config = ModuleConfig(self.module_dir) self.config.load() # load assets # self.dict = defaultdict(int) # self.dict.setdefault(0) # self._load_assets(module_dir) #TODO(ZeyuChen): Need add register more signature to execute different # implmentation def __call__(self, inputs=None, signature=None): """ Call default signature and return results """ # TODO(ZeyuChen): add proto spec to check which task we need to run # if it's NLP word embedding task, then do words preprocessing # if it's image classification or image feature task do the other works # if it's word_ids_lod_tensor = self._process_input(inputs) np_words_id = np.array(word_ids_lod_tensor) print("word_ids_lod_tensor\n", np_words_id) results = self.exe.run( self.inference_program, feed={self.feed_target_names[0]: word_ids_lod_tensor}, fetch_list=self.fetch_targets, return_numpy=False) # return_numpy=Flase is important print("module fetch_target_names", self.feed_target_names) print("module fetch_targets", self.fetch_targets) np_result = np.array(results[0]) return np_result def add_input_desc(var_name): pass def get_vars(self): return self.inference_program.list_vars() def get_input_vars(self): for var in self.inference_program.list_vars(): print(var) if var.name == "words": return var # return self.fetch_targets def get_module_output(self): for var in self.inference_program.list_vars(): print(var) # NOTE: just hack for load Senta's if var.name == "embedding_0.tmp_0": return var def get_inference_program(self): return self.inference_program # for text sequence input, transform to lod tensor as paddle graph's input def _process_input(self, inputs): # words id mapping and dealing with oov # transform to lod tensor seq = [] for s in inputs: seq.append(self._word_id_mapping(s)) lod_tensor = self.seq2lod_tensor(seq) return lod_tensor def seq2lod_tensor(self, seq_inputs, place=fluid.CPUPlace()): """ sequence to lod tensor, need to determine which space""" lod = [] lod.append([]) for s in seq_inputs: # generate lod lod[0].append(len(s)) # print("seq", seq_inputs) # print("lod", lod) lod_tensor = fluid.create_lod_tensor(seq_inputs, lod, place) return lod_tensor def _word_id_mapping(self, inputs): word_dict = self.config.get_dict() return list(map(lambda x: word_dict[x], inputs)) # # load assets folder # def _load_assets(self, module_dir): # assets_dir = os.path.join(module_dir, ASSETS_NAME) # dict_path = os.path.join(assets_dir, DICT_NAME) # word_id = 0 # with open(dict_path) as fi: # words = fi.readlines() # #TODO(ZeyuChen) check whether word id is duplicated and valid # for line in fi: # w, w_id = line.split() # self.dict[w] = int(w_id) def add_module_feed_list(self, feed_list): self.feed_list = feed_list def add_module_output_list(self, output_list): self.output_list = output_list class ModuleConfig(object): def __init__(self, module_dir, module_name=None): # generate model desc protobuf self.module_dir = module_dir self.desc = module_desc_pb2.ModuleDesc() if module_name == None: module_name = module_dir.split("/")[-1] self.desc.name = module_name print("desc.name=", self.desc.name) self.desc.signature = "default" print("desc.signature=", self.desc.signature) self.desc.contain_assets = True print("desc.signature=", self.desc.contain_assets) # init dict self.dict = defaultdict(int) self.dict.setdefault(0) def load(self): """load module config from module dir """ #TODO(ZeyuChen): check module_desc.pb exsitance pb_path = os.path.join(self.module_dir, "module_desc.pb") with open(pb_path, "rb") as fi: self.desc.ParseFromString(fi.read()) if self.desc.contain_assets: # load assets assets_dir = os.path.join(self.module_dir, ASSETS_NAME) dict_path = os.path.join(assets_dir, DICT_NAME) word_id = 0 with open(dict_path) as fi: words = fi.readlines() #TODO(ZeyuChen) check whether word id is duplicated and valid for line in fi: w, w_id = line.split() self.dict[w] = int(w_id) def dump(self): # save module_desc.proto first pb_path = os.path.join(self.module_dir, "module_desc.pb") with open(pb_path, "wb") as fo: fo.write(self.desc.SerializeToString()) # save assets/dictionary assets_dir = os.path.join(self.module_dir, ASSETS_NAME) mkdir(assets_dir) with open(os.path.join(assets_dir, DICT_NAME), "w") as fo: for w in self.dict: w_id = self.dict[w] fo.write("{}\t{}\n".format(w, w_id)) def save_dict(self, word_dict, dict_name=DICT_NAME): """ Save dictionary for NLP module """ mkdir(self.module_dir) with open(os.path.join(self.module_dir, DICT_NAME), "w") as fo: for w in word_dict: self.dict[w] = word_dict[w] def get_dict(self): return self.dict class ModuleUtils(object): def __init__(self): pass @staticmethod def remove_feed_fetch_op(program): """ remove feed and fetch operator and variable for fine-tuning """ print("remove feed fetch op") block = program.global_block() need_to_remove_op_index = [] for i, op in enumerate(block.ops): if op.type == "feed" or op.type == "fetch": need_to_remove_op_index.append(i) for index in need_to_remove_op_index[::-1]: block._remove_op(index) block._remove_var("feed") block._remove_var("fetch") program.desc.flush() print("********************************") print(program) print("********************************") if __name__ == "__main__": url = "http://paddlehub.cdn.bcebos.com/word2vec/word2vec-dim16-simple-example-2.tar.gz" m = Module(module_url=url) inputs = [["it", "is", "new"], ["hello", "world"]] #tensor = m._process_input(inputs) #print(tensor) result = m(inputs) print(result)