diff --git a/tools/ModelConverter/README_cn.md b/tools/ModelConverter/README_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..45e393ea7c7d974e6c0936b9d3d4efba3b71f148
--- /dev/null
+++ b/tools/ModelConverter/README_cn.md
@@ -0,0 +1,71 @@
+# PaddleJS Model Converter
+
+PaddleJS Model Converter is the model conversion tool for PaddleJS. It converts PaddlePaddle models (also called fluid models) into a browser-friendly format that PaddleJS can load and run for inference in the browser. It also provides model optimization capabilities, helping developers optimize the model structure to improve runtime performance.
+
+## 1. Tutorial
+
+### 1.1. Environment Setup
+#### Confirm the Python Version
+Confirm that the Python environment and version on your platform meet the requirements below. If you use Python 3, you may need to replace `python` with `python3` in the commands that follow:
+- Python 3: 3.5.1+ / 3.6 / 3.7
+- Python 2: 2.7.15+
+
+#### Set Up a Virtual Environment
+*A development machine may have several Python versions installed, each with different versions of the required packages. To avoid conflicts, we **strongly recommend** running the converter commands inside a Python virtual environment. If you do not want to use a virtual environment, or already have one set up, skip this step.*
+
+Taking Anaconda as an example:
+Go to the [Anaconda](https://www.anaconda.com/) homepage, choose the installer matching your platform and Python version, and install it following the official instructions.
+
+After installation, run the following command to create a Python virtual environment:
+``` bash
+conda create --name <environment_name>
+```
+
+Run the following command to activate the environment:
+``` bash
+# On Linux or macOS
+source activate <environment_name>
+
+# On Windows
+activate <environment_name>
+```
+
+#### Install Dependencies
+- If you do `not` need the model optimization capability, run:
+``` bash
+python -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
+```
+- If you `do` need the model optimization capability, run:
+``` bash
+python -m pip install paddlepaddle paddlelite==2.6.0 -i https://mirror.baidu.com/pypi/simple
+```
+
+### 1.2. Quick Start
+- If the fluid model to convert uses a `combined parameter file`, i.e. one model file paired with a single parameter file:
+``` bash
+python convertToPaddleJSModel.py --modelPath=<model_path> --paramPath=<param_path> --outputDir=<output_dir>
+```
+- If the fluid model to convert uses `sliced parameter files`, i.e. one model file paired with multiple parameter files:
+``` bash
+# Note: when invoking the converter this way, the model file inside inputDir must be named '__model__'
+python convertToPaddleJSModel.py --inputDir=<input_dir> --outputDir=<output_dir>
+```
+The converter generates the following two kinds of files for PaddleJS to use:
+
+- model.json (the model structure and parameter manifest)
+- chunk_\*.dat (a set of binary parameter files)
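+
+For reference, a generated model.json has roughly the shape sketched below. This is an illustrative fragment, not real converter output: the concrete op and variable entries (`conv2d`, `conv1_weights`, and so on) are hypothetical and depend on your model, but the field layout (`ops` entries with `type`/`inputs`/`outputs`/`attrs`, `vars` entries with `name`/`shape`/`persistable`) matches what the converter writes:
+``` json
+{
+    "ops": [
+        {
+            "type": "conv2d",
+            "inputs": {"Input": ["image"], "Filter": ["conv1_weights"]},
+            "outputs": {"Output": ["conv1.tmp_0"]},
+            "attrs": {"strides": [2, 2], "paddings": [1, 1]}
+        }
+    ],
+    "vars": [
+        {"name": "conv1_weights", "persistable": true, "shape": [32, 3, 3, 3]}
+    ]
+}
+```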
+
+## 2. Parameter Reference
+Parameter | Description
+:-: | :-:
+--inputDir | Directory containing the fluid model. Use this parameter if and only if the model uses sliced parameter files; `modelPath` and `paramPath` are then ignored, and the model file must be named `__model__`
+--modelPath | Path to the fluid model file; use it together with a combined parameter file
+--paramPath | Path to the fluid parameter file; use it together with a combined parameter file
+--outputDir | `Required`. Output directory for the PaddleJS model
+--optimize | Whether to optimize the model: `0` disables optimization, `1` enables it (requires PaddleLite). Disabled by default
+--logModelInfo | Whether to print the model structure info: `0` does not print, `1` prints. Off by default
+--sliceDataSize | Size of each slice, in KB, when writing the sliced PaddleJS parameter files. Defaults to 4096
+
+## 3. Additional Notes
+If the model you need to convert is in `TensorFlow/Caffe/ONNX` format, you can use the `X2Paddle` tool from the PaddlePaddle project to convert it into a fluid model first, and then use this tool to convert it into a PaddleJS model.
+For details, see the [X2Paddle project](https://github.com/PaddlePaddle/X2Paddle).
\ No newline at end of file
diff --git a/tools/ModelConverter/convertModel.py b/tools/ModelConverter/convertModel.py
new file mode 100644
index 0000000000000000000000000000000000000000..deb4570fdac3db5c96f939f3c8cdad8a110c7316
--- /dev/null
+++ b/tools/ModelConverter/convertModel.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+import json
+import collections
+import os
+import struct
+import argparse
+import traceback
+import numpy as np
+import paddle.fluid as fluid
+
+
+# Directory containing the input model
+modelDir = None
+# Input model file name
+modelName = None
+# Input parameter file name. Specify it only when all model parameters are saved in a single binary file; for sliced parameter files, leave it as None
+paramsName = None
+# Whether to print model structure info
+enableLogModelInfo = False
+# Output directory for the converted model
+outputDir = None
+# Slice size, in KB
+sliceDataSize = 4 * 1024
+# PaddlePaddle program instance
+program = None
+# Model structure info
+modelInfo = {"vars": [], "ops": []}
+# Parameter values, keyed by variable name (unsorted)
+paramValuesDict = {}
+
+def logModel(info):
+    """ Print model info when logging is enabled """
+    if enableLogModelInfo:
+        print(info)
+
+def sortDict(oldDict, reverse=False):
+    """ Sort a dict by key and return an ordered dict, ascending by default """
+    # Get the sorted key list
+    keys = sorted(oldDict.keys(), reverse=reverse)
+    orderDict = collections.OrderedDict()
+    # Walk the key list
+    for key in keys:
+        orderDict[key] = oldDict[key]
+    return orderDict
+
+def dumpModelToJsonFile():
+    """ Dump the model structure to a JSON file """
+    print("Dumping model structure to json file...")
+    if not os.path.exists(outputDir):
+        os.makedirs(outputDir)
+    outputModelPath = os.path.join(outputDir, "model.json")
+    with open(outputModelPath, 'w') as outputFile:
+        json.dump(modelInfo, outputFile, indent=4, separators=(", ", ": "), sort_keys=True)
+    print("Dumping model structure to json file successfully.")
+
+def sliceDataToBinaryFile(paramValueList):
+    """ Write the parameter values to sliced binary files; the default slice size is 4 MB """
+    totalParamValuesCount = len(paramValueList)
+    # sliceDataSize is in KB and each float32 value takes 4 bytes,
+    # so each slice holds sliceDataSize * 1024 / 4 values
+    countPerSlice = int(sliceDataSize * 1024 / 4)
+
+    if not os.path.exists(outputDir):
+        os.makedirs(outputDir)
+    currentChunkIndex = 0
+    currentParamDataIndex = 0
+
+    # Note: the loop must run until all values are consumed; a `totalParamValuesCount - 1`
+    # bound would drop the final value whenever exactly one value remains
+    while currentParamDataIndex < totalParamValuesCount:
+        remainCount = totalParamValuesCount - currentParamDataIndex
+        if remainCount < countPerSlice:
+            countPerSlice = remainCount
+        chunkPath = os.path.join(outputDir, 'chunk_%s.dat' % (currentChunkIndex + 1))
+        file = open(chunkPath, 'wb')
+        for i in paramValueList[currentParamDataIndex : currentParamDataIndex + countPerSlice]:
+            byte = struct.pack('f', float(i))
+            file.write(byte)
+        file.close()
+        currentParamDataIndex = currentParamDataIndex + countPerSlice
+        currentChunkIndex = currentChunkIndex + 1
+        print("Output No." + str(currentChunkIndex) + " binary file, remain " + str(totalParamValuesCount - currentParamDataIndex) + " param values.")
+    print("Slicing data to binary files successfully. (" + str(currentChunkIndex) + " output files and " + str(currentParamDataIndex) + " param values)")
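+
+# A worked example of the slicing arithmetic above (illustrative numbers):
+# with the default sliceDataSize of 4096 KB, each chunk holds
+# 4096 * 1024 / 4 = 1,048,576 float32 values (4 MB on disk). A model with
+# 2,500,000 parameter values would therefore produce chunk_1.dat and
+# chunk_2.dat at 4 MB each, plus chunk_3.dat holding the remaining
+# 402,848 values (~1.5 MB).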
(" + str(currentChunkIndex)+ " output files and " + str(currentParamDataIndex) + " param values)") + +def reorderParamsValue(): + """ 对参数文件中的数值,按照variable.name字母序排序,返回排序后组合完成的value list """ + paramValuesOrderDict = sortDict(paramValuesDict) + paramValues = [] + for value in paramValuesOrderDict.values(): + paramValues += value + return paramValues + +def mapToPaddleJSTypeName(fluidOPName): + """ 处理fluid的OP type与PaddleJS的OP type不对应情况 """ + if fluidOPName == "batch_norm": + return "batchnorm" + return fluidOPName + +def organizeModelVariableInfo(): + """ 组织参数信息 """ + print("Organizing model variables info...") + index = 0 + # 存放var信息(未排序) + varInfoDict = {} + # 获取program中所有的var,遍历并获取所有未排序的var信息和参数数值 + vars = list(program.list_vars()) + for v in vars: + # 跳过feed和fetch + if "feed" == v.name: + continue + if "fetch" == v.name: + continue + + varShape = list(v.shape) + + # FIXME:start paddlejs 不支持shape中为-1,这里需要手动过滤一下,支持了以后可以删除 + varShapeExcludeNegativeOne = [] + for s in varShape: + if s == -1: + continue + varShapeExcludeNegativeOne.append(s) + varShape = varShapeExcludeNegativeOne + # FIXME:end + + # 存放variable信息,在dump成json时排序 + varInfo = {} + varInfo["shape"] = varShape + # 数据是否是持久化数据,如tensor为持久化数据,op的output不是持久化数据 + # 只输出持久化数据,paddlejs中也仅读取持久化数据 + varInfo["persistable"] = v.persistable + varInfoDict[v.name] = varInfo + + logModel("[Var index:" + str(index) + " name:" + v.name + "]") + jsonDumpsIndentStr = json.dumps(varInfo, indent=2) + logModel(jsonDumpsIndentStr) + logModel("") + index += 1 + + # persistable数据存入paramValuesDict,等待排序 + if v.persistable: + data = np.array(fluid.global_scope().find_var(v.name).get_tensor()).flatten().tolist() + paramValuesDict[v.name] = data + + # 对var信息dict,按照key(var名)进行字母顺序排序 + varInfoOrderDict = sortDict(varInfoDict) + + # 将var信息按照顺序,添加到model info的vars中 + for key, value in varInfoOrderDict.items(): + value["name"] = key + modelInfo["vars"].append(value) + print("Organizing model variables info successfully.") + +def organizeModelOpInfo(): + """ 组织模型OP结构信息 """ + print("Organizing model operators info...") + ops = program.current_block().ops + feedOutputName = None + index = 0 + for op in ops: + opInfo = {} + + # 获取OP type,需要映射到PaddleJS的名字 + opInfo["type"] = mapToPaddleJSTypeName(op.type) + + # 获取OP input + inputs = {} + for name in op.input_names: + value = op.input(name) + if len(value) <= 0: + continue + if value[0] == feedOutputName: + # FIXME:workaround,PaddleJSfeed 输入必须是image,且为单输入,这里修改feed后面的OP的input为image,建立前后关联 + inputs[name] = ["image"] + else: + inputs[name] = value + opInfo["inputs"] = inputs + + # 获取OP output + outputs = {} + for name in op.output_names: + value = op.output(name) + if len(value) <= 0: + continue + if op.type == "feed": + # FIXME:workaround,PaddleJSfeed 输入必须是image,且为单输入,这里保存原始的输出名,以便映射 + feedOutputName = value[0] + outputs[name] = ["image"] + else: + outputs[name] = value + opInfo["outputs"] = outputs + + # 获取OP attribute + attrs = {} + for name in op.attr_names: + # 过滤不需要的参数 + if name in ["op_callstack", 'col', 'op_role', 'op_namescope', 'op_role_var']: + continue + value = op.attr(name) + attrs[name] = value + opInfo["attrs"] = attrs + + # 存入modelInfo + modelInfo["ops"].append(opInfo) + logModel("[OP index:" + str(index) + " type:" + op.type + "]") + jsonDumpsIndentStr = json.dumps(opInfo, indent=2) + logModel(jsonDumpsIndentStr) + logModel("") + index += 1 + print("Organizing model operators info successfully.") + +def convertToPaddleJSModel(): + """ 转换fluid modle为paddleJS model """ + # 初始化fluid运行环境和配置 + exe = 
+
+def convertToPaddleJSModel():
+    """ Convert a fluid model into a PaddleJS model """
+    global program
+    # Initialize the fluid execution environment and configuration
+    exe = fluid.Executor(fluid.CPUPlace())
+    result = fluid.io.load_inference_model(dirname=modelDir, executor=exe, model_filename=modelName, params_filename=paramsName)
+    program = result[0]
+
+    # Collect all OPs in the program and append them to the model info in OP order
+    organizeModelOpInfo()
+
+    # Collect all vars in the program, append them to the model info in alphabetical order, and read the parameter values
+    organizeModelVariableInfo()
+
+    # Dump the model structure to a JSON file
+    dumpModelToJsonFile()
+
+    # Sort the parameter value dict alphabetically by key (parameter name) and combine the values
+    paramValues = reorderParamsValue()
+
+    # Write the sliced parameter files
+    sliceDataToBinaryFile(paramValues)
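+
+# Direct invocation sketch (paths are hypothetical placeholders;
+# convertToPaddleJSModel.py normally drives this script):
+#   python convertModel.py --modelPath=<model_path> --paramPath=<param_path> --outputDir=<output_dir>
+#   python convertModel.py --inputDir=<input_dir> --outputDir=<output_dir> --sliceDataSize=2048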
+
+if __name__ == "__main__":
+    try:
+        p = argparse.ArgumentParser(description='Argument parser for the model converter')
+        p.add_argument('--inputDir', help='Directory containing the fluid model. Use it if and only if the model uses sliced parameter files; modelPath and paramPath are then ignored, and the model file must be named `__model__`', required=False)
+        p.add_argument('--modelPath', help='Path to the fluid model file; use it with a combined parameter file', required=False)
+        p.add_argument('--paramPath', help='Path to the fluid parameter file; use it with a combined parameter file', required=False)
+        p.add_argument("--outputDir", help='Output directory for the PaddleJS model. Required', required=True)
+        p.add_argument("--logModelInfo", type=int, default=0, help='Whether to print the model structure info. Optional: 0 does not print, 1 prints; off by default', required=False)
+        p.add_argument("--sliceDataSize", type=int, default=4096, help='Size of each slice, in KB, when writing sliced parameter files. Optional, defaults to 4096 KB', required=False)
+
+        args = p.parse_args()
+        modelDir = args.inputDir
+        modelPath = args.modelPath
+        paramPath = args.paramPath
+        if not modelDir:
+            if not modelPath or not paramPath:
+                print("\033[31mPlease specify either --inputDir, or both --modelPath and --paramPath!\033[0m")
+                raise Exception()
+            modelDir, modelName = os.path.split(modelPath)
+            paramDir, paramsName = os.path.split(paramPath)
+            if paramDir != modelDir:
+                print("\033[31mModel and param file should be put in the same directory!\033[0m")
+                raise Exception()
+        outputDir = args.outputDir
+        sliceDataSize = args.sliceDataSize
+
+        if args.logModelInfo == 1:
+            enableLogModelInfo = True
+
+        convertToPaddleJSModel()
+
+    except Exception:
+        print("\033[31mA fatal error occurred. Failed to convert model.\033[0m")
+        print(traceback.format_exc())
\ No newline at end of file
diff --git a/tools/ModelConverter/convertToPaddleJSModel.py b/tools/ModelConverter/convertToPaddleJSModel.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cca9b3759226b9bb3aad7424133fe83b98d27da
--- /dev/null
+++ b/tools/ModelConverter/convertToPaddleJSModel.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+import os
+import argparse
+import shutil
+import stat
+import traceback
+
+def cleanTempModel(optimizedModelTempDir):
+    """ Remove the temporary model files left by opt optimization """
+    if os.path.exists(optimizedModelTempDir):
+        print("Cleaning optimized temporary model...")
+        shutil.rmtree(optimizedModelTempDir, onerror=grantWritePermission)
+
+def grantWritePermission(func, path, execinfo):
+    """ Grant write permission to a file, then retry the failed operation (shutil.rmtree onerror callback) """
+    os.chmod(path, stat.S_IWRITE)
+    func(path)
+
+
+if __name__ == "__main__":
+    """
+    Example:
+    'python convertToPaddleJSModel.py --modelPath=../infer_model/MobileNetV2/__model__ --paramPath=../infer_model/MobileNetV2/params --outputDir=../jsmodel --optimize=1'
+    """
+    try:
+        p = argparse.ArgumentParser(description='Argument parser for converting to a PaddleJS model')
+        p.add_argument('--inputDir', help='Directory containing the fluid model. Use it if and only if the model uses sliced parameter files; modelPath and paramPath are then ignored, and the model file must be named `__model__`', required=False)
+        p.add_argument('--modelPath', help='Path to the fluid model file; use it with a combined parameter file', required=False)
+        p.add_argument('--paramPath', help='Path to the fluid parameter file; use it with a combined parameter file', required=False)
+        p.add_argument("--outputDir", help='Output directory for the PaddleJS model. Required', required=True)
+        p.add_argument("--optimize", type=int, default=0, help='Whether to optimize the model. Optional: 0 disables optimization, 1 enables it; disabled by default', required=False)
+        p.add_argument("--logModelInfo", type=int, default=0, help='Whether to print the model structure info. Optional: 0 does not print, 1 prints; off by default', required=False)
+        p.add_argument("--sliceDataSize", type=int, default=4096, help='Size of each slice, in KB, when writing sliced parameter files. Optional, defaults to 4096 KB', required=False)
+
+        args = p.parse_args()
+
+        # TODO: PaddleLite and PaddlePaddle currently have conflicting package dependencies, so the converter is split into two Python scripts that this entry script invokes via the command line
+        optimizeCmd = " optimizeModel.py"
+        convertCmd = " convertModel.py"
+
+        inputDir = args.inputDir
+        modelPath = args.modelPath
+        paramPath = args.paramPath
+        outputDir = args.outputDir
+        enableOptimization = args.optimize
+        enableLogModelInfo = args.logModelInfo
+        sliceDataSize = args.sliceDataSize
+
+        if not inputDir and not (modelPath and paramPath):
+            print("\033[31mPlease specify either --inputDir, or both --modelPath and --paramPath!\033[0m")
+            raise Exception()
+
+        optimizedModelTempDir = None
+        if enableOptimization == 1:
+            optimizedModelTempDir = os.path.join(outputDir, "optimize")
+            if inputDir:
+                optimizeCmd = optimizeCmd + " --inputDir=" + inputDir
+                convertCmd = convertCmd + " --inputDir=" + optimizedModelTempDir
+            else:
+                optimizeCmd = optimizeCmd + " --modelPath=" + modelPath + " --paramPath=" + paramPath
+                # opt is expected to write the optimized model as 'model' and 'params' under the temporary directory, which the converter then consumes
+                optimizeModelPath = os.path.join(optimizedModelTempDir, "model")
+                optimizeParamPath = os.path.join(optimizedModelTempDir, "params")
+                convertCmd = convertCmd + " --modelPath=" + optimizeModelPath + " --paramPath=" + optimizeParamPath
+            optimizeCmd = optimizeCmd + " --outputDir=" + optimizedModelTempDir
+        else:
+            if inputDir:
+                convertCmd = convertCmd + " --inputDir=" + inputDir
+            else:
+                convertCmd = convertCmd + " --modelPath=" + modelPath + " --paramPath=" + paramPath
+
+        convertCmd = convertCmd + " --outputDir=" + outputDir + " --sliceDataSize=" + str(sliceDataSize) + " --logModelInfo=" + str(args.logModelInfo)
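+
+        # For example, with --optimize=1 and a combined parameter file, the two
+        # subcommands composed above look roughly like this (paths are
+        # hypothetical placeholders):
+        #   python optimizeModel.py --modelPath=<model_path> --paramPath=<param_path> --outputDir=<outputDir>/optimize
+        #   python convertModel.py --modelPath=<outputDir>/optimize/model --paramPath=<outputDir>/optimize/params --outputDir=<outputDir> --sliceDataSize=4096 --logModelInfo=0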
print("modelPath: " + modelPath) + print("paramPath: " + paramPath) + print("outputDir: " + outputDir) + print("enableOptimizeModel: " + str(enableOptimization)) + print("enableLogModelInfo: " + str(enableLogModelInfo)) + print("sliceDataSize:" + str(sliceDataSize)) + + pythonCmd = "python" + + print("Starting...") + if enableOptimization: + print("Optimizing model...") + os.system(pythonCmd + optimizeCmd) + print("\033[32m\nOptimizing model successfully.\033[0m") + else: + print("\033[33mYou choosed not to optimize model, consequently, optimizing model is skiped.\033[0m") + + print("Converting model...") + os.system(pythonCmd + convertCmd) + print("\033[32mConverting model successfully.\033[0m") + + if enableOptimization: + cleanTempModel(optimizedModelTempDir) + print("Temporary files has been deleted successfully.") + print("\033[32m============ALL DONE============\033[0m") + + except Exception as identifier: + print("\033[31mA fetal error occured. Failed to convert model.\033[0m") + print(traceback.format_exc()) + pass \ No newline at end of file diff --git a/tools/ModelConverter/optimizeModel.py b/tools/ModelConverter/optimizeModel.py new file mode 100644 index 0000000000000000000000000000000000000000..2c7a1bf3b7ed9afcc64a7a064df2eedf7a6b75d5 --- /dev/null +++ b/tools/ModelConverter/optimizeModel.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import collections +import argparse +import traceback +import paddlelite.lite as lite + +def optimizeModel(inputDir, modelPath, paramPath, outputDir): + """ 使用opt python接口执行模型优化 """ + opt = lite.Opt() + if inputDir: + # 分片参数文件优化 + opt.set_model_dir(inputDir) + else: + # 合并参数文件优化 + opt.set_model_file(modelPath) + opt.set_param_file(paramPath) + + opt.set_valid_places("arm") + opt.set_model_type("protobuf") + opt.set_optimize_out(outputDir) + + optimize_passes = [ + "lite_conv_elementwise_fuse_pass", + "lite_conv_bn_fuse_pass", + "lite_conv_elementwise_fuse_pass", + "lite_conv_activation_fuse_pass", + "lite_var_conv_2d_activation_fuse_pass", + "lite_fc_fuse_pass", + "lite_shuffle_channel_fuse_pass", + "lite_transpose_softmax_transpose_fuse_pass", + "lite_interpolate_fuse_pass", + "identity_scale_eliminate_pass", + "elementwise_mul_constant_eliminate_pass", + "lite_sequence_pool_concat_fuse_pass", + "lite_elementwise_add_activation_fuse_pass", + "static_kernel_pick_pass", + "variable_place_inference_pass", + "argument_type_display_pass", + "type_target_cast_pass", + "variable_place_inference_pass", + "argument_type_display_pass", + "io_copy_kernel_pick_pass", + "argument_type_display_pass", + "variable_place_inference_pass", + "argument_type_display_pass", + "type_precision_cast_pass", + "variable_place_inference_pass", + "argument_type_display_pass", + "type_layout_cast_pass", + "argument_type_display_pass", + "variable_place_inference_pass", + "argument_type_display_pass", + "runtime_context_assign_pass", + "argument_type_display_pass" + ] + opt.set_passes_internal(optimize_passes) + opt.run() + + +if __name__ == "__main__": + try: + p = argparse.ArgumentParser('模型优化参数解析') + p.add_argument('--inputDir', help='fluid模型所在目录。当且仅当使用分片参数文件时使用该参数。将过滤modelPath和paramsPath参数,且模型文件名必须为`__model__`', required=False) + p.add_argument('--modelPath', help='fluid模型文件所在路径,使用合并参数文件时使用该参数', required=False) + p.add_argument('--paramPath', help='fluid参数文件所在路径,使用合并参数文件时使用该参数', required=False) + p.add_argument("--outputDir", help='优化后fluid模型目录,必要参数', required=True) + + args = p.parse_args() + inputDir = args.inputDir + modelPath = args.modelPath + 
+
+
+if __name__ == "__main__":
+    try:
+        p = argparse.ArgumentParser(description='Argument parser for model optimization')
+        p.add_argument('--inputDir', help='Directory containing the fluid model. Use it if and only if the model uses sliced parameter files; modelPath and paramPath are then ignored, and the model file must be named `__model__`', required=False)
+        p.add_argument('--modelPath', help='Path to the fluid model file; use it with a combined parameter file', required=False)
+        p.add_argument('--paramPath', help='Path to the fluid parameter file; use it with a combined parameter file', required=False)
+        p.add_argument("--outputDir", help='Output directory for the optimized fluid model. Required', required=True)
+
+        args = p.parse_args()
+        inputDir = args.inputDir
+        modelPath = args.modelPath
+        paramPath = args.paramPath
+        outputDir = args.outputDir
+
+        optimizeModel(inputDir, modelPath, paramPath, outputDir)
+
+    except Exception:
+        print("\033[31mA fatal error occurred. Failed to optimize model.\033[0m")
+        print(traceback.format_exc())
\ No newline at end of file
diff --git a/tools/toWebModel.py b/tools/toWebModel.py
deleted file mode 100644
index 2cfb10cceedbd3e2a270aec199bb7f8509baccb5..0000000000000000000000000000000000000000
--- a/tools/toWebModel.py
+++ /dev/null
@@ -1,313 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-
-import json
-import paddle.fluid as fluid
-import paddle
-import numpy as np
-import collections
-import math
-import sys as sys
-import os
-import struct
-
-#常量控制
-#抽样打印数据数量
-logDataCount = 50
-
-# 输入模型所在目录
-modelDir = "humanseg/"
-# 输入模型名
-modelName = "model"
-# 输入参数名,当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定,若为分片模型,请设置为None
-paramsName = None
-# 模型feed shape
-inputShape = (1, 3, 192, 192)
-# 输入数据
-inputData = np.full(inputShape, 1, "float32")
-# 输出模型目录
-outputDir = "../dist/model/humanseg/"
-# 权重分片扩展名
-extensionName = ".dat"
-# 输出各var数据
-outputVarData = False
-
-# 确认fluid的版本号
-print(paddle.__version__)
-
-# 采样输出list数据,采样的个数logDataCount为常量
-def stridePrint1(data):
-    dataCount = len(data)
-    stride = math.floor(dataCount / logDataCount)
-    if stride == 0:
-        stride = 1
-    nums = []
-    # outputCount = logDataCount
-    # if dataCount < logDataCount:
-    #     outputCount = dataCount
-    # for i in range(outputCount):
-    #     # nums.append(str(i) + ": " + str(data[i]))
-    #     nums.append(data[i])
-
-    for i in range(0, logDataCount):
-        item = data[i * stride]
-        nums.append(str(i * stride) + ": " + str(item))
-    print(nums)
-
-def stridePrint(tensor):
-    length = len(tensor)
-#    if length < 3000:
-#        print(tensor)
-#        return
-    size = 20
-    stride = math.floor(length / size)
-    if stride == 0:
-        stride = 1
-    size = math.floor(length / stride)
-    nums = []
-    for i in range(0, size):
-        item = tensor[i * stride]
-        nums.append(str(i * stride) + ": " + str(item))
-    print(nums)
-
-
-
-
-# 对字典进行排序,返回有序字典,默认升序
-def sortDict(oldDict, reverse = False):
-    # 获得排序后的key list
-    keys = sorted(oldDict.keys(), reverse = reverse)
-    orderDict = collections.OrderedDict()
-    # 遍历 key 列表
-    for key in keys:
-        orderDict[key] = oldDict[key]
-    return orderDict
-
-
-# 将权重数据分片输出到文件,默认分片策略为按4M分片
-def sliceDataToBinaryFile(weightValueList, sliceMethod = 0):
-    # TODO: 分片这里不太对,待修改
-    totalWeightCount = len(weightValueList)
-    countPerSlice = 0
-    # sliceCount = 0
-    if sliceMethod == 0:
-        # 分片策略 0:按4M分片
-        countPerSlice = int(4 * 1024 * 1024 / 4)
-        # sliceCount = math.ceil(totalWeightCount / countPerSlice)
-    else:
-        # 分片策略 1:按<=4M等分
-        # TODO: 待实现
-        countPerSlice = 0
-        # sliceCount = 0
-
-    if not os.path.exists(outputDir):
-        os.makedirs(outputDir)
-    currentChunkIndex = 0
-    currentWeightIndex = 0
-
-    while currentWeightIndex < totalWeightCount - 1:
-        remainCount = totalWeightCount - currentWeightIndex
-        if remainCount < countPerSlice:
-            countPerSlice = remainCount
-        chunkPath = outputDir + 'chunk_%s' % (currentChunkIndex + 1) + extensionName
-        file = open(chunkPath, 'wb')
-        for i in weightValueList[currentWeightIndex : currentWeightIndex + countPerSlice]:
-            byte = struct.pack('f', float(i))
-            file.write(byte)
-        file.close()
-        currentWeightIndex = currentWeightIndex + countPerSlice
-        currentChunkIndex = currentChunkIndex + 1
-        # for debug
-        print("第" + str(currentChunkIndex + 1) + "片权重输出完毕,输出个数:" + str(countPerSlice) + " 剩余个数:" + str(totalWeightCount - currentWeightIndex))
-
-    # for debug
print("========权重输出完毕,共" + str(currentWeightIndex) + "个数据," + str(currentChunkIndex) + "个分片文件" + "========") - -# 处理fluid的OP type与PaddleJS的OP type不对应情况 -def mapToPaddleJSTypeName(fluidOPName): - if fluidOPName == "batch_norm": - return "batchnorm" - return fluidOPName - - -# 将shape扩充为4维 -def padToFourDimShape(shape): - fourDimShape = [] - if len(shape) == 4: - fourDimShape = shape - elif len(shape) < 4: - for i in range(0, 4 - len(shape)): - fourDimShape.append(1) - fourDimShape = fourDimShape + shape - else: - return [] - return fourDimShape - - -# for debug,将NCHW排布的数据转为NHWC排布的数据 -def convertNCHW2NHWC(data, shape): - fourDimShape = padToFourDimShape(shape) - N = fourDimShape[0] - C = fourDimShape[1] - H = fourDimShape[2] - W = fourDimShape[3] - print(fourDimShape) - HXW = H * W - CXHXW = C * H * W - index = 0 - nhwcData = [] - for n in range(0, N): - for h in range(0, H): - for w in range(0, W): - for c in range(0, C): - nhwcData.append(data[n * CXHXW + c * HXW + h * W + w]) - index = index + 1 - return nhwcData - -# for debug 输出特定varName对应的数据 -def writeTempOutputData(name): - # FIXME:待完善 - return - dataList = np.array(fluid.global_scope().find_var(name).get_tensor()).flatten().tolist() - path = '/Users/bluebird/baidu/fluid_tools/check-temp/filter.txt' - if os.path.exists(path): - os.remove() - file = open(path,'a') - for a in range(0, len(dataList)): - file.write(str(dataList[a])) - file.write(",") - file.close() - -def convertToPaddleJSModel(): - # 1. 初始化fluid运行环境和配置 - exe = fluid.Executor(fluid.CPUPlace()) - [prog, feed_target_names, fetch_targets] = fluid.io.load_inference_model(dirname=modelDir, executor=exe, model_filename=modelName, params_filename=paramsName) - out = exe.run(prog, feed={feed_target_names[0]: inputData}, fetch_list=fetch_targets, return_numpy=False) - print(out) - - index = 0 - # 存放模型结构 - modelInfo = {"vars": [], "ops": []} - # 存放var信息(未排序) - varInfoDict = {} - # 存放权重数值(未排序) - weightValueDict = {} - - # 2. 获取program中所有的var,遍历并获取所有未排序的var信息和权重数值 - vars = list(prog.list_vars()) - for v in vars: - # 跳过feed和fetch - if "feed" == v.name: - continue - if "fetch" == v.name: - continue - - print("Var index:" + str(index) + " name:" + v.name) - print(v) - index += 1 - - varShape = list(v.shape) - - # FIXME:start paddlejs 不支持shape中为-1,这里需要手动过滤一下,支持了以后可以删除 - varShapeExcludeNegativeOne = [] - for s in varShape: - if s == -1: - continue - varShapeExcludeNegativeOne.append(s) - varShape = varShapeExcludeNegativeOne - # FIXME:end - - # 存放variable信息,在dump成json时排序 - varInfo = {} - varInfo["shape"] = varShape - # 数据是否是持久化数据,如weight为持久化数据,op的output不是持久化数据 - # 只输出持久化数据,paddlejs中也仅读取持久化数据 - varInfo["persistable"] = v.persistable - varInfoDict[v.name] = varInfo - - # for debug,输出var变量 - if outputVarData: - writeTempOutputData(v.name) - - # persistable数据存入weightDict,等待排序 - if v.persistable: - data = np.array(fluid.global_scope().find_var(v.name).get_tensor()).flatten().tolist() - weightValueDict[v.name] = data - - # 3. 对var信息dict,按照key(var名)进行字母顺序排序 - varInfoOrderDict = sortDict(varInfoDict) - - # 4. 将var信息按照顺序,添加到model info的vars中 - for key, value in varInfoOrderDict.items(): - value["name"] = key - modelInfo["vars"].append(value) - - # 5. 对权重数值dict,按照key(权重名)进行字母顺序排序,并组合到一起 - weightValueOrderDict = sortDict(weightValueDict) - weightValues = [] - for key, value in weightValueOrderDict.items(): - weightValues += value - - # 6. 分片输出权重 - sliceDataToBinaryFile(weightValues) - - # 7. 
-    # 7. 获取program中所有的op,按op顺序加入到model info
-    ops = prog.current_block().ops
-    feedOutputName = None
-    for op in ops:
-        opInfo = {}
-
-        # 获取OP type,需要映射到PaddleJS的名字
-        opInfo["type"] = mapToPaddleJSTypeName(op.type)
-
-        # 获取OP input
-        inputs = {}
-        for name in op.input_names:
-            value = op.input(name)
-            if len(value) <= 0:
-                continue
-            if value[0] == feedOutputName:
-                # FIXME:workaround,PaddleJSfeed 输入必须是image,且为单输入
-                # 这里修改feed后面的OP的input为image,建立前后关联
-                # 这里可能会有问题
-                inputs[name] = ["image"]
-            else:
-                inputs[name] = value
-        opInfo["inputs"] = inputs
-
-        # 获取OP output
-        outputs = {}
-        for name in op.output_names:
-            value = op.output(name)
-            if len(value) <= 0:
-                continue
-            if op.type == "feed":
-                # FIXME:workaround,PaddleJSfeed 输入必须是image,且为单输入
-                # 这里可能会有问题
-                feedOutputName = value[0]
-                outputs[name] = ["image"]
-            else:
-                outputs[name] = value
-        opInfo["outputs"] = outputs
-
-        # 获取OP attribute
-        attrs = {}
-        for name in op.attr_names:
-            # 过滤不需要的参数
-            if name in ["op_callstack", 'col', 'op_role', 'op_namescope', 'op_role_var']:
-                continue
-            value = op.attr(name)
-            attrs[name] = value
-        opInfo["attrs"] = attrs
-
-        # 存入modelInfo
-        modelInfo["ops"].append(opInfo)
-
-    # 8. 模型信息按照key字母顺序导出到json
-    outputModelPath = outputDir + "model.json"
-    with open(outputModelPath, 'w') as outputFile:
-        json.dump(modelInfo, outputFile, indent = 4, separators=(", ", ": "), sort_keys = True)
-
-    print("========模型结构输出完毕========")
-
-convertToPaddleJSModel()