On an in-house MPI cluster, loading two different models back to back raises "ValueError: No such parameter ___fc_layer_6__.w0"
Created by: lzj0314139
import sys
import math
import os
import shutil
import pickle
import paddle.v2 as paddle
import gzip
import hashlib
input_feature_count = 2453
cluster_train_dir = "./train_data_dir/make_join_fea_train"
cluster_test_dir = "./test_data_dir/make_join_fea_test_title"
#cluster_test_dir = cluster_train_dir
node_id = int(os.getenv("OMPI_COMM_WORLD_RANK"))
mpi_nodes = int(os.getenv("OMPI_COMM_WORLD_SIZE"))
def cluster_data_reader(file_dir, node_id):
    def data_reader():
        files = os.listdir(file_dir)
        for fi in files:
            file_name = file_dir + '/' + fi
            # Shard the input files across MPI nodes by hashing the file name.
            m2 = hashlib.md5()
            m2.update(file_name)
            logid = int(m2.hexdigest()[:16], 16)
            if logid % mpi_nodes != node_id:
                continue
            with open(file_name, 'r') as f:
                print "node_id", node_id, ":", file_name
                for line in f:
                    items = line.strip().split('\t')[0].split(' ')
                    fea = []
                    label = []
                    sign = []
                    # items[0] is the sign, items[1] the label; the
                    # features start at items[3] in "index:value" form.
                    for item in items[3:]:
                        fea.append(float(item.split(':')[1]))
                    label.append(float(items[1]))
                    sign.append(items[0])
                    yield fea, label, sign
    return data_reader
def dnn_infer(file_name, epoch):
    """
    Predict instance labels with the DNN network.

    :param file_name: network parameter file
    :type file_name: str
    """
    print("Begin to predict... model: " + file_name + " mpi_nodes: " + str(mpi_nodes))
    x = paddle.layer.data(name='x' + str(epoch),
                          type=paddle.data_type.dense_vector(input_feature_count))
    x1 = paddle.layer.fc(input=x, size=1024, act=paddle.activation.Relu())
    x2 = paddle.layer.fc(input=x1, size=512, act=paddle.activation.Relu())
    x3 = paddle.layer.fc(input=x2, size=512, act=paddle.activation.Relu())
    x4 = paddle.layer.fc(input=x3, size=256, act=paddle.activation.Relu())
    x5 = paddle.layer.fc(input=x4, size=256, act=paddle.activation.Relu())
    output = paddle.layer.fc(input=[x, x1, x2, x3, x4, x5], size=1,
                             act=paddle.activation.Sigmoid())
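    # NOTE: paddle.v2 auto-names fc layers with a process-wide counter,
    # so the first call to dnn_infer creates ___fc_layer_0__ through
    # ___fc_layer_5__, while the second call creates ___fc_layer_6__
    # onwards. Those names no longer match the parameter names saved in
    # the model tarball, which is the likely source of the
    # "No such parameter ___fc_layer_6__.w0" error (see the sketch
    # after the listing).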
    # The snippet as posted never loads the model; presumably the
    # parameters come from the gzipped tarball named by file_name.
    with gzip.open(file_name, 'r') as f:
        parameters = paddle.parameters.Parameters.from_tar(f)
    infer_data = []
    infer_data_label = []
    sign = []
    print("Begin to predict...1")
    result = ""
    result_list = []
    batch_count = 100000
    count = 0
    for item in cluster_data_reader(cluster_test_dir, node_id)():
        count += 1
        infer_data.append([item[0]])
        infer_data_label.append(item[1])
        sign.append(item[2])
        #print("Begin to predict...2")
        if count % batch_count == 0:
            predictions = paddle.infer(output_layer=output,
                                       parameters=parameters,
                                       input=infer_data)
            for i, prob in enumerate(predictions):
                result_tmp = "%s\t%s\t%s" % (sign[i][0], infer_data_label[i][0], prob[0])
                result_list.append(result_tmp)
            infer_data = []
            infer_data_label = []
            sign = []
            print "process record count:", count
    if len(infer_data) != 0:
        predictions = paddle.infer(output_layer=output,
                                   parameters=parameters,
                                   input=infer_data)
        for i, prob in enumerate(predictions):
            result_tmp = "%s\t%s\t%s" % (sign[i][0], infer_data_label[i][0], prob[0])
            result_list.append(result_tmp)
    print "end to predict ... 2"
    result = "\n".join(result_list)
    dir_name = "output"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    file_object = open(dir_name + "/dnn_params_node_predict_" + str(node_id) + "_" + str(epoch), 'w+')
    file_object.write(result)
    file_object.close()
if __name__ == "__main__":
    paddle.init(use_gpu=False,
                trainer_count=int(os.getenv("PADDLE_TRAINER_COUNT", "1")),
                port=int(os.getenv("PADDLE_PORT", "7164")),
                ports_num=int(os.getenv("PADDLE_PORTS_NUM", "1")),
                num_gradient_servers=int(os.getenv("PADDLE_NUM_GRADIENT_SERVERS", "1")),
                trainer_id=int(os.getenv("PADDLE_TRAINER_ID", "0")),
                pservers=os.getenv("PADDLE_PSERVERS", "127.0.0.1"))
    num_pass = 12
    for epoch in range(10, num_pass):
        param_file_name = "init_model_path/dnn_params_pass" + str(epoch) + ".tar.gz"
        dnn_infer(file_name=param_file_name, epoch=epoch)
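
A minimal sketch of a possible workaround, assuming the layer-name-counter cause noted in the comment above: build the topology exactly once so the auto-generated names stay ___fc_layer_0__ through ___fc_layer_5__, and reload only the parameters for each pass. The helper name build_network is hypothetical; paddle.parameters.Parameters.from_tar is the standard paddle.v2 loader.

# A minimal sketch, not the original code: construct the network once and
# reuse it for every pass, so paddle.v2's layer-name counter never advances.
def build_network():
    x = paddle.layer.data(name='x',
                          type=paddle.data_type.dense_vector(input_feature_count))
    h = x
    inputs = [x]
    for size in [1024, 512, 512, 256, 256]:
        h = paddle.layer.fc(input=h, size=size, act=paddle.activation.Relu())
        inputs.append(h)
    return paddle.layer.fc(input=inputs, size=1, act=paddle.activation.Sigmoid())

output = build_network()  # built once: parameter names stay stable
for epoch in range(10, num_pass):
    with gzip.open("init_model_path/dnn_params_pass" + str(epoch) + ".tar.gz") as f:
        parameters = paddle.parameters.Parameters.from_tar(f)
    # ... run the paddle.infer loop with this `output` and `parameters` ...

Pinning explicit parameter names on each fc layer via param_attr should achieve the same effect if the network has to be rebuilt per pass.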