CNN with term and postag input sequences: training works fine, but predict fails with: Check failed: index[i] < (int)tableSize
Created by: stonyhu
The CNN network configuration script is as follows:
# edit-mode: -*- python -*-
import numpy as np
from paddle.trainer_config_helpers import *
emb_file = "/home/disk/xiejian01/word2vec/data/embedding-vector.txt.128.freq2"
dict_file = "./data/dict.txt"
word_dict = dict()
pos_dict = dict()
word_idx = 0
pos_idx = 0
with open(dict_file, 'r') as f:
    for i, line in enumerate(f):
        parts = line.strip().split('\t\t')
        if parts[1].startswith("nerl_in_query"):
            pos_dict[parts[1]] = pos_idx
            pos_idx += 1
        if parts[1].startswith("nerl_wordseg"):
            word_dict[parts[1]] = word_idx
            word_idx += 1
word_dict["<unk>"] = len(word_dict)
pos_dict["<unk>"] = len(pos_dict)
word_dim = len(word_dict)
pos_dim = len(pos_dict)
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
    train_list=trn,
    test_list=tst,
    module="dataprovider_emb",
    obj=process,
    args={"dict_file": dict_file})
batch_size = 512 if not is_predict else 1
settings(
    batch_size=batch_size,
    learning_rate=2e-3,
    learning_method=AdamOptimizer(),
    regularization=L2Regularization(8e-4),
    gradient_clipping_threshold=25)
def load_parameter(filename=emb_file, height=word_dim, width=128):
    return np.loadtxt(open(filename), dtype=np.float32, delimiter=" ")
word = data_layer(name="word", size=word_dim)
emb_param = ParameterAttribute(
name="emb_param",
initial_std=0.,
is_static=False,
initializer=load_parameter)
word_emb = embedding_layer(input=word, size=128, param_attr=emb_param)
pos = data_layer(name="postag", size=pos_dim)
pos_emb = embedding_layer(input=pos, size=32)
embedding = concat_layer(input=[word_emb, pos_emb])
conv3 = sequence_conv_pool(input=embedding, context_len=3, hidden_size=128)
conv4 = sequence_conv_pool(input=embedding, context_len=4, hidden_size=128)
conv5 = sequence_conv_pool(input=embedding, context_len=5, hidden_size=128)
output = fc_layer(input=[conv3, conv4, conv5], size=2, act=SoftmaxActivation())
if is_predict:
    maxid = maxid_layer(output)
    outputs([maxid, output])
else:
    label = data_layer(name="label", size=2)
    cls = classification_cost(input=output, label=label)
    outputs(cls)
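For reference, load_parameter returns the matrix exactly as np.loadtxt reads it, so the pre-trained embedding file should have word_dim rows and 128 columns. A minimal standalone sanity check (reusing emb_file and word_dim as defined in the config above):

import numpy as np

# emb_file and word_dim are the same values computed in the config above
emb = np.loadtxt(open(emb_file), dtype=np.float32, delimiter=" ")
print("embedding matrix shape:", emb.shape, "expected:", (word_dim, 128))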
The script that reads the dict and the training samples (the data provider) is as follows:
#!/usr/bin/env python
# -*- coding: utf8 -*-
from paddle.trainer.PyDataProvider2 import *
def initializer(settings, dict_file, **kwargs):
    word_dict = {}
    pos_dict = {}
    word_idx = 0
    pos_idx = 0
    with open(dict_file, 'r') as f:
        for i, line in enumerate(f):
            parts = line.strip().split('\t\t')
            if len(parts) != 2:
                continue
            if parts[1].startswith("nerl_in_query"):
                word_dict[parts[1]] = word_idx
                word_idx += 1
            if parts[1].startswith("nerl_wordseg"):
                pos_dict[parts[1]] = pos_idx
                pos_idx += 1
    word_dict["<unk>"] = len(word_dict)
    pos_dict["<unk>"] = len(pos_dict)
    settings.word_dict = word_dict
    settings.pos_dict = pos_dict
    settings.input_types = [
        integer_value_sequence(len(word_dict)),
        integer_value_sequence(len(pos_dict)),
        integer_value(2)
    ]
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_name):
    UNK_POS = settings.pos_dict.get("<unk>")
    UNK_WORD = settings.word_dict.get("<unk>")
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            parts = line.split('\t\t')
            if len(parts) != 3:
                continue
            label = parts[0]
            feature_types = parts[2].split('\t')
            word_vector = []
            pos_vector = []
            for feature_type in feature_types:
                _index = feature_type.find(':')
                extractor_name = feature_type[:_index]
                feature_line = feature_type[_index+1:]
                if feature_line == '':
                    continue
                features = feature_line.split('\x01')
                for feature in features:
                    w = extractor_name + "::" + feature
                    if extractor_name == "nerl_in_query":
                        pos_vector.append(settings.pos_dict.get(w, UNK_POS))
                    if extractor_name == "nerl_wordseg":
                        word_vector.append(settings.word_dict.get(w, UNK_POS))
            if len(word_vector) == 0:
                continue
            yield word_vector, pos_vector, int(label)
def predict_initializer(settings, dict_file, **kwargs):
    word_dict = {}
    pos_dict = {}
    word_idx = 0
    pos_idx = 0
    with open(dict_file, 'r') as f:
        for i, line in enumerate(f):
            parts = line.strip().split('\t\t')
            if len(parts) != 2:
                continue
            if parts[1].startswith("nerl_in_query"):
                word_dict[parts[1]] = word_idx
                word_idx += 1
            if parts[1].startswith("nerl_wordseg"):
                pos_dict[parts[1]] = pos_idx
                pos_idx += 1
    word_dict["<unk>"] = len(word_dict)
    pos_dict["<unk>"] = len(pos_dict)
    settings.word_dict = word_dict
    settings.pos_dict = pos_dict
    settings.input_types = [
        integer_value_sequence(len(word_dict)),
        integer_value_sequence(len(pos_dict))
    ]
@provider(init_hook=predict_initializer, should_shuffle=False)
def process_predict(settings, file_name):
    UNK_POS = settings.pos_dict.get("<unk>")
    UNK_WORD = settings.word_dict.get("<unk>")
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            parts = line.split('\t\t')
            if len(parts) != 2:
                yield [UNK_WORD], [UNK_POS]
                continue
            query = parts[0]
            feature_types = parts[1].split('\t')
            word_vector = []
            pos_vector = []
            for feature_type in feature_types:
                _index = feature_type.find(':')
                extractor_name = feature_type[:_index]
                feature_line = feature_type[_index+1:]
                if feature_line == '':
                    continue
                features = feature_line.split('\x01')
                for feature in features:
                    w = extractor_name + "::" + feature
                    if extractor_name == "nerl_in_query":
                        pos_vector.append(settings.pos_dict.get(w, UNK_POS))
                    if extractor_name == "nerl_wordseg":
                        word_vector.append(settings.word_dict.get(w, UNK_WORD))
            if len(word_vector) == 0:
                pos_vector.append(UNK_POS)
                word_vector.append(UNK_WORD)
            yield word_vector, pos_vector
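The failed check index[i] < (int)tableSize fires when an id produced by the data provider is not smaller than the size of the corresponding data_layer / embedding table declared in the config. Note that the two scripts above filter dict.txt in opposite ways: the trainer config puts "nerl_in_query" entries into pos_dict and "nerl_wordseg" entries into word_dict, while the data provider does the reverse. A minimal standalone sketch (assuming the same dict.txt) that compares the sizes the two scripts end up with, to confirm or rule out this mismatch:

#!/usr/bin/env python
# Compare the dict sizes implied by the trainer config with those built by the
# data provider; if they differ, provider indices can exceed the layer sizes.
dict_file = "./data/dict.txt"

cfg_word, cfg_pos = {}, {}      # as built in the trainer config
prov_word, prov_pos = {}, {}    # as built in dataprovider_emb

with open(dict_file, 'r') as f:
    for line in f:
        parts = line.strip().split('\t\t')
        if len(parts) != 2:
            continue
        if parts[1].startswith("nerl_in_query"):
            cfg_pos[parts[1]] = len(cfg_pos)      # config: -> pos_dict
            prov_word[parts[1]] = len(prov_word)  # provider: -> word_dict
        if parts[1].startswith("nerl_wordseg"):
            cfg_word[parts[1]] = len(cfg_word)    # config: -> word_dict
            prov_pos[parts[1]] = len(prov_pos)    # provider: -> pos_dict

# both scripts append one "<unk>" entry afterwards, so add 1 to every size
print("config   word_dim / pos_dim :", len(cfg_word) + 1, len(cfg_pos) + 1)
print("provider word / pos sizes   :", len(prov_word) + 1, len(prov_pos) + 1)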