diff --git a/label_semantic_roles/data/extract_dict_feature.py b/label_semantic_roles/data/extract_dict_feature.py deleted file mode 100644 index da44111976a0dec68345fc139d0aa459ca9211c2..0000000000000000000000000000000000000000 --- a/label_semantic_roles/data/extract_dict_feature.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -from optparse import OptionParser - - -def extract_dict_features(pair_file, feature_file): - - with open(pair_file) as fin, open(feature_file, 'w') as feature_out: - for line in fin: - sentence, predicate, labels = line.strip().split('\t') - sentence_list = sentence.split() - labels_list = labels.split() - - verb_index = labels_list.index('B-V') - - mark = [0] * len(labels_list) - if verb_index > 0: - mark[verb_index - 1] = 1 - ctx_n1 = sentence_list[verb_index - 1] - else: - ctx_n1 = 'bos' - - if verb_index > 1: - mark[verb_index - 2] = 1 - ctx_n2 = sentence_list[verb_index - 2] - else: - ctx_n2 = 'bos' - - mark[verb_index] = 1 - ctx_0 = sentence_list[verb_index] - - if verb_index < len(labels_list) - 1: - mark[verb_index + 1] = 1 - ctx_p1 = sentence_list[verb_index + 1] - else: - ctx_p1 = 'eos' - - if verb_index < len(labels_list) - 2: - mark[verb_index + 2] = 1 - ctx_p2 = sentence_list[verb_index + 2] - else: - ctx_p2 = 'eos' - - - feature_str = sentence + '\t' \ - + predicate + '\t' \ - + ctx_n2 + '\t' \ - + ctx_n1 + '\t' \ - + ctx_0 + '\t' \ - + ctx_p1 + '\t' \ - + ctx_p2 + '\t' \ - + ' '.join([str(i) for i in mark]) + '\t' \ - + labels - - feature_out.write(feature_str + '\n') - - -if __name__ == '__main__': - - usage = '-p pair_file -f feature_file' - parser = OptionParser(usage) - parser.add_option('-p', dest='pair_file', help='the pair file') - parser.add_option('-f', dest='feature_file', help='the feature file') - - (options, args) = parser.parse_args() - - extract_dict_features(options.pair_file, options.feature_file) diff --git a/label_semantic_roles/data/extract_pairs.py b/label_semantic_roles/data/extract_pairs.py deleted file mode 100644 index d0290c44cf089990d2cf30137834132cb72ab5ea..0000000000000000000000000000000000000000 --- a/label_semantic_roles/data/extract_pairs.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -from optparse import OptionParser - - -def read_labels(props_file): - ''' - a sentence maybe has more than one verb, each verb has its label sequence - label[], is a 3-dimension list. - the first dim is to store all sentence's label seqs, len is the sentence number - the second dim is to store all label sequences for one sentences - the third dim is to store each label for one word - ''' - labels = [] - with open(props_file) as fin: - label_seqs_for_one_sentences = [] - one_seg_in_file = [] - for line in fin: - line = line.strip() - if line == '': - for i in xrange(len(one_seg_in_file[0])): - a_kind_lable = [x[i] for x in one_seg_in_file] - label_seqs_for_one_sentences.append(a_kind_lable) - labels.append(label_seqs_for_one_sentences) - one_seg_in_file = [] - label_seqs_for_one_sentences = [] - else: - part = line.split() - one_seg_in_file.append(part) - return labels - - -def read_sentences(words_file): - sentences = [] - with open(words_file) as fin: - s = '' - for line in fin: - line = line.strip() - if line == '': - sentences.append(s) - s = '' - else: - s += line + ' ' - return sentences - - -def transform_labels(sentences, labels): - sen_lab_pair = [] - for i in xrange(len(sentences)): - if len(labels[i]) == 1: - continue - else: - verb_list = [] - for x in labels[i][0]: - if x != '-': - verb_list.append(x) - - for j in xrange(1, len(labels[i])): - label_list = labels[i][j] - current_tag = 'O' - is_in_bracket = False - label_seq = [] - verb_word = '' - for ll in label_list: - if ll == '*' and is_in_bracket == False: - label_seq.append('O') - elif ll == '*' and is_in_bracket == True: - label_seq.append('I-' + current_tag) - elif ll == '*)': - label_seq.append('I-' + current_tag) - is_in_bracket = False - elif ll.find('(') != -1 and ll.find(')') != -1: - current_tag = ll[1:ll.find('*')] - label_seq.append('B-' + current_tag) - is_in_bracket = False - elif ll.find('(') != -1 and ll.find(')') == -1: - current_tag = ll[1:ll.find('*')] - label_seq.append('B-' + current_tag) - is_in_bracket = True - else: - print 'error:', ll - sen_lab_pair.append((sentences[i], verb_list[j - 1], label_seq)) - return sen_lab_pair - - -def write_file(sen_lab_pair, output_file): - with open(output_file, 'w') as fout: - for x in sen_lab_pair: - sentence = x[0] - label_seq = ' '.join(x[2]) - assert len(sentence.split()) == len(x[2]) - fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n') - - -if __name__ == '__main__': - - usage = '-w words_file -p props_file -o output_file' - parser = OptionParser(usage) - parser.add_option('-w', dest='words_file', help='the words file') - parser.add_option('-p', dest='props_file', help='the props file') - parser.add_option('-o', dest='output_file', help='the output_file') - (options, args) = parser.parse_args() - - sentences = read_sentences(options.words_file) - labels = read_labels(options.props_file) - sen_lab_pair = transform_labels(sentences, labels) - - write_file(sen_lab_pair, options.output_file) diff --git a/label_semantic_roles/data/get_data.sh b/label_semantic_roles/data/get_data.sh deleted file mode 100644 index dafa218988e480ed826c69328b5df7cb50be437d..0000000000000000000000000000000000000000 --- a/label_semantic_roles/data/get_data.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb -tar -xzvf conll05st-tests.tar.gz - -mv verbDict.txt predicate_dict -mv targetDict.txt label_dict -mv wordDict.txt word_dict -rm conll05st-tests.tar.gz - -cp ./conll05st-release/test.wsj/words/test.wsj.words.gz . -cp ./conll05st-release/test.wsj/props/test.wsj.props.gz . -gunzip test.wsj.words.gz -gunzip test.wsj.props.gz - -python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair -python extract_dict_feature.py -p test.wsj.seq_pair -f feature - -echo `pwd`/feature > train.list -echo `pwd`/feature > test.list diff --git a/label_semantic_roles/dataprovider.py b/label_semantic_roles/dataprovider.py deleted file mode 100644 index ca2dcdff61681ec673b7ca97cc8ca3bb628cbb6e..0000000000000000000000000000000000000000 --- a/label_semantic_roles/dataprovider.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * - -UNK_IDX = 0 - - -def hook(settings, word_dict, label_dict, predicate_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - settings.predicate_dict = predicate_dict - - #all inputs are integral and sequential type - settings.input_types = { - 'word_data': integer_value_sequence(len(word_dict)), - 'ctx_n2_data': integer_value_sequence(len(word_dict)), - 'ctx_n1_data': integer_value_sequence(len(word_dict)), - 'ctx_0_data': integer_value_sequence(len(word_dict)), - 'ctx_p1_data': integer_value_sequence(len(word_dict)), - 'ctx_p2_data': integer_value_sequence(len(word_dict)), - 'verb_data': integer_value_sequence(len(predicate_dict)), - 'mark_data': integer_value_sequence(2), - 'target': integer_value_sequence(len(label_dict)) - } - - -def get_batch_size(yield_data): - return len(yield_data[0]) - - -@provider( - init_hook=hook, - should_shuffle=True, - calc_batch_size=get_batch_size, - can_over_batch_size=True, - cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield { - 'word_data': word_slot, - 'ctx_n2_data': ctx_n2_slot, - 'ctx_n1_data': ctx_n1_slot, - 'ctx_0_data': ctx_0_slot, - 'ctx_p1_data': ctx_p1_slot, - 'ctx_p2_data': ctx_p2_slot, - 'verb_data': predicate_slot, - 'mark_data': mark_slot, - 'target': label_slot - } diff --git a/label_semantic_roles/db_lstm.py b/label_semantic_roles/db_lstm.py deleted file mode 100755 index ea8c2d117ae904286b201ecc5d55cdfa78198754..0000000000000000000000000000000000000000 --- a/label_semantic_roles/db_lstm.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os -import sys -from paddle.trainer_config_helpers import * - -is_test = get_config_arg('is_test', bool, False) -is_predict = get_config_arg('is_predict', bool, False) - -#file paths -word_dict_file = './data/word_dict' -label_dict_file = './data/label_dict' -predicate_file = './data/predicate_dict' -train_list_file = './data/train.list' if not (is_test or is_predict) else None -test_list_file = './data/test.list' - - -def load_dict(dict_file_path): - data_dict = {} - with open(dict_file_path, "r") as fdict: - for idx, line in enumerate(fdict): - data_dict[line.strip()] = idx - return data_dict - - -if not is_predict: - #load dictionaries - word_dict = load_dict(word_dict_file) - label_dict = load_dict(label_dict_file) - predicate_dict = load_dict(predicate_file) - - #define data provider - define_py_data_sources2( - train_list=train_list_file, - test_list=test_list_file, - module='dataprovider', - obj='process', - args={ - 'word_dict': word_dict, - 'label_dict': label_dict, - 'predicate_dict': predicate_dict - }) - -word_dict_len = get_config_arg('dict_len', - int) if is_predict else len(word_dict) -label_dict_len = get_config_arg('label_len', - int) if is_predict else len(label_dict) -pred_len = get_config_arg('pred_len', - int) if is_predict else len(predicate_dict) - -############################## Hyper-parameters ################################## -mark_dict_len = 2 -word_dim = 32 -mark_dim = 5 -hidden_dim = 512 -depth = 8 - -########################### Optimizer ####################################### - -settings( - batch_size=1, - learning_method=MomentumOptimizer(momentum=0), - learning_rate=2e-2, - regularization=L2Regularization(8e-4), - model_average=ModelAverage(average_window=0.5, max_average_window=10000), ) - -####################################### network ############################## -#8 features and 1 target -word = data_layer(name='word_data', size=word_dict_len) -predicate = data_layer(name='verb_data', size=pred_len) - -ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len) -ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len) -ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len) -ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len) -ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len) -mark = data_layer(name='mark_data', size=mark_dict_len) - -if not is_predict: - target = data_layer(name='target', size=label_dict_len) - -default_std = 1 / math.sqrt(hidden_dim) / 3.0 - -emb_para = ParameterAttribute(name='emb', initial_std=0., is_static=True) -std_0 = ParameterAttribute(initial_std=0.) -std_default = ParameterAttribute(initial_std=default_std) - -predicate_embedding = embedding_layer( - size=word_dim, - input=predicate, - param_attr=ParameterAttribute(name='vemb', initial_std=default_std)) - -word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] -emb_layers = [ - embedding_layer(size=word_dim, input=x, param_attr=emb_para) - for x in word_input -] -emb_layers.append(predicate_embedding) -mark_embedding = embedding_layer( - name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0) -emb_layers.append(mark_embedding) - -hidden_0 = mixed_layer( - name='hidden0', - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection(input=emb, param_attr=std_default) - for emb in emb_layers - ]) - -mix_hidden_lr = 1e-3 -lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0) -hidden_para_attr = ParameterAttribute( - initial_std=default_std, learning_rate=mix_hidden_lr) - -lstm_0 = lstmemory( - name='lstm0', - input=hidden_0, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - bias_attr=std_0, - param_attr=lstm_para_attr) - -#stack L-LSTM and R-LSTM with direct edges -input_tmp = [hidden_0, lstm_0] - -for i in range(1, depth): - - mix_hidden = mixed_layer( - name='hidden' + str(i), - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = lstmemory( - name='lstm' + str(i), - input=mix_hidden, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - -feature_out = mixed_layer( - name='output', - size=label_dict_len, - bias_attr=std_default, - input=[ - full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - -if not is_predict: - crf_l = crf_layer( - name='crf', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=ParameterAttribute( - name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) - - crf_dec_l = crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=ParameterAttribute(name='crfw')) - - eval = sum_evaluator(input=crf_dec_l) - - outputs(crf_l) - -else: - crf_dec_l = crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - param_attr=ParameterAttribute(name='crfw')) - - outputs(crf_dec_l) diff --git a/label_semantic_roles/predict.py b/label_semantic_roles/predict.py deleted file mode 100644 index 372fd090b6e8f08f5bb34697772c2e4976810595..0000000000000000000000000000000000000000 --- a/label_semantic_roles/predict.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np -from optparse import OptionParser -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import integer_value_sequence -from paddle.trainer.config_parser import parse_config -""" -Usage: run following command to show help message. - python predict.py -h -""" -UNK_IDX = 0 - - -class Prediction(): - def __init__(self, train_conf, dict_file, model_dir, label_file, - predicate_dict_file): - """ - train_conf: trainer configure. - dict_file: word dictionary file name. - model_dir: directory of model. - """ - - self.dict = {} - self.labels = {} - self.predicate_dict = {} - self.labels_reverse = {} - self.load_dict_label(dict_file, label_file, predicate_dict_file) - - len_dict = len(self.dict) - len_label = len(self.labels) - len_pred = len(self.predicate_dict) - - conf = parse_config( - train_conf, 'dict_len=' + str(len_dict) + ',label_len=' + - str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True') - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - self.network.loadParameters(model_dir) - - slots = [ - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_pred), integer_value_sequence(2) - ] - self.converter = DataProviderConverter(slots) - - def load_dict_label(self, dict_file, label_file, predicate_dict_file): - """ - Load dictionary from self.dict_file. - """ - for line_count, line in enumerate(open(dict_file, 'r')): - self.dict[line.strip()] = line_count - - for line_count, line in enumerate(open(label_file, 'r')): - self.labels[line.strip()] = line_count - self.labels_reverse[line_count] = line.strip() - - for line_count, line in enumerate(open(predicate_dict_file, 'r')): - self.predicate_dict[line.strip()] = line_count - - def get_data(self, data_file): - """ - Get input data of paddle format. - """ - with open(data_file, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip( - ).split('\t') - words = sentence.split() - sen_len = len(words) - - word_slot = [self.dict.get(w, UNK_IDX) for w in words] - predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX) - ] * sen_len - ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - yield word_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot - - def predict(self, data_file, output_file): - """ - data_file: file name of input data. - """ - input = self.converter(self.get_data(data_file)) - output = self.network.forwardTest(input) - lab = output[0]["id"].tolist() - - with open(data_file, 'r') as fin, open(output_file, 'w') as fout: - index = 0 - for line in fin: - sen = line.split('\t')[0] - len_sen = len(sen.split()) - line_labels = lab[index:index + len_sen] - index += len_sen - fout.write(sen + '\t' + ' '.join( - [self.labels_reverse[i] for i in line_labels]) + '\n') - - -def option_parser(): - usage = ( - "python predict.py -c config -w model_dir " - "-d word dictionary -l label_file -i input_file -p pred_dict_file") - parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option( - "-c", - "--tconf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-d", - "--dict", - action="store", - dest="dict_file", - help="dictionary file") - parser.add_option( - "-l", - "--label", - action="store", - dest="label_file", - default=None, - help="label file") - parser.add_option( - "-p", - "--predict_dict_file", - action="store", - dest="predict_dict_file", - default=None, - help="predict_dict_file") - parser.add_option( - "-i", - "--data", - action="store", - dest="data_file", - help="data file to predict") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - - parser.add_option( - "-o", - "--output_file", - action="store", - dest="output_file", - default=None, - help="output file") - return parser.parse_args() - - -def main(): - options, args = option_parser() - train_conf = options.train_conf - data_file = options.data_file - dict_file = options.dict_file - model_path = options.model_path - label_file = options.label_file - predict_dict_file = options.predict_dict_file - output_file = options.output_file - - swig_paddle.initPaddle("--use_gpu=0") - predict = Prediction(train_conf, dict_file, model_path, label_file, - predict_dict_file) - predict.predict(data_file, output_file) - - -if __name__ == '__main__': - main() diff --git a/label_semantic_roles/predict.sh b/label_semantic_roles/predict.sh deleted file mode 100755 index 1fced67cf2c79b03a1f1a2e0d690e7bd8f6d32fc..0000000000000000000000000000000000000000 --- a/label_semantic_roles/predict.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -best_model_path="output/pass-${LOG[1]}" - -config_file=db_lstm.py -dict_file=./data/wordDict.txt -label_file=./data/targetDict.txt -predicate_dict_file=./data/verbDict.txt -input_file=./data/feature -output_file=predict.res - -python predict.py \ - -c $config_file \ - -w $best_model_path \ - -l $label_file \ - -p $predicate_dict_file \ - -d $dict_file \ - -i $input_file \ - -o $output_file diff --git a/label_semantic_roles/test.sh b/label_semantic_roles/test.sh deleted file mode 100755 index 11d9d6a19c1b17ad1b7540ee7a03017f85dd821e..0000000000000000000000000000000000000000 --- a/label_semantic_roles/test.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list - -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --config_args=is_test=1 \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'test.log' diff --git a/label_semantic_roles/train.sh b/label_semantic_roles/train.sh deleted file mode 100755 index b0efaf42270cbd807bf40def7a02e90b54c9cf92..0000000000000000000000000000000000000000 --- a/label_semantic_roles/train.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -paddle train \ - --config=./db_lstm.py \ - --use_gpu=0 \ - --log_period=10 \ - --dot_period=5000 \ - --trainer_count=1 \ - --show_parameter_stats_period=500 \ - --save_dir=./output \ - --num_passes=150 \ - --init_model_path=./data \ - --load_missing_parameter_strategy=rand \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log'