diff --git a/understand_sentiment/data/get_imdb.sh b/understand_sentiment/data/get_imdb.sh
deleted file mode 100755
index 4542fc8313d2665cb5056177813225add54e1395..0000000000000000000000000000000000000000
--- a/understand_sentiment/data/get_imdb.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-set -x
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd "$DIR"
-
-# download the dataset
-echo "Downloading aclImdb..."
-# http://ai.stanford.edu/%7Eamaas/data/sentiment/
-wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
-
-echo "Downloading mosesdecoder..."
-# https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
-
-# extract packages
-echo "Unzipping..."
-tar -zxvf aclImdb_v1.tar.gz
-unzip master.zip
-
-# move the train and test sets into the imdb directory
-# so they can be processed during training
-mkdir -p imdb/train
-mkdir -p imdb/test
-
-cp -r aclImdb/train/pos/ imdb/train/pos
-cp -r aclImdb/train/neg/ imdb/train/neg
-
-cp -r aclImdb/test/pos/ imdb/test/pos
-cp -r aclImdb/test/neg/ imdb/test/neg
-
-# remove compressed packages
-rm aclImdb_v1.tar.gz
-rm master.zip
-
-echo "Done."
diff --git a/understand_sentiment/dataprovider.py b/understand_sentiment/dataprovider.py
deleted file mode 100755
index 976351ab7015fe136d270e97b0c767ac7fc63112..0000000000000000000000000000000000000000
--- a/understand_sentiment/dataprovider.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
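-
-# Illustrative note (inferred from process() below and save_file() in
-# preprocess.py): each line of a preprocessed data file holds one sample
-# in the form "<label>\t\t<tokenized text>", e.g. "0\t\tan awful movie".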
-
-
-def hook(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = {
-        'word': integer_value_sequence(len(settings.word_dict)),
-        'label': integer_value(2)
-    }
-    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
-
-
-@provider(init_hook=hook)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line_count, line in enumerate(fdata):
-            label, comment = line.strip().split('\t\t')
-            label = int(label)
-            words = comment.split()
-            word_slot = [
-                settings.word_dict[w] for w in words if w in settings.word_dict
-            ]
-            yield {'word': word_slot, 'label': label}
diff --git a/understand_sentiment/predict.py b/understand_sentiment/predict.py
deleted file mode 100755
index 8ec490f64691924013200a3d0038d39aa834b038..0000000000000000000000000000000000000000
--- a/understand_sentiment/predict.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys
-import numpy as np
-from optparse import OptionParser
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import integer_value_sequence
-from paddle.trainer.config_parser import parse_config
-"""
-Usage: run the following command to show the help message.
-    python predict.py -h
-"""
-
-
-class SentimentPrediction():
-    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
-        """
-        train_conf: trainer configuration file.
-        dict_file: word dictionary file name.
-        model_dir: directory of the model.
-        label_file: optional label list file.
-        """
-        self.train_conf = train_conf
-        self.dict_file = dict_file
-        self.word_dict = {}
-        self.dict_dim = self.load_dict()
-        self.model_dir = model_dir
-        if model_dir is None:
-            self.model_dir = os.path.dirname(train_conf)
-
-        self.label = None
-        if label_file is not None:
-            self.load_label(label_file)
-
-        conf = parse_config(train_conf, "is_predict=1")
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        self.network.loadParameters(self.model_dir)
-        input_types = [integer_value_sequence(self.dict_dim)]
-        self.converter = DataProviderConverter(input_types)
-
-    def load_dict(self):
-        """
-        Load the dictionary from self.dict_file.
-        """
-        for line_count, line in enumerate(open(self.dict_file, 'r')):
-            self.word_dict[line.strip().split('\t')[0]] = line_count
-        return len(self.word_dict)
-
-    def load_label(self, label_file):
-        """
-        Load the label list.
-        """
-        self.label = {}
-        for v in open(label_file, 'r'):
-            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
-
-    def get_index(self, data):
-        """
-        Transform words into integer indices according to the dictionary.
-        """
-        words = data.strip().split()
-        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
-        return word_slot
-
-    def batch_predict(self, data_batch):
-        input = self.converter(data_batch)
-        output = self.network.forwardTest(input)
-        prob = output[0]["value"]
-        labs = np.argsort(-prob)
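-        # Illustrative shapes: prob has one row per sample and one column
-        # per class, so a two-class batch of one might give
-        # prob = [[0.3, 0.7]]; np.argsort(-prob) sorts each row in
-        # descending probability, yielding [[1, 0]], and lab[0] below is
-        # the most likely label.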
- """ - words = data.strip().split() - word_slot = [self.word_dict[w] for w in words if w in self.word_dict] - return word_slot - - def batch_predict(self, data_batch): - input = self.converter(data_batch) - output = self.network.forwardTest(input) - prob = output[0]["value"] - labs = np.argsort(-prob) - for idx, lab in enumerate(labs): - if self.label is None: - print("predicting label is %d" % (lab[0])) - else: - print("predicting label is %s" % (self.label[lab[0]])) - - -def option_parser(): - usage = "python predict.py -n config -w model_dir -d dictionary -i input_file " - parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option( - "-n", - "--tconf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-d", - "--dict", - action="store", - dest="dict_file", - help="dictionary file") - parser.add_option( - "-b", - "--label", - action="store", - dest="label", - default=None, - help="dictionary file") - parser.add_option( - "-c", - "--batch_size", - type="int", - action="store", - dest="batch_size", - default=1, - help="the batch size for prediction") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - return parser.parse_args() - - -def main(): - options, args = option_parser() - train_conf = options.train_conf - batch_size = options.batch_size - dict_file = options.dict_file - model_path = options.model_path - label = options.label - swig_paddle.initPaddle("--use_gpu=0") - predict = SentimentPrediction(train_conf, dict_file, model_path, label) - - batch = [] - for line in sys.stdin: - batch.append([predict.get_index(line)]) - if len(batch) == batch_size: - predict.batch_predict(batch) - batch = [] - if len(batch) > 0: - predict.batch_predict(batch) - - -if __name__ == '__main__': - main() diff --git a/understand_sentiment/predict.sh b/understand_sentiment/predict.sh deleted file mode 100755 index 20adee8a465ad2b78066dccd9efac2743f583350..0000000000000000000000000000000000000000 --- a/understand_sentiment/predict.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -#Note the default model is pass-00002, you shold make sure the model path -#exists or change the mode path. -model=model_output/pass-00002/ -config=trainer_config.py -label=data/pre-imdb/labels.list -cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ - --tconf=$config \ - --model=$model \ - --label=$label \ - --dict=./data/pre-imdb/dict.txt \ - --batch_size=1 diff --git a/understand_sentiment/preprocess.py b/understand_sentiment/preprocess.py deleted file mode 100755 index cb4438ba1738456ef56a0a4656dcc0dac1b12384..0000000000000000000000000000000000000000 --- a/understand_sentiment/preprocess.py +++ /dev/null @@ -1,359 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. 
-    return toks
-
-
-def read_lines(path):
-    """
-    path: String, file path.
-    return: a list of sequences (non-empty lines).
-    """
-    seqs = []
-    with open(path, 'r') as f:
-        for line in f.readlines():
-            line = line.strip()
-            if len(line):
-                seqs.append(line)
-    return seqs
-
-
-class SentimentDataSetCreate():
-    """
-    A class to process data for the sentiment analysis task.
-    """
-
-    def __init__(self,
-                 data_path,
-                 output_path,
-                 use_tokenizer=True,
-                 multi_lines=False):
-        """
-        data_path: string, training and testing dataset path.
-        output_path: string, output path where the processed dataset
-                     is stored.
-        use_tokenizer: whether to run the Moses tokenizer on the input.
-        multi_lines: whether a file has multiple lines.
-                     In order to shuffle fully, all files need to be read
-                     into memory and then shuffled if one file has
-                     multiple lines.
-        """
-        self.output_path = output_path
-        self.data_path = data_path
-
-        self.train_dir = 'train'
-        self.test_dir = 'test'
-
-        self.train_list = "train.list"
-        self.test_list = "test.list"
-
-        self.label_list = "labels.list"
-        self.classes_num = 0
-
-        self.batch_size = 50000
-        self.batch_dir = 'batches'
-
-        self.dict_file = "dict.txt"
-        self.dict_with_test = False
-        self.dict_size = 0
-        self.word_count = {}
-
-        self.tokenizer = use_tokenizer
-        self.overwrite = False
-
-        self.multi_lines = multi_lines
-
-        self.train_dir = join_path(data_path, self.train_dir)
-        self.test_dir = join_path(data_path, self.test_dir)
-        self.train_list = join_path(output_path, self.train_list)
-        self.test_list = join_path(output_path, self.test_list)
-        self.label_list = join_path(output_path, self.label_list)
-        self.dict_file = join_path(output_path, self.dict_file)
-
-    def data_list(self, path):
-        """
-        Create a dataset from path.
-        path: data path.
-        return: a list of data items and the label set.
-        """
-        label_set = get_label_set_from_dir(path)
-        data = []
-        for lab_name in label_set.keys():
-            file_paths = list_files(join_path(path, lab_name))
-            for p in file_paths:
-                data.append({"label": label_set[lab_name],
-                             "seq_path": p})
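-        # Illustrative: for the IMDB layout <path>/{neg,pos}/*.txt,
-        # label_set maps directory names to class ids (e.g.
-        # {'neg': 0, 'pos': 1}) and each entry of data looks like
-        # {'label': 1, 'seq_path': '<path>/pos/10007_10.txt'}.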
- """ - self.output_path = output_path - self.data_path = data_path - - self.train_dir = 'train' - self.test_dir = 'test' - - self.train_list = "train.list" - self.test_list = "test.list" - - self.label_list = "labels.list" - self.classes_num = 0 - - self.batch_size = 50000 - self.batch_dir = 'batches' - - self.dict_file = "dict.txt" - self.dict_with_test = False - self.dict_size = 0 - self.word_count = {} - - self.tokenizer = use_okenizer - self.overwrite = False - - self.multi_lines = multi_lines - - self.train_dir = join_path(data_path, self.train_dir) - self.test_dir = join_path(data_path, self.test_dir) - self.train_list = join_path(output_path, self.train_list) - self.test_list = join_path(output_path, self.test_list) - self.label_list = join_path(output_path, self.label_list) - self.dict_file = join_path(output_path, self.dict_file) - - def data_list(self, path): - """ - create dataset from path - path: data path - return: data list - """ - label_set = get_label_set_from_dir(path) - data = [] - for lab_name in label_set.keys(): - file_paths = list_files(join_path(path, lab_name)) - for p in file_paths: - data.append({"label" : label_set[lab_name],\ - "seq_path": p}) - return data, label_set - - def create_dict(self, data): - """ - create dict for input data. - data: list, [sequence, sequnce, ...] - """ - for seq in data: - for w in seq.strip().lower().split(): - if w not in self.word_count: - self.word_count[w] = 1 - else: - self.word_count[w] += 1 - - def create_dataset(self): - """ - create file batches and dictionary of train data set. - If the self.overwrite is false and train.list already exists in - self.output_path, this function will not create and save file - batches from the data set path. - return: dictionary size, class number. - """ - out_path = self.output_path - if out_path and not os.path.exists(out_path): - os.makedirs(out_path) - - # If self.overwrite is false or self.train_list has existed, - # it will not process dataset. - if not (self.overwrite or not os.path.exists(self.train_list)): - print "%s already exists." % self.train_list - return - - # Preprocess train data. - train_data, train_lab_set = self.data_list(self.train_dir) - print "processing train set..." - file_lists = self.save_data(train_data, "train", self.batch_size, True, - True) - save_list(file_lists, self.train_list) - - # If have test data path, preprocess test data. - if os.path.exists(self.test_dir): - test_data, test_lab_set = self.data_list(self.test_dir) - assert (train_lab_set == test_lab_set) - print "processing test set..." - file_lists = self.save_data(test_data, "test", self.batch_size, - False, self.dict_with_test) - save_list(file_lists, self.test_list) - - # save labels set. - save_dict(train_lab_set, self.label_list, False) - self.classes_num = len(train_lab_set.keys()) - - # save dictionary. - save_dict(self.word_count, self.dict_file, True) - self.dict_size = len(self.word_count) - - def save_data(self, - data, - prefix="", - batch_size=50000, - is_shuffle=False, - build_dict=False): - """ - Create batches for a Dataset object. - data: the Dataset object to process. - prefix: the prefix of each batch. - batch_size: number of data in each batch. 
-
-    def save_data(self,
-                  data,
-                  prefix="",
-                  batch_size=50000,
-                  is_shuffle=False,
-                  build_dict=False):
-        """
-        Create batches for a Dataset object.
-        data: the Dataset object to process.
-        prefix: the prefix of each batch.
-        batch_size: number of samples in each batch.
-        is_shuffle: whether to shuffle the data.
-        build_dict: whether to build a dictionary for the data.
-
-        return: list of batch names
-        """
-        if is_shuffle and self.multi_lines:
-            return self.save_data_multi_lines(data, prefix, batch_size,
-                                              build_dict)
-
-        if is_shuffle:
-            random.shuffle(data)
-        num_batches = int(math.ceil(len(data) / float(batch_size)))
-        batch_names = []
-        for i in range(num_batches):
-            batch_name = join_path(self.output_path,
-                                   "%s_part_%03d" % (prefix, i))
-            begin = i * batch_size
-            end = min((i + 1) * batch_size, len(data))
-            # read a batch of data
-            label_list, data_list = self.get_data_list(begin, end, data)
-            if build_dict:
-                self.create_dict(data_list)
-            self.save_file(label_list, data_list, batch_name)
-            batch_names.append(batch_name)
-
-        return batch_names
-
-    def get_data_list(self, begin, end, data):
-        """
-        begin: int, beginning index of the data.
-        end: int, ending index of the data.
-        data: a list of {"seq_path": sequence path, "label": label index}
-
-        return: a list of labels and a list of sequences.
-        """
-        label_list = []
-        data_list = []
-        for j in range(begin, end):
-            seqs = read_lines(data[j]["seq_path"])
-            lab = int(data[j]["label"])
-            # A file may have multiple lines.
-            for seq in seqs:
-                data_list.append(seq)
-                label_list.append(lab)
-        if self.tokenizer:
-            data_list = tokenize(data_list)
-        return label_list, data_list
-
-    def save_data_multi_lines(self,
-                              data,
-                              prefix="",
-                              batch_size=50000,
-                              build_dict=False):
-        """
-        There is no need to load all data to shuffle fully if each file
-        contains only one sample; shuffling the list of file names is
-        enough. But if one file contains multiple lines, each line being
-        one sample, all data must be read into memory to shuffle fully.
-        This interface is mainly for data containing multiple lines per
-        file, which consumes more memory if there is a great amount of
-        data.
-
-        data: the Dataset object to process.
-        prefix: the prefix of each batch.
-        batch_size: number of samples in each batch.
-        build_dict: whether to build a dictionary for the data.
-
-        return: list of batch names
-        """
-        assert self.multi_lines
-        label_list = []
-        data_list = []
-
-        # read all data
-        label_list, data_list = self.get_data_list(0, len(data), data)
-        if build_dict:
-            self.create_dict(data_list)
-
-        length = len(label_list)
-        perm_list = np.array([i for i in xrange(length)])
-        random.shuffle(perm_list)
-
-        num_batches = int(math.ceil(length / float(batch_size)))
-        batch_names = []
-        for i in range(num_batches):
-            batch_name = join_path(self.output_path,
-                                   "%s_part_%03d" % (prefix, i))
-            begin = i * batch_size
-            end = min((i + 1) * batch_size, length)
-            sub_label = [label_list[perm_list[j]] for j in range(begin, end)]
-            sub_data = [data_list[perm_list[j]] for j in range(begin, end)]
-            self.save_file(sub_label, sub_data, batch_name)
-            batch_names.append(batch_name)
-
-        return batch_names
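-
-    # Note on save_data_multi_lines above (descriptive only): perm_list is
-    # a shuffled index array, so batch i receives the samples whose
-    # shuffled positions fall in [i * batch_size, (i + 1) * batch_size).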
- """ - f = open(filename, 'w') - print "saving file: %s" % filename - for lab, seq in zip(label_list, data_list): - f.write('%s\t\t%s\n' % (lab, seq)) - f.close() - - -def option_parser(): - parser = OptionParser(usage="usage: python preprcoess.py "\ - "-i data_dir [options]") - parser.add_option( - "-i", - "--data", - action="store", - dest="input", - help="Input data directory.") - parser.add_option( - "-o", - "--output", - action="store", - dest="output", - default=None, - help="Output directory.") - parser.add_option( - "-t", - "--tokenizer", - action="store", - dest="use_tokenizer", - default=True, - help="Whether to use tokenizer.") - parser.add_option("-m", "--multi_lines", action="store", - dest="multi_lines", default=False, - help="If input text files have multi lines and they "\ - "need to be shuffled, you should set -m True,") - return parser.parse_args() - - -def main(): - options, args = option_parser() - data_dir = options.input - output_dir = options.output - use_tokenizer = options.use_tokenizer - multi_lines = options.multi_lines - if output_dir is None: - outname = os.path.basename(options.input) - output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname) - data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer, - multi_lines) - data_creator.create_dataset() - - -if __name__ == '__main__': - main() diff --git a/understand_sentiment/test.sh b/understand_sentiment/test.sh deleted file mode 100755 index 8af827c3388c8df88a872bd87d121a4f9631c3ff..0000000000000000000000000000000000000000 --- a/understand_sentiment/test.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="model_output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass - -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list -net_conf=trainer_config.py -paddle train --config=$net_conf \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --trainer_count=4 \ - --config_args=is_test=1 \ - 2>&1 | tee 'test.log' diff --git a/understand_sentiment/train.sh b/understand_sentiment/train.sh deleted file mode 100755 index df8d464d557edbc2f538cb492bd29e8d32c77635..0000000000000000000000000000000000000000 --- a/understand_sentiment/train.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list
-echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-    --model_list=$model_list \
-    --job=test \
-    --use_gpu=false \
-    --trainer_count=4 \
-    --config_args=is_test=1 \
-    2>&1 | tee 'test.log'
diff --git a/understand_sentiment/train.sh b/understand_sentiment/train.sh
deleted file mode 100755
index df8d464d557edbc2f538cb492bd29e8d32c77635..0000000000000000000000000000000000000000
--- a/understand_sentiment/train.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-paddle train --config=trainer_config.py \
-    --save_dir=./model_output \
-    --job=train \
-    --use_gpu=false \
-    --trainer_count=4 \
-    --num_passes=10 \
-    --log_period=10 \
-    --dot_period=20 \
-    --show_parameter_stats_period=100 \
-    --test_all_data_in_one_period=1 \
-    2>&1 | tee 'train.log'
diff --git a/understand_sentiment/trainer_config.py b/understand_sentiment/trainer_config.py
deleted file mode 100644
index 9b9b98634bda18e4659c9aeaa8eeffcc52c13e1c..0000000000000000000000000000000000000000
--- a/understand_sentiment/trainer_config.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from os.path import join as join_path
-from paddle.trainer_config_helpers import *
-# whether this config is used for testing
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-
-data_dir = "./data/pre-imdb"
-train_list = "train.list"
-test_list = "test.list"
-dict_file = "dict.txt"
-
-dict_dim = len(open(join_path(data_dir, "dict.txt")).readlines())
-class_dim = len(open(join_path(data_dir, 'labels.list')).readlines())
-
-if not is_predict:
-    train_list = join_path(data_dir, train_list)
-    test_list = join_path(data_dir, test_list)
-    dict_file = join_path(data_dir, dict_file)
-    train_list = train_list if not is_test else None
-    word_dict = dict()
-    with open(dict_file, 'r') as f:
-        for i, line in enumerate(f):
-            word_dict[line.split('\t')[0]] = i
-
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={'dictionary': word_dict})
-
-################## Algorithm Config #####################
-
-settings(
-    batch_size=128,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    average_window=0.5,
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-#################### Network Config ######################
-
-
-def convolution_net(input_dim,
-                    class_dim=2,
-                    emb_dim=128,
-                    hid_dim=128,
-                    is_predict=False):
-    data = data_layer("word", input_dim)
-    emb = embedding_layer(input=data, size=emb_dim)
-    conv_3 = sequence_conv_pool(input=emb, context_len=3, hidden_size=hid_dim)
-    conv_4 = sequence_conv_pool(input=emb, context_len=4, hidden_size=hid_dim)
-    output = fc_layer(
-        input=[conv_3, conv_4], size=class_dim, act=SoftmaxActivation())
-
-    if not is_predict:
-        lbl = data_layer("label", 1)
-        outputs(classification_cost(input=output, label=lbl))
-    else:
-        outputs(output)
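-
-
-# Data flow of convolution_net above (descriptive sketch): word ids ->
-# embedding (emb_dim) -> two parallel sequence_conv_pool branches with
-# context windows of 3 and 4 words (hid_dim each) -> one softmax fc_layer
-# over class_dim categories.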
-
-
-def stacked_lstm_net(input_dim,
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3,
-                     is_predict=False):
-    """
-    A wrapper for the sentiment classification task.
-    This network uses a bi-directional recurrent network,
-    consisting of three LSTM layers. This configuration is based on
-    the paper at the following URL, but uses fewer layers.
-    http://www.aclweb.org/anthology/P15-1109
-
-    input_dim: the word dictionary dimension.
-    class_dim: number of categories.
-    emb_dim: dimension of the word embedding.
-    hid_dim: dimension of the hidden layer.
-    stacked_num: number of stacked lstm-hidden layers.
-    is_predict: whether this is a prediction run.
-                Some layers are not needed in the network when predicting.
-    """
-    assert stacked_num % 2 == 1
-
-    layer_attr = ExtraLayerAttribute(drop_rate=0.5)
-    fc_para_attr = ParameterAttribute(learning_rate=1e-3)
-    lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=1.)
-    para_attr = [fc_para_attr, lstm_para_attr]
-    bias_attr = ParameterAttribute(initial_std=0., l2_rate=0.)
-    relu = ReluActivation()
-    linear = LinearActivation()
-
-    data = data_layer("word", input_dim)
-    emb = embedding_layer(input=data, size=emb_dim)
-
-    fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
-    lstm1 = lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
-
-    inputs = [fc1, lstm1]
-    for i in range(2, stacked_num + 1):
-        fc = fc_layer(
-            input=inputs,
-            size=hid_dim,
-            act=linear,
-            param_attr=para_attr,
-            bias_attr=bias_attr)
-        lstm = lstmemory(
-            input=fc,
-            # Odd layers run forward, even layers run in reverse,
-            # which makes the stack bi-directional.
-            reverse=(i % 2) == 0,
-            act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
-        inputs = [fc, lstm]
-
-    fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
-    lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
-    output = fc_layer(
-        input=[fc_last, lstm_last],
-        size=class_dim,
-        act=SoftmaxActivation(),
-        bias_attr=bias_attr,
-        param_attr=para_attr)
-
-    if is_predict:
-        outputs(output)
-    else:
-        outputs(classification_cost(input=output, label=data_layer('label', 1)))
-
-
-stacked_lstm_net(
-    dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict)
-# convolution_net(dict_dim, class_dim=class_dim, is_predict=is_predict)