Merge pull request #24 from schinger/sentiment

Sentiment

Merge pull request #24 from schinger/sentiment
Sentiment
da5aac99 · Zrachel · GitHub · 4672708e · c164305d · da5aac99
14 changed files
--- a/understand_sentiment/.gitignore
+++ b/understand_sentiment/.gitignore
+data/aclImdb
+data/imdb
+data/pre-imdb
+data/mosesdecoder-master
+*.log
+model_output
+dataprovider_copy_1.py
+model.list
+*.pyc
+.DS_Store
--- a/understand_sentiment/README.md
+++ b/understand_sentiment/README.md
--- a/understand_sentiment/data/get_imdb.sh
+++ b/understand_sentiment/data/get_imdb.sh
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+set -x
+# Resolve the directory that contains this script so it works from any cwd.
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+# Quoted on purpose: an unquoted $DIR breaks when the path contains spaces.
+cd "$DIR"
+# Download the dataset.
+echo "Downloading aclImdb..."
+#http://ai.stanford.edu/%7Eamaas/data/sentiment/
+wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+echo "Downloading mosesdecoder..."
+#https://github.com/moses-smt/mosesdecoder
+wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+# Extract the packages.
+echo "Unzipping..."
+tar -zxvf aclImdb_v1.tar.gz
+unzip master.zip
+# Copy the train and test sets into the imdb directory
+# so that they can be processed for training.
+mkdir -p imdb/train
+mkdir -p imdb/test
+cp -r aclImdb/train/pos/ imdb/train/pos
+cp -r aclImdb/train/neg/ imdb/train/neg
+cp -r aclImdb/test/pos/ imdb/test/pos
+cp -r aclImdb/test/neg/ imdb/test/neg
+# Remove the compressed packages.
+rm aclImdb_v1.tar.gz
+rm master.zip
+echo "Done."
--- a/understand_sentiment/dataprovider.py
+++ b/understand_sentiment/dataprovider.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer.PyDataProvider2 import *
+def hook(settings, dictionary, **kwargs):
+    """
+    Init hook of the data provider.
+    Stores the word dictionary on settings and declares the two input
+    slots: 'word' (an integer sequence bounded by the dictionary size)
+    and 'label' (an integer with 2 possible values).
+    """
+    settings.word_dict = dictionary
+    vocab_size = len(settings.word_dict)
+    word_slot = integer_value_sequence(vocab_size)
+    label_slot = integer_value(2)
+    settings.input_types = {'word': word_slot, 'label': label_slot}
+    settings.logger.info('dict len : %d' % vocab_size)
+@provider(init_hook=hook)
+def process(settings, file_name):
+    """
+    Yield one sample per line of file_name.
+    Each line has the form "<label>\t\t<comment>". Words of the comment
+    that are missing from settings.word_dict are dropped.
+    settings: provider settings prepared by hook().
+    file_name: path of one preprocessed batch file.
+    yield: {'word': list of word indices, 'label': int label}.
+    """
+    word_dict = settings.word_dict
+    with open(file_name, 'r') as fdata:
+        for line in fdata:
+            # partition() splits on the first '\t\t' only, so a comment that
+            # itself contains '\t\t' (or is empty) no longer breaks the
+            # two-value unpacking that split('\t\t') required.
+            label, _, comment = line.strip().partition('\t\t')
+            label = int(label)
+            word_slot = [word_dict[w] for w in comment.split() if w in word_dict]
+            yield {'word': word_slot, 'label': label}
--- a/understand_sentiment/image/lstm.png
+++ b/understand_sentiment/image/lstm.png
--- a/understand_sentiment/image/rnn.png
+++ b/understand_sentiment/image/rnn.png
--- a/understand_sentiment/image/stacked_lstm.jpg
+++ b/understand_sentiment/image/stacked_lstm.jpg
--- a/understand_sentiment/image/text_cnn.png
+++ b/understand_sentiment/image/text_cnn.png
--- a/understand_sentiment/predict.py
+++ b/understand_sentiment/predict.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os, sys
+import numpy as np
+from optparse import OptionParser
+from py_paddle import swig_paddle, DataProviderConverter
+from paddle.trainer.PyDataProvider2 import integer_value_sequence
+from paddle.trainer.config_parser import parse_config
+"""
+Usage: run following command to show help message.
+  python predict.py -h
+"""
+class SentimentPrediction():
+    """
+    Loads a trained network plus its word dictionary and prints the
+    predicted label for batches of raw text.
+    """
+    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
+        """
+        train_conf: trainer configure.
+        dict_file: word dictionary file name.
+        model_dir: directory of model; defaults to the directory of train_conf.
+        label_file: optional label file; when given, predictions are printed
+                    as label names instead of label indices.
+        """
+        self.train_conf = train_conf
+        self.dict_file = dict_file
+        self.word_dict = {}
+        self.dict_dim = self.load_dict()
+        self.model_dir = model_dir
+        if model_dir is None:
+            self.model_dir = os.path.dirname(train_conf)
+        self.label = None
+        if label_file is not None:
+            self.load_label(label_file)
+        conf = parse_config(train_conf, "is_predict=1")
+        self.network = swig_paddle.GradientMachine.createFromConfigProto(
+            conf.model_config)
+        self.network.loadParameters(self.model_dir)
+        input_types = [integer_value_sequence(self.dict_dim)]
+        self.converter = DataProviderConverter(input_types)
+    def load_dict(self):
+        """
+        Load dictionary from self.dict_file.
+        The first tab-separated field of each line is the word; its index is
+        the zero-based line number.
+        return: number of words loaded.
+        """
+        # 'with' closes the file even if a line cannot be processed.
+        with open(self.dict_file, 'r') as f:
+            for line_count, line in enumerate(f):
+                self.word_dict[line.strip().split('\t')[0]] = line_count
+        return len(self.word_dict)
+    def load_label(self, label_file):
+        """
+        Load label.
+        Each line is "<name>\t<index>"; fills self.label as {index: name}.
+        """
+        self.label = {}
+        with open(label_file, 'r') as f:
+            for v in f:
+                fields = v.split('\t')
+                self.label[int(fields[1])] = fields[0]
+    def get_index(self, data):
+        """
+        transform word into integer index according to the dictionary.
+        Words that are not in the dictionary are skipped.
+        """
+        words = data.strip().split()
+        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
+        return word_slot
+    def batch_predict(self, data_batch):
+        """
+        Run the network on data_batch and print the predicted label of
+        every sample.
+        data_batch: list of samples, each sample being [word index list].
+        """
+        in_args = self.converter(data_batch)
+        output = self.network.forwardTest(in_args)
+        prob = output[0]["value"]
+        # Sorting the negated probabilities puts the most probable class
+        # first (presumably prob is (batch, classes) -- TODO confirm).
+        labs = np.argsort(-prob)
+        for lab in labs:
+            if self.label is None:
+                print("predicting label is %d" % (lab[0]))
+            else:
+                print("predicting label is %s" % (self.label[lab[0]]))
+def option_parser():
+    """
+    Parse the command line options of predict.py.
+    return: (options, args) as produced by OptionParser.parse_args().
+    """
+    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
+    parser = OptionParser(usage="usage: %s [options]" % usage)
+    parser.add_option(
+        "-n",
+        "--tconf",
+        action="store",
+        dest="train_conf",
+        help="network config")
+    parser.add_option(
+        "-d",
+        "--dict",
+        action="store",
+        dest="dict_file",
+        help="dictionary file")
+    # The help text used to be a copy of the --dict one.
+    parser.add_option(
+        "-b",
+        "--label",
+        action="store",
+        dest="label",
+        default=None,
+        help="label file")
+    parser.add_option(
+        "-c",
+        "--batch_size",
+        type="int",
+        action="store",
+        dest="batch_size",
+        default=1,
+        help="the batch size for prediction")
+    parser.add_option(
+        "-w",
+        "--model",
+        action="store",
+        dest="model_path",
+        default=None,
+        help="model path")
+    return parser.parse_args()
+def main():
+    """
+    Read sentences from stdin, one per line, and print the predicted
+    label for each of them, batch_size sentences at a time.
+    """
+    options, args = option_parser()
+    batch_size = options.batch_size
+    swig_paddle.initPaddle("--use_gpu=0")
+    predict = SentimentPrediction(options.train_conf, options.dict_file,
+                                  options.model_path, options.label)
+    pending = []
+    for line in sys.stdin:
+        pending.append([predict.get_index(line)])
+        if len(pending) == batch_size:
+            predict.batch_predict(pending)
+            pending = []
+    # Flush the last, possibly smaller, batch.
+    if pending:
+        predict.batch_predict(pending)
+if __name__ == '__main__':
+    main()
--- a/understand_sentiment/predict.sh
+++ b/understand_sentiment/predict.sh
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+# Note: the default model is pass-00002; make sure the model path
+# exists or change the model path.
+model=model_output/pass-00002/
+config=trainer_config.py
+label=data/pre-imdb/labels.list
+# Feed the review through a redirection instead of `cat |`: with a pipe a
+# missing input file would go unnoticed under `set -e` and the model would
+# simply run on empty input.
+python predict.py \
+     --tconf="$config" \
+     --model="$model" \
+     --label="$label" \
+     --dict=./data/pre-imdb/dict.txt \
+     --batch_size=1 \
+     < ./data/aclImdb/test/pos/10007_10.txt
--- a/understand_sentiment/preprocess.py
+++ b/understand_sentiment/preprocess.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import random
+import operator
+import numpy as np
+from subprocess import Popen, PIPE
+from os.path import join as join_path
+from optparse import OptionParser
+from paddle.utils.preprocess_util import *
+"""
+Usage: run following command to show help message.
+  python preprocess.py -h 
+"""
+def save_dict(dict, filename, is_reverse=True):
+    """
+    Save dictionary into file, one "key<TAB>value" pair per line.
+    dict:   input dictionary.
+    filename: output file name, string.
+    is_reverse: True, descending order by value.
+                False, ascending order by value.
+    """
+    # Sort before opening so a failure here does not leave a truncated file.
+    ordered = sorted(
+        dict.items(), key=operator.itemgetter(1), reverse=is_reverse)
+    # 'with' guarantees the file is closed even if a write fails.
+    with open(filename, 'w') as f:
+        for k, v in ordered:
+            f.write('%s\t%s\n' % (k, v))
+def tokenize(sentences):
+    """
+    Tokenize the input sentences with tokenizer.perl, a tool of Moses.
+    The sentences are sent to the script on stdin, one per line.
+    sentences : a list of input sentences.
+    return: a list of processed text.
+    """
+    script = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    assert isinstance(sentences, list)
+    joined = "\n".join(sentences)
+    proc = Popen([script, '-l', 'en', '-q', '-'], stdin=PIPE, stdout=PIPE)
+    tok_text, _ = proc.communicate(joined)
+    # The piece after the final newline is dropped.
+    return tok_text.split('\n')[:-1]
+def read_lines(path):
+    """
+    Read a text file and keep its non-empty lines.
+    path: String, file path.
+    return a list of sequence, each stripped of surrounding whitespace.
+    """
+    with open(path, 'r') as f:
+        stripped = [raw.strip() for raw in f]
+    return [text for text in stripped if text]
+class SentimentDataSetCreate():
+    """
+    A class to process data for sentiment analysis task.
+    It collects the labelled files below data_path, optionally tokenizes
+    them, writes them out as batch files and counts the words seen.
+    """
+    def __init__(self,
+                 data_path,
+                 output_path,
+                 use_okenizer=True,
+                 multi_lines=False):
+        """
+        data_path: string, training and testing dataset path
+        output_path: string, output path, store processed dataset
+        use_okenizer: whether to tokenize the text with tokenize(); the
+                      misspelled name is kept so existing callers still work.
+        multi_lines: whether a file has multi lines.
+                     In order to shuffle fully, it needs to read all files into
+                     memory, then shuffle them if one file has multi lines.
+        """
+        self.output_path = output_path
+        self.data_path = data_path
+        self.classes_num = 0
+        self.batch_size = 50000
+        self.batch_dir = 'batches'
+        self.dict_with_test = False
+        self.dict_size = 0
+        self.word_count = {}
+        self.tokenizer = use_okenizer
+        self.overwrite = False
+        self.multi_lines = multi_lines
+        # Input directories live below data_path, outputs below output_path.
+        self.train_dir = join_path(data_path, 'train')
+        self.test_dir = join_path(data_path, 'test')
+        self.train_list = join_path(output_path, "train.list")
+        self.test_list = join_path(output_path, "test.list")
+        self.label_list = join_path(output_path, "labels.list")
+        self.dict_file = join_path(output_path, "dict.txt")
+    def data_list(self, path):
+        """
+        create dataset from path
+        path: data path
+        return: (data list, label set); every data item is
+                {"label": label index, "seq_path": file path}.
+        """
+        label_set = get_label_set_from_dir(path)
+        data = []
+        for lab_name in label_set:
+            for p in list_files(join_path(path, lab_name)):
+                data.append({"label": label_set[lab_name], "seq_path": p})
+        return data, label_set
+    def create_dict(self, data):
+        """
+        Update self.word_count with the words of the input data.
+        Words are lower-cased and split on whitespace.
+        data: list, [sequence, sequence, ...]
+        """
+        for seq in data:
+            for w in seq.strip().lower().split():
+                self.word_count[w] = self.word_count.get(w, 0) + 1
+    def create_dataset(self):
+        """
+        create file batches and dictionary of train data set.
+        If the self.overwrite is false and train.list already exists in
+        self.output_path, this function will not create and save file
+        batches from the data set path.
+        Nothing is returned; the dictionary size and the class number are
+        stored in self.dict_size and self.classes_num.
+        """
+        out_path = self.output_path
+        if out_path and not os.path.exists(out_path):
+            os.makedirs(out_path)
+        # If self.overwrite is false and self.train_list already exists,
+        # the dataset is not processed again.
+        if not self.overwrite and os.path.exists(self.train_list):
+            print("%s already exists." % self.train_list)
+            return
+        # Preprocess train data.
+        train_data, train_lab_set = self.data_list(self.train_dir)
+        print("processing train set...")
+        file_lists = self.save_data(train_data, "train", self.batch_size, True,
+                                    True)
+        save_list(file_lists, self.train_list)
+        # If have test data path, preprocess test data.
+        if os.path.exists(self.test_dir):
+            test_data, test_lab_set = self.data_list(self.test_dir)
+            assert (train_lab_set == test_lab_set)
+            print("processing test set...")
+            file_lists = self.save_data(test_data, "test", self.batch_size,
+                                        False, self.dict_with_test)
+            save_list(file_lists, self.test_list)
+        # save labels set.
+        save_dict(train_lab_set, self.label_list, False)
+        self.classes_num = len(train_lab_set)
+        # save dictionary.
+        save_dict(self.word_count, self.dict_file, True)
+        self.dict_size = len(self.word_count)
+    def save_data(self,
+                  data,
+                  prefix="",
+                  batch_size=50000,
+                  is_shuffle=False,
+                  build_dict=False):
+        """
+        Create batch files for the data.
+        data: list of {"seq_path", "label"} items as built by data_list().
+        prefix: the prefix of each batch.
+        batch_size: number of data in each batch.
+        is_shuffle: whether to shuffle the data before batching.
+        build_dict: whether to build dictionary for data
+        return: list of batch names
+        """
+        if is_shuffle and self.multi_lines:
+            return self.save_data_multi_lines(data, prefix, batch_size,
+                                              build_dict)
+        if is_shuffle:
+            random.shuffle(data)
+        # Integer ceiling division; avoids relying on `math` being brought
+        # into scope by the star import.
+        num_batches = (len(data) + batch_size - 1) // batch_size
+        batch_names = []
+        for batch_idx in range(num_batches):
+            batch_name = join_path(self.output_path,
+                                   "%s_part_%03d" % (prefix, batch_idx))
+            begin = batch_idx * batch_size
+            end = min(begin + batch_size, len(data))
+            # read a batch of data
+            label_list, data_list = self.get_data_list(begin, end, data)
+            if build_dict:
+                self.create_dict(data_list)
+            self.save_file(label_list, data_list, batch_name)
+            batch_names.append(batch_name)
+        return batch_names
+    def get_data_list(self, begin, end, data):
+        """
+        begin: int, beginning index of data.
+        end: int, ending index of data (exclusive).
+        data: a list of {"seq_path": sequence path, "label": label index}
+        return a list of label and a list of sequence.
+        """
+        label_list = []
+        data_list = []
+        for item in data[begin:end]:
+            lab = int(item["label"])
+            # File may have multiple lines; each line is one sample.
+            for seq in read_lines(item["seq_path"]):
+                data_list.append(seq)
+                label_list.append(lab)
+        if self.tokenizer:
+            data_list = tokenize(data_list)
+        return label_list, data_list
+    def save_data_multi_lines(self,
+                              data,
+                              prefix="",
+                              batch_size=50000,
+                              build_dict=False):
+        """
+        In order to shuffle fully, there is no need to load all data if
+        each file only contains one sample, it only needs to shuffle list
+        of file name. But one file contains multi lines, each line is one
+        sample. It needs to read all data into memory to shuffle fully.
+        This interface is mainly for data containing multi lines in each
+        file, which consumes more memory if there is a great amount of data.
+        data: list of {"seq_path", "label"} items as built by data_list().
+        prefix: the prefix of each batch.
+        batch_size: number of data in each batch.
+        build_dict: whether to build dictionary for data
+        return: list of batch names
+        """
+        assert self.multi_lines
+        # read all data
+        label_list, data_list = self.get_data_list(0, len(data), data)
+        if build_dict:
+            self.create_dict(data_list)
+        length = len(label_list)
+        # Shuffle sample indices rather than the samples themselves so that
+        # labels and sequences stay aligned.
+        perm_list = list(range(length))
+        random.shuffle(perm_list)
+        num_batches = (length + batch_size - 1) // batch_size
+        batch_names = []
+        for batch_idx in range(num_batches):
+            batch_name = join_path(self.output_path,
+                                   "%s_part_%03d" % (prefix, batch_idx))
+            begin = batch_idx * batch_size
+            end = min(begin + batch_size, length)
+            # A separate name is used for the inner index: in Python 2 a list
+            # comprehension variable leaks and would overwrite the loop index.
+            picked = perm_list[begin:end]
+            sub_label = [label_list[k] for k in picked]
+            sub_data = [data_list[k] for k in picked]
+            self.save_file(sub_label, sub_data, batch_name)
+            batch_names.append(batch_name)
+        return batch_names
+    def save_file(self, label_list, data_list, filename):
+        """
+        Save data into file, one "label<TAB><TAB>sequence" pair per line.
+        label_list: a list of int value.
+        data_list: a list of sequence.
+        filename: output file name.
+        """
+        # 'with' closes the file even if a write fails.
+        with open(filename, 'w') as f:
+            print("saving file: %s" % filename)
+            for lab, seq in zip(label_list, data_list):
+                f.write('%s\t\t%s\n' % (lab, seq))
+def _to_bool(value):
+    """Interpret a command line value such as 'True' or 'false' as a bool."""
+    if isinstance(value, bool):
+        return value
+    return str(value).strip().lower() in ('1', 'true', 't', 'yes', 'y')
+def option_parser():
+    """
+    Parse the command line options of preprocess.py.
+    return: (options, args); options.use_tokenizer and options.multi_lines
+            are real booleans.
+    """
+    parser = OptionParser(usage="usage: python preprocess.py "\
+                                "-i data_dir [options]")
+    parser.add_option(
+        "-i",
+        "--data",
+        action="store",
+        dest="input",
+        help="Input data directory.")
+    parser.add_option(
+        "-o",
+        "--output",
+        action="store",
+        dest="output",
+        default=None,
+        help="Output directory.")
+    parser.add_option(
+        "-t",
+        "--tokenizer",
+        action="store",
+        dest="use_tokenizer",
+        default=True,
+        help="Whether to use tokenizer.")
+    parser.add_option("-m", "--multi_lines", action="store",
+                      dest="multi_lines", default=False,
+                      help="If input text files have multi lines and they "\
+                           "need to be shuffled, you should set -m True,")
+    options, args = parser.parse_args()
+    # action="store" keeps the raw string, so "-t False" used to be truthy.
+    # Convert both flags explicitly.
+    options.use_tokenizer = _to_bool(options.use_tokenizer)
+    options.multi_lines = _to_bool(options.multi_lines)
+    return options, args
+def main():
+    """
+    Entry point: preprocess the data set given with -i and write the
+    result to the -o directory (default: a sibling directory named
+    'pre-<input name>').
+    """
+    options, args = option_parser()
+    data_dir = options.input
+    if data_dir is None:
+        raise SystemExit("Input data directory is required: use -i data_dir.")
+    output_dir = options.output
+    if output_dir is None:
+        # Drop trailing separators first: for "data/imdb/" basename() is ''
+        # and dirname() is "data/imdb", which produced "data/imdb/pre-".
+        trimmed = data_dir.rstrip(os.sep) or data_dir
+        outname = os.path.basename(trimmed)
+        output_dir = join_path(os.path.dirname(trimmed), 'pre-' + outname)
+    data_creator = SentimentDataSetCreate(data_dir, output_dir,
+                                          options.use_tokenizer,
+                                          options.multi_lines)
+    data_creator.create_dataset()
+if __name__ == '__main__':
+    main()
--- a/understand_sentiment/test.sh
+++ b/understand_sentiment/test.sh
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+# Print "<error> <pass-id>" for the test result with the smallest
+# classification error found in the training log given as $1.
+function get_best_pass() {
+  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
+  sed  -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
+  sort -n | head -n 1
+}
+log=train.log
+LOG=`get_best_pass $log`
+# Split "<error> <pass-id>" into an array; element 1 is the pass id.
+LOG=(${LOG})
+# Stop early instead of testing a non-existent "model_output/pass-" directory.
+if [ -z "${LOG[1]}" ]; then
+  echo "No test result found in $log" >&2
+  exit 1
+fi
+evaluate_pass="model_output/pass-${LOG[1]}"
+echo 'evaluating from pass '$evaluate_pass
+model_list=./model.list
+# The redirection creates (or truncates) the file on its own; piping `touch`
+# into `echo` added nothing.
+echo "$evaluate_pass" > "$model_list"
+net_conf=trainer_config.py
+paddle train --config=$net_conf \
+             --model_list=$model_list \
+             --job=test \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --config_args=is_test=1 \
+             2>&1 | tee 'test.log'
--- a/understand_sentiment/train.sh
+++ b/understand_sentiment/train.sh
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+# Train with trainer_config.py on CPU using 4 trainers for 10 passes.
+# Models are saved under ./model_output; stdout and stderr are shown on the
+# terminal and also written to train.log.
+paddle train --config=trainer_config.py \
+             --save_dir=./model_output \
+             --job=train \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --num_passes=10 \
+             --log_period=10 \
+             --dot_period=20 \
+             --show_parameter_stats_period=100 \
+             --test_all_data_in_one_period=1 \
+             2>&1 | tee 'train.log'
--- a/understand_sentiment/trainer_config.py
+++ b/understand_sentiment/trainer_config.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from os.path import join as join_path
+from paddle.trainer_config_helpers import *
+# whether this config is used for test
+is_test = get_config_arg('is_test', bool, False)
+# whether this config is used for prediction
+is_predict = get_config_arg('is_predict', bool, False)
+data_dir = "./data/pre-imdb"
+train_list = "train.list"
+test_list = "test.list"
+dict_file = "dict.txt"
+# The line counts of the dictionary and label files give the vocabulary
+# size and the number of classes. 'with' closes the files right away.
+with open(join_path(data_dir, "dict.txt")) as f:
+    dict_dim = len(f.readlines())
+with open(join_path(data_dir, 'labels.list')) as f:
+    class_dim = len(f.readlines())
+if not is_predict:
+    train_list = join_path(data_dir, train_list)
+    test_list = join_path(data_dir, test_list)
+    dict_file = join_path(data_dir, dict_file)
+    # No training data is needed when the config is used for testing.
+    train_list = train_list if not is_test else None
+    # Map every word (first tab-separated field) to its line number.
+    word_dict = dict()
+    with open(dict_file, 'r') as f:
+        # Iterate the handle opened above; the file used to be opened a
+        # second time here and that handle was never closed.
+        for i, line in enumerate(f):
+            word_dict[line.split('\t')[0]] = i
+    define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={'dictionary': word_dict})
+################## Algorithm Config #####################
+# Optimisation settings: Adam with learning rate 2e-3, mini-batches of 128
+# samples, L2 regularization of 8e-4 and a gradient clipping threshold of 25.
+settings(
+    batch_size=128,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    average_window=0.5,
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25)
+#################### Network Config ######################
+def convolution_net(input_dim,
+                    class_dim=2,
+                    emb_dim=128,
+                    hid_dim=128,
+                    is_predict=False):
+    """
+    Convolution network for sentiment classification.
+    The word embeddings feed two sequence_conv_pool blocks with context
+    lengths 3 and 4; their outputs go into one softmax layer.
+    input_dim: size of the "word" data layer (the config passes dict_dim).
+    class_dim: number of categories.
+    emb_dim: dimension of word embedding.
+    hid_dim: hidden size of each sequence_conv_pool block.
+    is_predict: when True the softmax output is the network output,
+                otherwise the classification cost against "label" is.
+    """
+    data = data_layer("word", input_dim)
+    emb = embedding_layer(input=data, size=emb_dim)
+    conv_3 = sequence_conv_pool(input=emb, context_len=3, hidden_size=hid_dim)
+    conv_4 = sequence_conv_pool(input=emb, context_len=4, hidden_size=hid_dim)
+    output = fc_layer(
+        input=[conv_3, conv_4], size=class_dim, act=SoftmaxActivation())
+    if not is_predict:
+        lbl = data_layer("label", 1)
+        outputs(classification_cost(input=output, label=lbl))
+    else:
+        outputs(output)
+def stacked_lstm_net(input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3,
+                     is_predict=False):
+    """
+    A Wrapper for sentiment classification task.
+    This network uses a bi-directional recurrent network,
+    consisting of three LSTM layers by default. This configuration refers to
+    the paper at the following url, but uses fewer layers.
+        http://www.aclweb.org/anthology/P15-1109
+    input_dim: here is word dictionary dimension.
+    class_dim: number of categories.
+    emb_dim: dimension of word embedding.
+    hid_dim: dimension of hidden layer.
+    stacked_num: number of stacked lstm-hidden layer; must be odd.
+    is_predict: is predicting or not.
+                Some layers are not needed in network when predicting.
+    """
+    assert stacked_num % 2 == 1
+    layer_attr = ExtraLayerAttribute(drop_rate=0.5)
+    fc_para_attr = ParameterAttribute(learning_rate=1e-3)
+    lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=1.)
+    # One attribute per input of the stacked fc layers: [fc input, lstm input].
+    para_attr = [fc_para_attr, lstm_para_attr]
+    bias_attr = ParameterAttribute(initial_std=0., l2_rate=0.)
+    relu = ReluActivation()
+    linear = LinearActivation()
+    data = data_layer("word", input_dim)
+    emb = embedding_layer(input=data, size=emb_dim)
+    fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
+    lstm1 = lstmemory(
+        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+    inputs = [fc1, lstm1]
+    # Every further level takes the previous fc and lstm outputs as input.
+    for i in range(2, stacked_num + 1):
+        fc = fc_layer(
+            input=inputs,
+            size=hid_dim,
+            act=linear,
+            param_attr=para_attr,
+            bias_attr=bias_attr)
+        # Even-numbered levels run in reverse, so the direction alternates.
+        lstm = lstmemory(
+            input=fc,
+            reverse=(i % 2) == 0,
+            act=relu,
+            bias_attr=bias_attr,
+            layer_attr=layer_attr)
+        inputs = [fc, lstm]
+    # Max-pool the last fc and lstm outputs before the softmax layer.
+    fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
+    lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
+    output = fc_layer(
+        input=[fc_last, lstm_last],
+        size=class_dim,
+        act=SoftmaxActivation(),
+        bias_attr=bias_attr,
+        param_attr=para_attr)
+    if is_predict:
+        outputs(output)
+    else:
+        outputs(classification_cost(input=output, label=data_layer('label', 1)))
+# Build the network used by this config: the stacked LSTM with 3 levels.
+# The convolution network call is kept below, commented out.
+stacked_lstm_net(
+    dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict)
+# convolution_net(dict_dim, class_dim=class_dim, is_predict=is_predict)