Commit 09083fe2 authored by mindspore-ci-bot, committed by Gitee

!10 The SentimentNet sample code needs to be updated

Merge pull request !10 from caojian05/ms_master
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
#################eval lstm example on aclImdb########################
python eval.py --ckpt_path=./lstm-20-390.ckpt
"""
import argparse
import os
import numpy as np
from src.config import lstm_cfg as cfg
from src.dataset import lstm_create_dataset, convert_to_mindrecord
from src.lstm import SentimentNet
from mindspore import Tensor, nn, Model, context
from mindspore.nn import Accuracy
from mindspore.train.callback import LossMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='MindSpore LSTM Example')
parser.add_argument('--preprocess', type=str, default='false', choices=['true', 'false'],
help='whether to preprocess data.')
parser.add_argument('--aclimdb_path', type=str, default="./aclImdb",
help='path where the dataset is stored.')
parser.add_argument('--glove_path', type=str, default="./glove",
help='path where the GloVe is stored.')
parser.add_argument('--preprocess_path', type=str, default="./preprocess",
help='path where the pre-process data is stored.')
parser.add_argument('--ckpt_path', type=str, default=None,
help='the checkpoint file path used to evaluate the model.')
parser.add_argument('--device_target', type=str, default="GPU", choices=['GPU', 'CPU'],
help='the target device to run on; supports "GPU" and "CPU". Default: "GPU".')
args = parser.parse_args()
context.set_context(
mode=context.GRAPH_MODE,
save_graphs=False,
device_target=args.device_target)
if args.preprocess == "true":
print("============== Starting Data Pre-processing ==============")
convert_to_mindrecord(cfg.embed_size, args.aclimdb_path, args.preprocess_path, args.glove_path)
embedding_table = np.loadtxt(os.path.join(args.preprocess_path, "weight.txt")).astype(np.float32)
network = SentimentNet(vocab_size=embedding_table.shape[0],
embed_size=cfg.embed_size,
num_hiddens=cfg.num_hiddens,
num_layers=cfg.num_layers,
bidirectional=cfg.bidirectional,
num_classes=cfg.num_classes,
weight=Tensor(embedding_table),
batch_size=cfg.batch_size)
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum)
loss_cb = LossMonitor()
model = Model(network, loss, opt, {'acc': Accuracy()})
print("============== Starting Testing ==============")
ds_eval = lstm_create_dataset(args.preprocess_path, cfg.batch_size, training=False)
param_dict = load_checkpoint(args.ckpt_path)
load_param_into_net(network, param_dict)
if args.device_target == "CPU":
acc = model.eval(ds_eval, dataset_sink_mode=False)
else:
acc = model.eval(ds_eval)
print("============== {} ==============".format(acc))
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
LSTM sample with both training and evaluation.
The sample can only be run on GPU.
"""
import os
import shutil
import math
import argparse
import json
from itertools import chain
import numpy as np
from config import lstm_cfg as cfg
import mindspore.nn as nn
import mindspore.context as context
import mindspore.dataset as ds
from mindspore.ops import operations as P
from mindspore import Tensor
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.mindrecord import FileWriter
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
# Install gensim with 'pip install gensim'
import gensim
def encode_samples(tokenized_samples, word_to_idx):
""" encode word to index """
features = []
for sample in tokenized_samples:
feature = []
for token in sample:
if token in word_to_idx:
feature.append(word_to_idx[token])
else:
feature.append(0)
features.append(feature)
return features
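# Illustrative example: encode_samples([['hello', 'new', 'world']], {'hello': 1, 'world': 2, '<unk>': 0})
# returns [[1, 0, 2]] -- tokens missing from word_to_idx map to index 0 ('<unk>').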
def pad_samples(features, maxlen=500, pad=0):
""" pad all features to the same length """
padded_features = []
for feature in features:
if len(feature) >= maxlen:
padded_feature = feature[:maxlen]
else:
padded_feature = feature
while len(padded_feature) < maxlen:
padded_feature.append(pad)
padded_features.append(padded_feature)
return padded_features
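# Illustrative example: pad_samples([[1, 2, 3]], maxlen=5) returns [[1, 2, 3, 0, 0]];
# features longer than maxlen are truncated to their first maxlen tokens.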
def read_imdb(path, seg='train'):
""" read imdb dataset """
pos_or_neg = ['pos', 'neg']
data = []
for label in pos_or_neg:
files = os.listdir(os.path.join(path, seg, label))
for file in files:
with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf:
review = rf.read().replace('\n', '')
if label == 'pos':
data.append([review, 1])
elif label == 'neg':
data.append([review, 0])
return data
def tokenizer(text):
return [tok.lower() for tok in text.split(' ')]
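# Illustrative example: tokenizer('This movie rocks') returns ['this', 'movie', 'rocks'].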
def collect_weight(glove_path, vocab, word_to_idx, embed_size):
""" collect weight """
vocab_size = len(vocab)
wvmodel = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(glove_path, 'glove.6B.300d.txt'),
binary=False, encoding='utf-8')
weight_np = np.zeros((vocab_size+1, embed_size)).astype(np.float32)
idx_to_word = {i+1: word for i, word in enumerate(vocab)}
idx_to_word[0] = '<unk>'
for i in range(len(wvmodel.index2word)):
try:
index = word_to_idx[wvmodel.index2word[i]]
except KeyError:
continue
weight_np[index, :] = wvmodel.get_vector(
idx_to_word[word_to_idx[wvmodel.index2word[i]]])
return weight_np
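# The returned matrix has shape (vocab_size + 1, embed_size); row 0 is reserved for '<unk>',
# and rows for words without a GloVe vector stay zero-initialized.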
def preprocess(aclimdb_path, glove_path, embed_size):
""" preprocess the train and test data """
train_data = read_imdb(aclimdb_path, 'train')
test_data = read_imdb(aclimdb_path, 'test')
train_tokenized = []
test_tokenized = []
for review, _ in train_data:
train_tokenized.append(tokenizer(review))
for review, _ in test_data:
test_tokenized.append(tokenizer(review))
vocab = set(chain(*train_tokenized))
vocab_size = len(vocab)
print("vocab_size: ", vocab_size)
word_to_idx = {word: i+1 for i, word in enumerate(vocab)}
word_to_idx['<unk>'] = 0
train_features = np.array(pad_samples(encode_samples(train_tokenized, word_to_idx))).astype(np.int32)
train_labels = np.array([score for _, score in train_data]).astype(np.int32)
test_features = np.array(pad_samples(encode_samples(test_tokenized, word_to_idx))).astype(np.int32)
test_labels = np.array([score for _, score in test_data]).astype(np.int32)
weight_np = collect_weight(glove_path, vocab, word_to_idx, embed_size)
return train_features, train_labels, test_features, test_labels, weight_np, vocab_size
def get_imdb_data(labels_data, features_data):
data_list = []
for i, (label, feature) in enumerate(zip(labels_data, features_data)):
data_json = {"id": i,
"label": int(label),
"feature": feature.reshape(-1)}
data_list.append(data_json)
return data_list
def convert_to_mindrecord(embed_size, aclimdb_path, proprocess_path, glove_path):
""" convert imdb dataset to mindrecord """
num_shard = 4
train_features, train_labels, test_features, test_labels, weight_np, _ = \
preprocess(aclimdb_path, glove_path, embed_size)
np.savetxt(os.path.join(proprocess_path, 'weight.txt'), weight_np)
# write mindrecord
schema_json = {"id": {"type": "int32"},
"label": {"type": "int32"},
"feature": {"type": "int32", "shape":[-1]}}
writer = FileWriter(os.path.join(proprocess_path, 'aclImdb_train.mindrecord'), num_shard)
data = get_imdb_data(train_labels, train_features)
writer.add_schema(schema_json, "nlp_schema")
writer.add_index(["id", "label"])
writer.write_raw_data(data)
writer.commit()
writer = FileWriter(os.path.join(proprocess_path, 'aclImdb_test.mindrecord'), num_shard)
data = get_imdb_data(test_labels, test_features)
writer.add_schema(schema_json, "nlp_schema")
writer.add_index(["id", "label"])
writer.write_raw_data(data)
writer.commit()
def init_lstm_weight(
input_size,
hidden_size,
num_layers,
bidirectional,
has_bias=True):
"""Initialize lstm weight."""
num_directions = 1
if bidirectional:
num_directions = 2
weight_size = 0
gate_size = 4 * hidden_size
for layer in range(num_layers):
for _ in range(num_directions):
input_layer_size = input_size if layer == 0 else hidden_size * num_directions
weight_size += gate_size * input_layer_size
weight_size += gate_size * hidden_size
if has_bias:
weight_size += 2 * gate_size
stdv = 1 / math.sqrt(hidden_size)
w_np = np.random.uniform(-stdv, stdv, (weight_size,
1, 1)).astype(np.float32)
w = Parameter(
initializer(
Tensor(w_np), [
weight_size, 1, 1]), name='weight')
return w
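# Worked size example (illustrative assumption: input_size=300, hidden_size=100, num_layers=1,
# bidirectional=True, has_bias=True): gate_size = 400, and each direction needs
# 400*300 + 400*100 + 2*400 = 160800 elements, so weight_size = 2 * 160800 = 321600
# and w has shape (321600, 1, 1).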
def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional):
"""init default input."""
num_directions = 1
if bidirectional:
num_directions = 2
h = Tensor(
np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32))
c = Tensor(
np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32))
return h, c
class SentimentNet(nn.Cell):
"""Sentiment network structure."""
def __init__(self,
vocab_size,
embed_size,
num_hiddens,
num_layers,
bidirectional,
num_classes,
weight,
batch_size):
super(SentimentNet, self).__init__()
self.embedding = nn.Embedding(vocab_size,
embed_size,
embedding_table=weight)
self.embedding.embedding_table.requires_grad = False
self.trans = P.Transpose()
self.perm = (1, 0, 2)
self.encoder = nn.LSTM(input_size=embed_size,
hidden_size=num_hiddens,
num_layers=num_layers,
has_bias=True,
bidirectional=bidirectional,
dropout=0.0)
w_init = init_lstm_weight(
embed_size,
num_hiddens,
num_layers,
bidirectional)
self.encoder.weight = w_init
self.h, self.c = lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional)
self.concat = P.Concat(1)
if bidirectional:
self.decoder = nn.Dense(num_hiddens * 4, num_classes)
else:
self.decoder = nn.Dense(num_hiddens * 2, num_classes)
def construct(self, inputs):
# (64,500,300)
embeddings = self.embedding(inputs)
embeddings = self.trans(embeddings, self.perm)
output, _ = self.encoder(embeddings, (self.h, self.c))
# states[i] size(64,200) -> encoding.size(64,400)
encoding = self.concat((output[0], output[1]))
outputs = self.decoder(encoding)
return outputs
def create_dataset(base_path, batch_size, num_epochs, is_train):
"""Create dataset for training."""
columns_list = ["feature", "label"]
num_consumer = 4
if is_train:
path = os.path.join(base_path, 'aclImdb_train.mindrecord0')
else:
path = os.path.join(base_path, 'aclImdb_test.mindrecord0')
dtrain = ds.MindDataset(path, columns_list, num_consumer)
dtrain = dtrain.shuffle(buffer_size=dtrain.get_dataset_size())
dtrain = dtrain.batch(batch_size, drop_remainder=True)
dtrain = dtrain.repeat(count=num_epochs)
return dtrain
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='MindSpore LSTM Example')
parser.add_argument('--preprocess', type=str, default='false', choices=['true', 'false'],
help='Whether to perform data preprocessing')
parser.add_argument('--mode', type=str, default="train", choices=['train', 'test'],
help='run phase, set to train or test')
# Download dataset from 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' and extract to 'aclimdb_path'
parser.add_argument('--aclimdb_path', type=str, default="./aclImdb",
help='path where the dataset is stored')
# Download glove from 'http://nlp.stanford.edu/data/glove.6B.zip' and extract to 'glove_path'
# Add a new line '400000 300' at the beginning of 'glove.6B.300d.txt', with '400000' for the total word count and '300' for the vector length (see the preparation sketch below)
parser.add_argument('--glove_path', type=str, default="./glove",
help='path where the glove is stored')
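# One-off GloVe preparation sketch (illustrative; the file location is an assumption):
#   glove_file = './glove/glove.6B.300d.txt'
#   with open(glove_file, 'r', encoding='utf-8') as f:
#       body = f.read()
#   with open(glove_file, 'w', encoding='utf-8') as f:
#       f.write('400000 300\n' + body)
# The added header line lets gensim's load_word2vec_format() read the GloVe file.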
parser.add_argument('--preprocess_path', type=str, default="./preprocess",
help='path where the pre-processed data is stored')
parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if mode is test, the path of the \
trained ckpt file must be provided')
args = parser.parse_args()
context.set_context(
mode=context.GRAPH_MODE,
save_graphs=False,
device_target="GPU")
if args.preprocess == 'true':
print("============== Starting Data Pre-processing ==============")
shutil.rmtree(args.preprocess_path)
os.mkdir(args.preprocess_path)
convert_to_mindrecord(cfg.embed_size, args.aclimdb_path, args.preprocess_path, args.glove_path)
embedding_table = np.loadtxt(os.path.join(args.preprocess_path, "weight.txt")).astype(np.float32)
network = SentimentNet(vocab_size=embedding_table.shape[0],
embed_size=cfg.embed_size,
num_hiddens=cfg.num_hiddens,
num_layers=cfg.num_layers,
bidirectional=cfg.bidirectional,
num_classes=cfg.num_classes,
weight=Tensor(embedding_table),
batch_size=cfg.batch_size)
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum)
loss_cb = LossMonitor()
model = Model(network, loss, opt, {'acc': Accuracy()})
if args.mode == 'train':
print("============== Starting Training ==============")
ds_train = create_dataset(args.preprocess_path, cfg.batch_size, cfg.num_epochs, True)
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix="lstm", directory=args.ckpt_path, config=config_ck)
model.train(cfg.num_epochs, ds_train, callbacks=[ckpoint_cb, loss_cb])
elif args.mode == 'test':
print("============== Starting Testing ==============")
ds_eval = create_dataset(args.preprocess_path, cfg.batch_size, 1, False)
param_dict = load_checkpoint(args.ckpt_path)
load_param_into_net(network, param_dict)
acc = model.eval(ds_eval)
print("============== Accuracy:{} ==============".format(acc))
else:
raise RuntimeError('mode should be train or test, rather than {}'.format(args.mode))
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """
-network config
+network config setting
 """
 from easydict import EasyDict as edict
@@ -22,7 +22,7 @@ lstm_cfg = edict({
     'num_classes': 2,
     'learning_rate': 0.1,
     'momentum': 0.9,
-    'num_epochs': 1,
+    'num_epochs': 20,
     'batch_size': 64,
     'embed_size': 300,
     'num_hiddens': 100,
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Data operations used in train.py and eval.py.
"""
import os
import numpy as np
import mindspore.dataset as ds
from mindspore.mindrecord import FileWriter
from .imdb import ImdbParser
def lstm_create_dataset(data_home, batch_size, repeat_num=1, training=True):
"""Data operations."""
ds.config.set_seed(1)
data_dir = os.path.join(data_home, "aclImdb_train.mindrecord0")
if not training:
data_dir = os.path.join(data_home, "aclImdb_test.mindrecord0")
data_set = ds.MindDataset(data_dir, columns_list=["feature", "label"], num_parallel_workers=4)
# shuffle, batch and repeat the dataset
data_set = data_set.shuffle(buffer_size=data_set.get_dataset_size())
data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)
data_set = data_set.repeat(count=repeat_num)
return data_set
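# Illustrative usage: ds_eval = lstm_create_dataset('./preprocess', batch_size=64, training=False)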
def _convert_to_mindrecord(data_home, features, labels, weight_np=None, training=True):
"""
convert the imdb dataset to a mindrecord dataset
"""
if weight_np is not None:
np.savetxt(os.path.join(data_home, 'weight.txt'), weight_np)
# write mindrecord
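# With shard_num=4 the FileWriter below splits the output into four files whose names get a
# numeric suffix (e.g. aclImdb_train.mindrecord0 ... aclImdb_train.mindrecord3), which is why
# lstm_create_dataset above opens the '...mindrecord0' file.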
schema_json = {"id": {"type": "int32"},
"label": {"type": "int32"},
"feature": {"type": "int32", "shape": [-1]}}
data_dir = os.path.join(data_home, "aclImdb_train.mindrecord")
if not training:
data_dir = os.path.join(data_home, "aclImdb_test.mindrecord")
def get_imdb_data(features, labels):
data_list = []
for i, (label, feature) in enumerate(zip(labels, features)):
data_json = {"id": i,
"label": int(label),
"feature": feature.reshape(-1)}
data_list.append(data_json)
return data_list
writer = FileWriter(data_dir, shard_num=4)
data = get_imdb_data(features, labels)
writer.add_schema(schema_json, "nlp_schema")
writer.add_index(["id", "label"])
writer.write_raw_data(data)
writer.commit()
def convert_to_mindrecord(embed_size, aclimdb_path, preprocess_path, glove_path):
"""
convert the imdb dataset to a mindrecord dataset
"""
parser = ImdbParser(aclimdb_path, glove_path, embed_size)
parser.parse()
if not os.path.exists(preprocess_path):
print(f"preprocess path {preprocess_path} is not exist")
os.makedirs(preprocess_path)
train_features, train_labels, train_weight_np = parser.get_datas('train')
_convert_to_mindrecord(preprocess_path, train_features, train_labels, train_weight_np)
test_features, test_labels, _ = parser.get_datas('test')
_convert_to_mindrecord(preprocess_path, test_features, test_labels, training=False)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
imdb dataset parser.
"""
import os
from itertools import chain
import numpy as np
import gensim
class ImdbParser():
"""
parse aclImdb data to features and labels.
sentence->tokenized->encoded->padding->features
"""
def __init__(self, imdb_path, glove_path, embed_size=300):
self.__segs = ['train', 'test']
self.__label_dic = {'pos': 1, 'neg': 0}
self.__imdb_path = imdb_path
self.__glove_dim = embed_size
self.__glove_file = os.path.join(glove_path, 'glove.6B.' + str(self.__glove_dim) + 'd.txt')
# properties
self.__imdb_datas = {}
self.__features = {}
self.__labels = {}
self.__vacab = {}
self.__word2idx = {}
self.__weight_np = {}
self.__wvmodel = None
def parse(self):
"""
parse imdb data to memory
"""
self.__wvmodel = gensim.models.KeyedVectors.load_word2vec_format(self.__glove_file)
for seg in self.__segs:
self.__parse_imdb_datas(seg)
self.__parse_features_and_labels(seg)
self.__gen_weight_np(seg)
def __parse_imdb_datas(self, seg):
"""
load data from txt
"""
data_lists = []
for label_name, label_id in self.__label_dic.items():
sentence_dir = os.path.join(self.__imdb_path, seg, label_name)
for file in os.listdir(sentence_dir):
with open(os.path.join(sentence_dir, file), mode='r', encoding='utf8') as f:
sentence = f.read().replace('\n', '')
data_lists.append([sentence, label_id])
self.__imdb_datas[seg] = data_lists
def __parse_features_and_labels(self, seg):
"""
parse features and labels
"""
features = []
labels = []
for sentence, label in self.__imdb_datas[seg]:
features.append(sentence)
labels.append(label)
self.__features[seg] = features
self.__labels[seg] = labels
# update feature to tokenized
self.__updata_features_to_tokenized(seg)
# parse vacab
self.__parse_vacab(seg)
# encode feature
self.__encode_features(seg)
# padding feature
self.__padding_features(seg)
def __updata_features_to_tokenized(self, seg):
tokenized_features = []
for sentence in self.__features[seg]:
tokenized_sentence = [word.lower() for word in sentence.split(" ")]
tokenized_features.append(tokenized_sentence)
self.__features[seg] = tokenized_features
def __parse_vacab(self, seg):
# vocab
tokenized_features = self.__features[seg]
vocab = set(chain(*tokenized_features))
self.__vacab[seg] = vocab
# word_to_idx: {'hello': 1, 'world':111, ... '<unk>': 0}
word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}
word_to_idx['<unk>'] = 0
self.__word2idx[seg] = word_to_idx
def __encode_features(self, seg):
""" encode word to index """
word_to_idx = self.__word2idx['train']
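# Note: the vocabulary built from the 'train' split is used for every segment, so test-set
# words never seen in training fall back to index 0 ('<unk>').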
encoded_features = []
for tokenized_sentence in self.__features[seg]:
encoded_sentence = []
for word in tokenized_sentence:
encoded_sentence.append(word_to_idx.get(word, 0))
encoded_features.append(encoded_sentence)
self.__features[seg] = encoded_features
def __padding_features(self, seg, maxlen=500, pad=0):
""" pad all features to the same length """
padded_features = []
for feature in self.__features[seg]:
if len(feature) >= maxlen:
padded_feature = feature[:maxlen]
else:
padded_feature = feature
while len(padded_feature) < maxlen:
padded_feature.append(pad)
padded_features.append(padded_feature)
self.__features[seg] = padded_features
def __gen_weight_np(self, seg):
"""
generate weight by gensim
"""
weight_np = np.zeros((len(self.__word2idx[seg]), self.__glove_dim), dtype=np.float32)
for word, idx in self.__word2idx[seg].items():
if word not in self.__wvmodel:
continue
word_vector = self.__wvmodel.get_vector(word)
weight_np[idx, :] = word_vector
self.__weight_np[seg] = weight_np
def get_datas(self, seg):
"""
return features, labels, and weight
"""
features = np.array(self.__features[seg]).astype(np.int32)
labels = np.array(self.__labels[seg]).astype(np.int32)
weight = np.array(self.__weight_np[seg])
return features, labels, weight
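# Illustrative usage sketch (paths are assumptions):
#   parser = ImdbParser('./aclImdb', './glove', embed_size=300)
#   parser.parse()
#   train_features, train_labels, train_weight = parser.get_datas('train')
#   test_features, test_labels, _ = parser.get_datas('test')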
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""LSTM."""
import numpy as np
from mindspore import Tensor, nn, context
from mindspore.ops import operations as P
# Initialize short-term memory (h) and long-term memory (c) to 0
def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional):
"""init default input."""
num_directions = 1
if bidirectional:
num_directions = 2
if context.get_context("device_target") == "CPU":
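# On CPU the hidden states are built as per-layer tuples (one tensor of shape
# (num_directions, batch_size, hidden_size) for h and one for c per layer), matching the
# state layout nn.LSTM expects on CPU in this MindSpore version, rather than one stacked tensor.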
h_list = []
c_list = []
i = 0
while i < num_layers:
hi = Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32))
h_list.append(hi)
ci = Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32))
c_list.append(ci)
i = i + 1
h = tuple(h_list)
c = tuple(c_list)
return h, c
h = Tensor(
np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32))
c = Tensor(
np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32))
return h, c
class SentimentNet(nn.Cell):
"""Sentiment network structure."""
def __init__(self,
vocab_size,
embed_size,
num_hiddens,
num_layers,
bidirectional,
num_classes,
weight,
batch_size):
super(SentimentNet, self).__init__()
# Map words to vectors
self.embedding = nn.Embedding(vocab_size,
embed_size,
embedding_table=weight)
self.embedding.embedding_table.requires_grad = False
self.trans = P.Transpose()
self.perm = (1, 0, 2)
self.encoder = nn.LSTM(input_size=embed_size,
hidden_size=num_hiddens,
num_layers=num_layers,
has_bias=True,
bidirectional=bidirectional,
dropout=0.0)
self.h, self.c = lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional)
self.concat = P.Concat(1)
if bidirectional:
self.decoder = nn.Dense(num_hiddens * 4, num_classes)
else:
self.decoder = nn.Dense(num_hiddens * 2, num_classes)
def construct(self, inputs):
# input:(64,500,300)
embeddings = self.embedding(inputs)
embeddings = self.trans(embeddings, self.perm)
output, _ = self.encoder(embeddings, (self.h, self.c))
# states[i] size(64,200) -> encoding.size(64,400)
encoding = self.concat((output[0], output[499]))
outputs = self.decoder(encoding)
return outputs
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
#################train lstm example on aclImdb########################
python train.py --preprocess=true --aclimdb_path=your_imdb_path --glove_path=your_glove_path
"""
import argparse
import os
import numpy as np
from src.config import lstm_cfg as cfg
from src.dataset import convert_to_mindrecord
from src.dataset import lstm_create_dataset
from src.lstm import SentimentNet
from mindspore import Tensor, nn, Model, context
from mindspore.nn import Accuracy
from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor
from mindspore.train.serialization import load_param_into_net, load_checkpoint
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='MindSpore LSTM Example')
parser.add_argument('--preprocess', type=str, default='false', choices=['true', 'false'],
help='whether to preprocess data.')
parser.add_argument('--aclimdb_path', type=str, default="./aclImdb",
help='path where the dataset is stored.')
parser.add_argument('--glove_path', type=str, default="./glove",
help='path where the GloVe is stored.')
parser.add_argument('--preprocess_path', type=str, default="./preprocess",
help='path where the pre-process data is stored.')
parser.add_argument('--ckpt_path', type=str, default="./",
help='the path to save the checkpoint file.')
parser.add_argument('--pre_trained', type=str, default=None,
help='the pretrained checkpoint file path.')
parser.add_argument('--device_target', type=str, default="GPU", choices=['GPU', 'CPU'],
help='the target device to run on; supports "GPU" and "CPU". Default: "GPU".')
args = parser.parse_args()
context.set_context(
mode=context.GRAPH_MODE,
save_graphs=False,
device_target=args.device_target)
if args.preprocess == "true":
print("============== Starting Data Pre-processing ==============")
convert_to_mindrecord(cfg.embed_size, args.aclimdb_path, args.preprocess_path, args.glove_path)
embedding_table = np.loadtxt(os.path.join(args.preprocess_path, "weight.txt")).astype(np.float32)
network = SentimentNet(vocab_size=embedding_table.shape[0],
embed_size=cfg.embed_size,
num_hiddens=cfg.num_hiddens,
num_layers=cfg.num_layers,
bidirectional=cfg.bidirectional,
num_classes=cfg.num_classes,
weight=Tensor(embedding_table),
batch_size=cfg.batch_size)
# pre_trained
if args.pre_trained:
load_param_into_net(network, load_checkpoint(args.pre_trained))
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum)
loss_cb = LossMonitor()
model = Model(network, loss, opt, {'acc': Accuracy()})
print("============== Starting Training ==============")
ds_train = lstm_create_dataset(args.preprocess_path, cfg.batch_size, cfg.num_epochs)
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix="lstm", directory=args.ckpt_path, config=config_ck)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
if args.device_target == "CPU":
model.train(cfg.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb], dataset_sink_mode=False)
else:
model.train(cfg.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb])
print("============== Training Success ==============")