From f8e8fb5cc4285f0fd047e61c5b71a9e629a87e7b Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Sun, 3 Feb 2019 11:22:33 +0800
Subject: [PATCH] Refine .gitignore and fix the revert PR (#1744)

---
 .gitignore                                    |  28 ---
 fluid/PaddleCV/HiNAS_models/build/__init__.py |   0
 fluid/PaddleCV/HiNAS_models/build/layers.py   | 214 ++++++++++++++++++
 fluid/PaddleCV/HiNAS_models/build/ops.py      | 117 ++++++++++
 .../HiNAS_models/build/resnet_base.py         | 109 +++++++++
 fluid/PaddleCV/HiNAS_models/build/vgg_base.py |  70 ++++++
 .../sequence_tagging_for_ner/train.py         |   5 +-
 .../data_generator/build_raw_data.py          |  62 +++++
 8 files changed, 575 insertions(+), 30 deletions(-)
 create mode 100755 fluid/PaddleCV/HiNAS_models/build/__init__.py
 create mode 100755 fluid/PaddleCV/HiNAS_models/build/layers.py
 create mode 100755 fluid/PaddleCV/HiNAS_models/build/ops.py
 create mode 100755 fluid/PaddleCV/HiNAS_models/build/resnet_base.py
 create mode 100755 fluid/PaddleCV/HiNAS_models/build/vgg_base.py
 create mode 100644 fluid/PaddleNLP/text_classification/async_executor/data_generator/build_raw_data.py

diff --git a/.gitignore b/.gitignore
index e3fa5ac6..9376aa94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,33 +1,5 @@
-paddle/operators/check_t.save
-paddle/operators/check_tensor.ls
-paddle/operators/tensor.save
-python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
-python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
-python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
 *.DS_Store
 *.vs
-build/
-build_doc/
 *.user
-
-.vscode
-.idea
-.project
-.cproject
-.pydevproject
-.settings/
-
 *.pyc
-CMakeSettings.json
-Makefile
-.test_env/
-third_party/
-
 *~
-bazel-*
-third_party/
-
-build_*
-# clion workspace.
-cmake-build-*
-model_test
\ No newline at end of file
diff --git a/fluid/PaddleCV/HiNAS_models/build/__init__.py b/fluid/PaddleCV/HiNAS_models/build/__init__.py
new file mode 100755
index 00000000..e69de29b
diff --git a/fluid/PaddleCV/HiNAS_models/build/layers.py b/fluid/PaddleCV/HiNAS_models/build/layers.py
new file mode 100755
index 00000000..5bd67fb8
--- /dev/null
+++ b/fluid/PaddleCV/HiNAS_models/build/layers.py
@@ -0,0 +1,214 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import operator
+
+import numpy as np
+import paddle.fluid as fluid
+from absl import flags
+
+FLAGS = flags.FLAGS  # NOTE(review): FLAGS.weight_decay is read below but is not defined in this module
+
+flags.DEFINE_float("bn_decay", 0.9, "batch norm decay")  # passed as batch_norm momentum in bn_relu
+flags.DEFINE_float("dropout_rate", 0.5, "dropout rate")  # passed as dropout_prob in dropout
+
+
+def calc_padding(img_width, stride, dilation, filter_width):
+    """ Split the padding needed along one axis into a (leading, trailing) pair. """
+
+    span = dilation * (filter_width - 1) + 1  # extent covered by the dilated filter
+    leftover = img_width % stride
+    # a zero remainder is treated as one full stride
+    total = max(span - (leftover if leftover else stride), 0)
+    leading = total // 2
+    return leading, total - leading
+
+
+def conv(inputs,
+         filters,
+         kernel,
+         strides=(1, 1),
+         dilation=(1, 1),
+         num_groups=1,
+         conv_param=None):
+    """ conv2d (no activation) on an input padded via calc_padding; kernel is an int or an (h, w) pair """
+
+    # Normalise a scalar kernel first: the padding code below indexes kernel[0] / kernel[1].
+    if not isinstance(kernel, (tuple, list)):
+        kernel = (kernel, kernel)
+    n = operator.mul(*kernel) * inputs.shape[1]  # kernel area * inputs.shape[1]; scales the initializer
+
+    # pad input
+    padding = (0, 0, 0, 0) \
+        + calc_padding(inputs.shape[2], strides[0], dilation[0], kernel[0]) \
+        + calc_padding(inputs.shape[3], strides[1], dilation[1], kernel[1])
+    if sum(padding) > 0:
+        inputs = fluid.layers.pad(inputs, padding, 0)
+
+    param_attr = fluid.param_attr.ParamAttr(
+        initializer=fluid.initializer.NormalInitializer(
+            0.0, scale=np.sqrt(2.0 / n)),
+        regularizer=fluid.regularizer.L2Decay(FLAGS.weight_decay))
+
+    bias_attr = fluid.param_attr.ParamAttr(
+        regularizer=fluid.regularizer.L2Decay(0.))  # bias gets a zero L2 coefficient
+
+    return fluid.layers.conv2d(
+        inputs,
+        filters,
+        kernel,
+        stride=strides,
+        padding=0,  # padding was already applied explicitly above
+        dilation=dilation,
+        groups=num_groups,
+        param_attr=param_attr if conv_param is None else conv_param,  # caller-supplied attr wins
+        use_cudnn=not (num_groups == inputs.shape[1] == filters),  # off when groups == channels == filters
+        bias_attr=bias_attr,
+        act=None)
+
+
+def sep(inputs, filters, kernel, strides=(1, 1), dilation=(1, 1)):
+    """ Separable convolution layer: a grouped conv with `kernel`, followed by a 1x1 conv """
+
+    if isinstance(kernel, (tuple, list)):
+        n_depth = operator.mul(*kernel)
+    else:
+        n_depth = kernel * kernel
+    n_point = inputs.shape[1]
+
+    if isinstance(strides, (tuple, list)):
+        multiplier = strides[0]  # the first stride value doubles as the width multiplier
+    else:
+        multiplier = strides
+
+    depthwise_param = fluid.param_attr.ParamAttr(
+        initializer=fluid.initializer.NormalInitializer(
+            0.0, scale=np.sqrt(2.0 / n_depth)),
+        regularizer=fluid.regularizer.L2Decay(FLAGS.weight_decay))
+
+    pointwise_param = fluid.param_attr.ParamAttr(
+        initializer=fluid.initializer.NormalInitializer(
+            0.0, scale=np.sqrt(2.0 / n_point)),
+        regularizer=fluid.regularizer.L2Decay(FLAGS.weight_decay))
+
+    depthwise_conv = conv(
+        inputs=inputs,
+        kernel=kernel,
+        filters=int(filters * multiplier),
+        strides=strides,
+        dilation=dilation,
+        num_groups=int(filters * multiplier),  # NOTE(review): groups follow the output width, not inputs.shape[1] -- confirm
+        conv_param=depthwise_param)
+
+    return conv(
+        inputs=depthwise_conv,
+        kernel=(1, 1),
+        filters=int(filters * multiplier),
+        strides=(1, 1),
+        dilation=dilation,
+        conv_param=pointwise_param)
+
+
+def maxpool(inputs, kernel, strides=(1, 1)):  # max pooling on an input padded via calc_padding
+    padding = (0, 0, 0, 0) \
+              + calc_padding(inputs.shape[2], strides[0], 1, kernel[0]) \
+              + calc_padding(inputs.shape[3], strides[1], 1, kernel[1])
+    if sum(padding) > 0:
+        inputs = fluid.layers.pad(inputs, padding, 0)  # pad value is the constant 0
+
+    return fluid.layers.pool2d(
+        inputs, kernel, 'max', strides, pool_padding=0, ceil_mode=False)
+
+
+def avgpool(inputs, kernel, strides=(1, 1)):  # average pooling with padding sizes from calc_padding
+    padding_pixel = (0, 0, 0, 0)
+    padding_pixel += calc_padding(inputs.shape[2], strides[0], 1, kernel[0])  # -> indices 4, 5
+    padding_pixel += calc_padding(inputs.shape[3], strides[1], 1, kernel[1])  # -> indices 6, 7
+
+    if padding_pixel[4] == padding_pixel[5] and padding_pixel[
+            6] == padding_pixel[7]:
+        # leading and trailing pads match on both axes: let pool2d pad symmetrically.
+        return fluid.layers.pool2d(
+            inputs,
+            kernel,
+            'avg',
+            strides,
+            pool_padding=(padding_pixel[4], padding_pixel[6]),
+            ceil_mode=False)
+    elif padding_pixel[4] + 1 == padding_pixel[5] and padding_pixel[6] + 1 == padding_pixel[7] \
+            and strides == (1, 1):
+        # trailing pad is one larger on both axes: pool with the larger pad, then crop one row and column.
+        x = fluid.layers.pool2d(
+            inputs,
+            kernel,
+            'avg',
+            strides,
+            pool_padding=(padding_pixel[5], padding_pixel[7]),
+            ceil_mode=False)
+        x_shape = x.shape
+        return fluid.layers.crop(
+            x,
+            shape=(-1, x_shape[1], x_shape[2] - 1, x_shape[3] - 1),
+            offsets=(0, 0, 1, 1))
+    else:
+        # any other combination: zero-pad explicitly, then pool without padding.
+        print("Warning: use zero-padding in avgpool")
+        outputs = fluid.layers.pad(inputs, padding_pixel, 0)
+        return fluid.layers.pool2d(
+            outputs, kernel, 'avg', strides, pool_padding=0, ceil_mode=False)
+
+
+def global_avgpool(inputs):  # average pooling with pool2d's global_pooling switched on
+    return fluid.layers.pool2d(
+        inputs,
+        1,
+        'avg',
+        1,
+        pool_padding=0,
+        global_pooling=True,
+        ceil_mode=True)
+
+
+def fully_connected(inputs, units):  # fc layer with `units` outputs and L2-regularised weights
+    n = inputs.shape[1]  # dimension 1 of the input sets the initializer scale sqrt(2 / n)
+    param_attr = fluid.param_attr.ParamAttr(
+        initializer=fluid.initializer.NormalInitializer(
+            0.0, scale=np.sqrt(2.0 / n)),
+        regularizer=fluid.regularizer.L2Decay(FLAGS.weight_decay))
+
+    bias_attr = fluid.param_attr.ParamAttr(
+        regularizer=fluid.regularizer.L2Decay(0.))  # bias gets a zero L2 coefficient
+
+    return fluid.layers.fc(inputs,
+                           units,
+                           param_attr=param_attr,
+                           bias_attr=bias_attr)
+
+
+def bn_relu(inputs):
+    """ batch norm + relu layer """
+
+    output = fluid.layers.batch_norm(
+        inputs, momentum=FLAGS.bn_decay, epsilon=0.001, data_layout="NCHW")
+    return fluid.layers.relu(output)
+
+
+def dropout(inputs):
+    """ dropout layer; the drop probability is read from FLAGS.dropout_rate """
+
+    return fluid.layers.dropout(inputs, dropout_prob=FLAGS.dropout_rate)
diff --git a/fluid/PaddleCV/HiNAS_models/build/ops.py b/fluid/PaddleCV/HiNAS_models/build/ops.py
new file mode 100755
index 00000000..359f6285
--- /dev/null
+++ b/fluid/PaddleCV/HiNAS_models/build/ops.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import build.layers as layers
+
+
+def conv_1x1(inputs, downsample=False):  # conv_base with a (1, 1) kernel
+    return conv_base(inputs, (1, 1), downsample=downsample)
+
+
+def conv_2x2(inputs, downsample=False):  # conv_base with a (2, 2) kernel
+    return conv_base(inputs, (2, 2), downsample=downsample)
+
+
+def conv_3x3(inputs, downsample=False):  # conv_base with a (3, 3) kernel
+    return conv_base(inputs, (3, 3), downsample=downsample)
+
+
+def dilated_2x2(inputs, downsample=False):  # conv_base with a (2, 2) kernel and (2, 2) dilation
+    return conv_base(inputs, (2, 2), (2, 2), downsample)
+
+
+def conv_1x2_2x1(inputs, downsample=False):  # pair_base with kernel size 2 (1x2 conv, then 2x1 conv)
+    return pair_base(inputs, 2, downsample)
+
+
+def conv_1x3_3x1(inputs, downsample=False):  # pair_base with kernel size 3 (1x3 conv, then 3x1 conv)
+    return pair_base(inputs, 3, downsample)
+
+
+def sep_2x2(inputs, downsample=False):  # sep_base (separable conv) with a (2, 2) kernel
+    return sep_base(inputs, (2, 2), downsample=downsample)
+
+
+def sep_3x3(inputs, downsample=False):  # sep_base (separable conv) with a (3, 3) kernel
+    return sep_base(inputs, (3, 3), downsample=downsample)
+
+
+def maxpool_2x2(inputs, downsample=False):  # maxpool_base with a (2, 2) window
+    return maxpool_base(inputs, (2, 2), downsample)
+
+
+def maxpool_3x3(inputs, downsample=False):  # maxpool_base with a (3, 3) window
+    return maxpool_base(inputs, (3, 3), downsample)
+
+
+def avgpool_2x2(inputs, downsample=False):  # avgpool_base with a (2, 2) window
+    return avgpool_base(inputs, (2, 2), downsample)
+
+
+def avgpool_3x3(inputs, downsample=False):  # avgpool_base with a (3, 3) window
+    return avgpool_base(inputs, (3, 3), downsample)
+
+
+def conv_base(inputs, kernel, dilation=(1, 1), downsample=False):
+    """ conv keeping inputs.shape[1] filters, or twice as many with stride (2, 2) when downsampling """
+    channels = inputs.shape[1]
+    if not downsample:
+        return layers.conv(inputs, channels, kernel, dilation=dilation)
+    # NOTE(review): dilation is not forwarded on the downsample path -- confirm this is intended
+    return layers.conv(inputs, channels * 2, kernel, (2, 2))
+
+
+def pair_base(inputs, kernel, downsample=False):
+    """ (1, kernel) conv followed by (kernel, 1) conv; downsampling strides both and appends a 1x1 conv """
+    channels = inputs.shape[1]
+    if not downsample:
+        row_pass = layers.conv(inputs, channels, (1, kernel))
+        return layers.conv(row_pass, channels, (kernel, 1))
+    # each conv uses stride 2 along one axis; the closing 1x1 conv doubles the filter count
+    row_pass = layers.conv(inputs, channels, (1, kernel), (1, 2))
+    col_pass = layers.conv(row_pass, channels, (kernel, 1), (2, 1))
+    return layers.conv(col_pass, channels * 2, (1, 1))
+
+
+def sep_base(inputs, kernel, dilation=(1, 1), downsample=False):
+    """ separable conv with inputs.shape[1] filters, or twice as many with stride (2, 2) when downsampling """
+    channels = inputs.shape[1]
+    if not downsample:
+        return layers.sep(inputs, channels, kernel, dilation=dilation)
+    # NOTE(review): dilation is not forwarded on the downsample path -- confirm this is intended
+    return layers.sep(inputs, channels * 2, kernel, (2, 2))
+
+
+def maxpool_base(inputs, kernel, downsample=False):
+    """ max pooling op; when downsampling, pool with stride (2, 2) and then apply a 1x1 conv """
+    if not downsample:
+        return layers.maxpool(inputs, kernel)
+    channels = inputs.shape[1]
+    pooled = layers.maxpool(inputs, kernel, (2, 2))
+    # the 1x1 conv is what doubles the filter count
+    return layers.conv(pooled, channels * 2, (1, 1))
+
+
+def avgpool_base(inputs, kernel, downsample=False):
+    """ average pooling op; when downsampling, pool with stride (2, 2) and then apply a 1x1 conv """
+    if not downsample:
+        return layers.avgpool(inputs, kernel)
+    channels = inputs.shape[1]
+    pooled = layers.avgpool(inputs, kernel, (2, 2))
+    # the 1x1 conv is what doubles the filter count
+    return layers.conv(pooled, channels * 2, (1, 1))
diff --git a/fluid/PaddleCV/HiNAS_models/build/resnet_base.py b/fluid/PaddleCV/HiNAS_models/build/resnet_base.py
new file mode 100755
index 00000000..76c870de
--- /dev/null
+++ b/fluid/PaddleCV/HiNAS_models/build/resnet_base.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from absl import flags
+
+import build.layers as layers
+import build.ops as _ops
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_integer("num_stages", 3, "number of stages")  # NOTE(review): vgg_base.py defines this flag too -- importing both may clash
+flags.DEFINE_integer("num_blocks", 5, "number of blocks per stage")
+flags.DEFINE_integer("num_ops", 2, "number of operations per block")
+flags.DEFINE_integer("width", 64, "network width")  # filter count of the first conv in net()
+flags.DEFINE_string("downsample", "pool", "conv or pool")  # skip-link downsampling mode read in block()
+
+num_classes = 10  # output units of the final fully connected layer
+
+ops = [  # candidate operations; each token is an index into this list
+    _ops.conv_1x1,  #0
+    _ops.conv_2x2,  #1
+    _ops.conv_3x3,  #2
+    _ops.dilated_2x2,  #3
+    _ops.conv_1x2_2x1,  #4
+    _ops.conv_1x3_3x1,  #5
+    _ops.sep_2x2,  #6
+    _ops.sep_3x3,  #7
+    _ops.maxpool_2x2,  #8
+    _ops.maxpool_3x3,  #9
+    _ops.avgpool_2x2,  #10
+    _ops.avgpool_3x3,  #11
+]
+
+
+def net(inputs, tokens):
+    """ build network with skip links: first conv, FLAGS.num_stages stages, then the classifier head """
+
+    x = layers.conv(inputs, FLAGS.width, (3, 3))
+
+    num_ops = FLAGS.num_blocks * FLAGS.num_ops  # tokens consumed by one stage
+    x = stage(x, tokens[:num_ops], pre_activation=True)  # first stage does not downsample
+    for i in range(1, FLAGS.num_stages):
+        x = stage(x, tokens[i * num_ops:(i + 1) * num_ops], downsample=True)
+
+    x = layers.bn_relu(x)
+    x = layers.global_avgpool(x)
+    x = layers.dropout(x)
+    logits = layers.fully_connected(x, num_classes)
+
+    return fluid.layers.softmax(logits)
+
+
+def stage(x, tokens, pre_activation=False, downsample=False):
+    """ build network's stage: FLAGS.num_blocks blocks, each taking FLAGS.num_ops tokens """
+
+    x = block(x, tokens[:FLAGS.num_ops], pre_activation, downsample)  # only the first block receives these options
+    for i in range(1, FLAGS.num_blocks):
+        print("-" * 12)
+        x = block(x, tokens[i * FLAGS.num_ops:(i + 1) * FLAGS.num_ops])
+    print("=" * 12)
+
+    return x
+
+
+def block(x, tokens, pre_activation=False, downsample=False):
+    """ build block: run ops[token] for every token (bn_relu before each) and add the skip link """
+
+    if pre_activation:
+        x = layers.bn_relu(x)
+        res = x  # skip link is taken after bn_relu
+    else:
+        res = x  # skip link is taken before bn_relu
+        x = layers.bn_relu(x)
+
+    x = ops[tokens[0]](x, downsample)  # only the first op of the block receives downsample
+    print("%s \t-> shape %s" % (ops[tokens[0]].__name__, x.shape))  # log the op actually applied, not ops[0]
+    for token in tokens[1:]:
+        x = layers.bn_relu(x)
+        x = ops[token](x)
+        print("%s \t-> shape %s" % (ops[token].__name__, x.shape))
+
+    if downsample:
+        filters = res.shape[1]
+        if FLAGS.downsample == "conv":
+            res = layers.conv(res, filters * 2, (1, 1), (2, 2))
+        elif FLAGS.downsample == "pool":
+            res = layers.avgpool(res, (2, 2), (2, 2))
+            res = fluid.layers.pad(res, (0, 0, filters // 2, filters // 2, 0, 0,
+                                         0, 0))  # pads dimension 1 by filters // 2 on each side
+        else:
+            raise NotImplementedError
+
+    return x + res
diff --git a/fluid/PaddleCV/HiNAS_models/build/vgg_base.py b/fluid/PaddleCV/HiNAS_models/build/vgg_base.py
new file mode 100755
index 00000000..d7506a7e
--- /dev/null
+++ b/fluid/PaddleCV/HiNAS_models/build/vgg_base.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from absl import flags
+
+import build.layers as layers
+import build.ops as _ops
+
+FLAGS = flags.FLAGS
+flags.DEFINE_integer("num_stages", 5, "number of stages")  # NOTE(review): resnet_base.py defines this flag too -- importing both may clash
+flags.DEFINE_integer("width", 64, "network width")  # filter count of the first conv in net()
+
+num_classes = 10  # output units of the final fully connected layer
+
+ops = [  # candidate operations; each token is an index into this list
+    _ops.conv_1x1,  #0
+    _ops.conv_2x2,  #1
+    _ops.conv_3x3,  #2
+    _ops.dilated_2x2,  #3
+    _ops.conv_1x2_2x1,  #4
+    _ops.conv_1x3_3x1,  #5
+    _ops.sep_2x2,  #6
+    _ops.sep_3x3,  #7
+    _ops.maxpool_2x2,  #8
+    _ops.maxpool_3x3,  #9
+    _ops.avgpool_2x2,  #10
+    _ops.avgpool_3x3,  #11
+]
+
+
+def net(inputs, tokens):  # chain: first conv, one op per token (bn_relu after each), classifier head
+    depth = len(tokens)
+    q, r = divmod(depth + 1, FLAGS.num_stages)
+    downsample_steps = [  # token positions whose op is called with downsample=True
+        i * q + max(0, i + r - FLAGS.num_stages + 1) - 2
+        for i in range(1, FLAGS.num_stages)
+    ]
+
+    x = layers.conv(inputs, FLAGS.width, (3, 3))
+    x = layers.bn_relu(x)
+
+    for i, token in enumerate(tokens):
+        downsample = i in downsample_steps
+        x = ops[token](x, downsample)
+        print("%s \t-> shape %s" % (ops[token].__name__, x.shape))
+        if downsample:
+            print("=" * 12)  # separator line printed after each downsampling op
+        x = layers.bn_relu(x)
+
+    x = layers.global_avgpool(x)
+    x = layers.dropout(x)
+    logits = layers.fully_connected(x, num_classes)
+
+    return fluid.layers.softmax(logits)
diff --git a/fluid/PaddleNLP/sequence_tagging_for_ner/train.py b/fluid/PaddleNLP/sequence_tagging_for_ner/train.py
index b77c081b..68e62137 100644
--- a/fluid/PaddleNLP/sequence_tagging_for_ner/train.py
+++ b/fluid/PaddleNLP/sequence_tagging_for_ner/train.py
@@ -136,8 +136,9 @@ def main(train_data_file,
               " pass_f1_score:" + str(test_pass_f1_score))
 
         save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
-        fluid.io.save_inference_model(save_dirname, ['word', 'mark'],
-                                      crf_decode, exe)
+        if "CE_MODE_X" not in os.environ:
+            fluid.io.save_inference_model(save_dirname, ['word', 'mark'],
+                                          crf_decode, exe)
 
     if "CE_MODE_X" in os.environ:
         print("kpis	train_precision	%f" % pass_precision)
diff --git a/fluid/PaddleNLP/text_classification/async_executor/data_generator/build_raw_data.py b/fluid/PaddleNLP/text_classification/async_executor/data_generator/build_raw_data.py
new file mode 100644
index 00000000..2c0c0981
--- /dev/null
+++ b/fluid/PaddleNLP/text_classification/async_executor/data_generator/build_raw_data.py
@@ -0,0 +1,62 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Build raw data
+"""
+from __future__ import print_function
+import sys
+import os
+import random
+import re
+data_type = sys.argv[1] if len(sys.argv) > 1 else None  # no argument -> usage message instead of IndexError
+
+if data_type not in ("train", "test"):
+    print("python %s [test/train]" % sys.argv[0], file=sys.stderr)
+    sys.exit(-1)
+
+pos_folder = "aclImdb/" + data_type + "/pos/"
+neg_folder = "aclImdb/" + data_type + "/neg/"
+
+pos_train_list = [(pos_folder + x, "1") for x in os.listdir(pos_folder)]  # (path, label "1") per file in pos/
+neg_train_list = [(neg_folder + x, "0") for x in os.listdir(neg_folder)]  # (path, label "0") per file in neg/
+
+all_train_list = pos_train_list + neg_train_list
+random.shuffle(all_train_list)  # shuffled in place; no seed is set here
+
+
+def load_dict(dictfile):
+    """
+    Map each stripped line of dictfile to its 0-based line number, as a string.
+
+    A word found on several lines keeps the number of its last occurrence.
+    """
+    word_to_id = {}
+    with open(dictfile) as vocab_file:
+        for index, raw_line in enumerate(vocab_file):
+            word_to_id[raw_line.strip()] = str(index)
+    return word_to_id
+
+
+vocab = load_dict("aclImdb/imdb.vocab")
+unk_id = str(len(vocab))  # NOTE(review): not referenced again in this script
+print("vocab size: ", len(vocab), file=sys.stderr)
+pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))')  # NOTE(review): compiled but not used below
+
+for fitem in all_train_list:
+    label = str(fitem[1])
+    fname = fitem[0]
+    with open(fname) as f:
+        sent = f.readline().lower().replace("<br />", " ").strip()  # only the first line of each file is read
+        out_s = "%s | %s" % (sent, label)  # one "<text> | <label>" record per file
+        print(out_s, file=sys.stdout)
-- 
GitLab