Separate configuration and running logic.

347626a4 · yangyaming · 2160b34b · 347626a4 · 347626a4 · 347626a4
Showing with 63 additions and 46 deletions

word_embedding/network_conf.py word_embedding/network_conf.py +18 -18

word_embedding/predict_v2.py word_embedding/predict_v2.py +33 -23

word_embedding/train_v2.py word_embedding/train_v2.py +12 -5

未找到文件。
--- a/word_embedding/network_conf.py
+++ b/word_embedding/network_conf.py
@@ -5,7 +5,7 @@ import math
 import paddle.v2 as paddle
-def network_conf(hidden_size, embed_size, dict_size):
+def network_conf(is_train, hidden_size, embed_size, dict_size):
    def word_embed(in_layer):
        ''' word embedding layer '''
        word_embed = paddle.layer.table_projection(
@@ -44,20 +44,20 @@ def network_conf(hidden_size, embed_size, dict_size):
        param_attr=paddle.attr.Param(
            initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
+    if is_train == True:
        cost = paddle.layer.hsigmoid(
            input=hidden_layer,
            label=target_word,
            num_classes=dict_size,
            param_attr=paddle.attr.Param(name='sigmoid_w'),
            bias_attr=paddle.attr.Param(name='sigmoid_b'))
+        return cost
+    else:
        with paddle.layer.mixed(
                size=dict_size - 1,
                act=paddle.activation.Sigmoid(),
                bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
            prediction += paddle.layer.trans_full_matrix_projection(
-            input=hidden_layer, param_attr=paddle.attr.Param(name='sigmoid_w'))
+                input=hidden_layer,
+                param_attr=paddle.attr.Param(name='sigmoid_w'))
-    input_data_lst = ['firstw', 'secondw', 'thirdw', 'fourthw', 'fifthw']
+        return prediction
-    return input_data_lst, cost, prediction
--- a/word_embedding/predict_v2.py
+++ b/word_embedding/predict_v2.py
@@ -7,6 +7,16 @@ import gzip
 def decode_res(infer_res, dict_size):
+    """
+    Inferring probabilities are organized as a complete binary tree.
+    The actual labels are leaves (indices are counted from class number).
+    This function travels paths decoded from inferring results.
+    If the probability >0.5 then go to right child, otherwise go to left child.
+    param infer_res: inferring result
+    param dict_size: class number
+    return predict_lbls: actual class
+    """
    predict_lbls = []
    infer_res = infer_res > 0.5
    for i, probs in enumerate(infer_res):
@@ -20,33 +30,30 @@ def decode_res(infer_res, dict_size):
                idx = idx * 2 + 2  # right child
            else:
                idx = idx * 2 + 1  # left child
        predict_lbl = result - dict_size
        predict_lbls.append(predict_lbl)
    return predict_lbls
 def main():
-    paddle.init(use_gpu=False, trainer_count=4)
+    paddle.init(use_gpu=False, trainer_count=1)
-    word_dict = paddle.dataset.imikolov.build_dict()
+    word_dict = paddle.dataset.imikolov.build_dict(typo_freq=2)
    dict_size = len(word_dict)
-    _, _, prediction = network_conf(
+    prediction = network_conf(
-        hidden_size=256, embed_size=32, dict_size=dict_size)
+        is_train=False, hidden_size=256, embed_size=32, dict_size=dict_size)
    print('Load model ....')
    with gzip.open('./models/model_pass_00000.tar.gz') as f:
        parameters = paddle.parameters.Parameters.from_tar(f)
-        ins_num = 10
+    ins_num = 10  # total 10 instance for prediction
-        ins_lst = []
+    ins_lst = []  # input data
-        ins_lbls = []
-        ins_buffer = paddle.reader.shuffle(
+    ins_iter = paddle.dataset.imikolov.test(word_dict, 5)
-            lambda: paddle.dataset.imikolov.train(word_dict, 5)(),
-            buf_size=1000)
-        for ins in ins_buffer():
+    for ins in ins_iter():
        ins_lst.append(ins[:-1])
-            ins_lbls.append(ins[-1])
        if len(ins_lst) >= ins_num: break
    infer_res = paddle.infer(
@@ -55,12 +62,15 @@ def main():
    idx_word_dict = dict((v, k) for k, v in word_dict.items())
    predict_lbls = decode_res(infer_res, dict_size)
-        predict_words = [idx_word_dict[lbl] for lbl in predict_lbls]
+    predict_words = [idx_word_dict[lbl] for lbl in predict_lbls]  # map to word
-        gt_words = [idx_word_dict[lbl] for lbl in ins_lbls]
+    # Output format: word1 word2 word3 word4 -> predict label
    for i, ins in enumerate(ins_lst):
-            print idx_word_dict[ins[0]] + ' ' + idx_word_dict[ins[1]] + \
+        print idx_word_dict[ins[0]] + ' ' + \
-             ' -> ' + predict_words[i] + ' ( ' + gt_words[i] + ' )'
+            idx_word_dict[ins[1]] + ' ' + \
+            idx_word_dict[ins[2]] + ' ' + \
+            idx_word_dict[ins[3]] + ' ' + \
+         ' -> ' + predict_words[i]
 if __name__ == '__main__':

--- a/word_embedding/train_v2.py
+++ b/word_embedding/train_v2.py
@@ -8,10 +8,10 @@ import gzip
 def main():
    paddle.init(use_gpu=False, trainer_count=1)
-    word_dict = paddle.dataset.imikolov.build_dict()
+    word_dict = paddle.dataset.imikolov.build_dict(typo_freq=2)
    dict_size = len(word_dict)
-    input_data_lst, cost, prediction = network_conf(
+    cost = network_conf(
-        hidden_size=256, embed_size=32, dict_size=dict_size)
+        is_train=True, hidden_size=256, embed_size=32, dict_size=dict_size)
    def event_handler(event):
        if isinstance(event, paddle.event.EndPass):
@@ -28,8 +28,15 @@ def main():
                print "Pass %d, Batch %d, Cost %f" % (
                    event.pass_id, event.batch_id, event.cost)
-    feeding = dict(zip(input_data_lst, xrange(len(input_data_lst))))
+    feeding = {
-    parameters = paddle.parameters.create([cost, prediction])
+        'firstw': 0,
+        'secondw': 1,
+        'thirdw': 2,
+        'fourthw': 3,
+        'fifthw': 4
+    }
+    parameters = paddle.parameters.create(cost)
    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))