From 4946f5f5e5679834067dc317bf79a3523a8ebb9f Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Wed, 9 Jan 2019 10:29:40 +0000
Subject: [PATCH] for test w2v on small

---
 fluid/PaddleRec/word2vec/async_train.py |  2 +-
 fluid/PaddleRec/word2vec/infer.py       |  2 +-
 fluid/PaddleRec/word2vec/preprocess.py  | 64 ++++++++++++++-----------
 fluid/PaddleRec/word2vec/train.py       |  1 +
 4 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/fluid/PaddleRec/word2vec/async_train.py b/fluid/PaddleRec/word2vec/async_train.py
index 91301088..0fa989dd 100644
--- a/fluid/PaddleRec/word2vec/async_train.py
+++ b/fluid/PaddleRec/word2vec/async_train.py
@@ -164,7 +164,7 @@ def async_train_loop(args, train_program, dataset, loss, thread_num):
             debug=False)
         epoch_stop = time.time()
         run_time = epoch_stop - epoch_start
-        lines = len(filelist) * 1000000.0
+        lines = 109984625
         print("run epoch%d done, lines=%s, time=%d, sample/second=%s" %
               (i + 1, lines, run_time, lines / run_time))
         epoch_model = "word2vec_model/epoch" + str(i + 1)
diff --git a/fluid/PaddleRec/word2vec/infer.py b/fluid/PaddleRec/word2vec/infer.py
index 0acce5ff..b223607f 100644
--- a/fluid/PaddleRec/word2vec/infer.py
+++ b/fluid/PaddleRec/word2vec/infer.py
@@ -59,7 +59,7 @@ def parse_args():
     parser.add_argument(
         '--test_batch_size',
         type=int,
-        default=1000,
+        default=100,
         help="test used batch size (default: 1000)")
     return parser.parse_args()

diff --git a/fluid/PaddleRec/word2vec/preprocess.py b/fluid/PaddleRec/word2vec/preprocess.py
index a7223557..267413d0 100644
--- a/fluid/PaddleRec/word2vec/preprocess.py
+++ b/fluid/PaddleRec/word2vec/preprocess.py
@@ -198,10 +198,10 @@ def preprocess(args):
     """
     # word to count
-    if args.with_other_dict:
-        with io.open(args.other_dict_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                word_count[native_to_unicode(line.strip())] = 1
+    # if args.with_other_dict:
+    #     with io.open(args.other_dict_path, 'r', encoding='utf-8') as f:
+    #         for line in f:
+    #             word_count[native_to_unicode(line.strip())] = 1

     # if args.is_local:
     #     for i in range(1, 100):
@@ -223,31 +223,37 @@
     #                 word_count[item] = word_count[item] + 1
     #             else:
     #                 word_count[item] = 1
-    if args.is_local:
-        with io.open(args.data_path + "/text8", encoding='utf-8') as f:
-            for line in f:
-                if args.with_other_dict:
-                    line = strip_lines(line)
-                    words = line.split()
-                    for item in words:
-                        if item in word_count:
-                            word_count[item] = word_count[item] + 1
-                        else:
-                            word_count[native_to_unicode('<UNK>')] += 1
-                else:
-                    line = text_strip(line)
-                    words = line.split()
-                    for item in words:
-                        if item in word_count:
-                            word_count[item] = word_count[item] + 1
-                        else:
-                            word_count[item] = 1
-    item_to_remove = []
-    for item in word_count:
-        if word_count[item] <= args.freq:
-            item_to_remove.append(item)
-    for item in item_to_remove:
-        del word_count[item]
+    # if args.is_local:
+    #     with io.open(args.data_path + "/text8", encoding='utf-8') as f:
+    #         for line in f:
+    #             if args.with_other_dict:
+    #                 line = strip_lines(line)
+    #                 words = line.split()
+    #                 for item in words:
+    #                     if item in word_count:
+    #                         word_count[item] = word_count[item] + 1
+    #                     else:
+    #                         word_count[native_to_unicode('<UNK>')] += 1
+    #             else:
+    #                 line = text_strip(line)
+    #                 words = line.split()
+    #                 for item in words:
+    #                     if item in word_count:
+    #                         word_count[item] = word_count[item] + 1
+    #                     else:
+    #                         word_count[item] = 1
+
+    # item_to_remove = []
+    # for item in word_count:
+    #     if word_count[item] <= args.freq:
+    #         item_to_remove.append(item)
+    # for item in item_to_remove:
+    #     del word_count[item]
+
+    with io.open(args.dict_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            word, count = line.split()[0], int(line.split()[1])
+            word_count[word] = count
     print(word_count)
     path_table, path_code, word_code_len = build_Huffman(word_count, 40)
diff --git a/fluid/PaddleRec/word2vec/train.py b/fluid/PaddleRec/word2vec/train.py
index 6fbe7a79..f19cc9d3 100644
--- a/fluid/PaddleRec/word2vec/train.py
+++ b/fluid/PaddleRec/word2vec/train.py
@@ -141,6 +141,7 @@ def convert_python_to_tensor(batch_size, sample_reader, is_hs):
             result = [[], []]
             for sample in sample_reader():
                 for i, fea in enumerate(sample):
+                    print(fea)
                     result[i].append(fea)
                 if len(result[0]) == batch_size:
                     tensor_result = []
--
GitLab
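
For readers following the preprocess.py hunk: the change stops counting words from the raw corpus on the fly and instead loads a pre-built frequency dictionary from --dict_path. Below is a minimal standalone sketch of that loading step, assuming the dictionary file holds one "word count" pair per line (the format the added parsing loop implies); load_word_count is a hypothetical helper name, not part of the patch:

    import io

    def load_word_count(dict_path):
        # Hypothetical helper mirroring the loading loop added in
        # preprocess.py: one "word count" pair per line (assumed format).
        word_count = {}
        with io.open(dict_path, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    continue  # skip blank or malformed lines
                word_count[fields[0]] = int(fields[1])
        return word_count

Note that the patched code calls line.split() twice per line; splitting once, as in the sketch, does the same work in a single pass and also tolerates short lines.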