From a0c3465b6e8a91da1865971ff97ecf44360fd290 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 3 May 2017 18:36:51 +0800 Subject: [PATCH] Set filter threshold as a parameter for function build_dict of dataset imikolov --- python/paddle/v2/dataset/imikolov.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index 41ca27e23..bf88fe155 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -41,7 +41,7 @@ def word_count(f, word_freq=None): return word_freq -def build_dict(): +def build_dict(typo_freq=50): """ Build a word dictionary from the corpus, Keys of the dictionary are words, and values are zero-based IDs of these words. @@ -59,8 +59,7 @@ def build_dict(): # remove for now, since we will set it as last index del word_freq[''] - TYPO_FREQ = 50 - word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items()) + word_freq = filter(lambda x: x[1] > typo_freq, word_freq.items()) word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) words, _ = list(zip(*word_freq_sorted)) -- GitLab