From 812e21f3c4c14b8cf215fb1221b74814b132f301 Mon Sep 17 00:00:00 2001
From: wen-bo-yang
Date: Mon, 27 Feb 2017 17:43:28 +0800
Subject: [PATCH] add cross reading sample files and fix bugs

---
 paddle/setup.py.in                  |  2 +-
 paddle/v2/dataset/config.py         |  8 ----
 .../paddle}/v2/dataset/sentiment.py | 42 +++++++++++++------
 3 files changed, 30 insertions(+), 22 deletions(-)
 delete mode 100644 paddle/v2/dataset/config.py
 rename {paddle => python/paddle}/v2/dataset/sentiment.py (73%)

diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index d44f1145d..382d5be6e 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -72,7 +72,7 @@ setup(name="py_paddle",
       packages=['py_paddle'],
       include_dirs = include_dirs,
       install_requires = [
-        'nltk',
+        'nltk>=3.2.2',
         'numpy>=1.8.0',      # The numpy is required.
         'protobuf>=3.0.0'    # The paddle protobuf version
       ],
diff --git a/paddle/v2/dataset/config.py b/paddle/v2/dataset/config.py
deleted file mode 100644
index 304c4bc5c..000000000
--- a/paddle/v2/dataset/config.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import os
-
-__all__ = ['DATA_HOME']
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
-
-if not os.path.exists(DATA_HOME):
-    os.makedirs(DATA_HOME)
diff --git a/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
similarity index 73%
rename from paddle/v2/dataset/sentiment.py
rename to python/paddle/v2/dataset/sentiment.py
index 83581eadf..9825d2ef9 100644
--- a/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -20,9 +20,9 @@
 The script fetch and preprocess movie_reviews data set

 that provided by NLTK
 """
-
 import nltk
 import numpy as np
+from itertools import chain
 from nltk.corpus import movie_reviews
 from config import DATA_HOME
@@ -50,9 +50,10 @@ def download_data_if_not_yet():
     except LookupError:
         print "Downloading movie_reviews data set, please wait....."
         nltk.download('movie_reviews', download_dir=DATA_HOME)
-        print "Download data set success......"
         # make sure that nltk can find the data
         nltk.data.path.append(DATA_HOME)
+        print "Download data set success....."
+        print "Path is " + nltk.data.find('corpora/movie_reviews').path


 def get_word_dict():
@@ -67,24 +68,39 @@ def get_word_dict():
     words_sort_list = words_freq.items()
     words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
     for index, word in enumerate(words_sort_list):
-        words_freq_sorted.append(word[0])
+        words_freq_sorted.append((word[0], index + 1))
     return words_freq_sorted


+def sort_files():
+    """
+    Sorted the sample for cross reading the sample
+    :return:
+        files_list
+    """
+    files_list = list()
+    download_data_if_not_yet()
+    neg_file_list = movie_reviews.fileids('neg')
+    pos_file_list = movie_reviews.fileids('pos')
+    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
+    return files_list
+
+
 def load_sentiment_data():
     """
     Load the data set
     :return:
         data_set
     """
-    label_dict = get_label_dict()
+    data_set = list()
     download_data_if_not_yet()
-    words_freq = nltk.FreqDist(w.lower() for w in movie_reviews.words())
-    data_set = [([words_freq[word.lower()]
-                  for word in movie_reviews.words(fileid)],
-                 label_dict[category])
-                for category in movie_reviews.categories()
-                for fileid in movie_reviews.fileids(category)]
+    words_ids = dict(get_word_dict())
+    for sample_file in sort_files():
+        words_list = list()
+        category = 0 if 'neg' in sample_file else 1
+        for word in movie_reviews.words(sample_file):
+            words_list.append(words_ids[word.lower()])
+        data_set.append((words_list, category))
     return data_set


@@ -98,9 +114,9 @@ def reader_creator(data):
         train data set or test data set
     """
     for each in data:
-        sentences = np.array(each[0], dtype=np.int32)
-        labels = np.array(each[1], dtype=np.int8)
-        yield sentences, labels
+        list_of_int = np.array(each[0], dtype=np.int32)
+        label = each[1]
+        yield list_of_int, label


 def train():
--
GitLab