diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index 38621af065913c9edd44958e9fb767c983c00dbb..8dc3ff6acd5bfea0db73fbbcd3513858b0a7f2f7 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -72,6 +72,8 @@ setup(name="py_paddle",
   packages=['py_paddle'],
   include_dirs = include_dirs,
   install_requires = [
+    'h5py',
+    'nltk',
     'numpy>=1.8.0',      # The numpy is required.
     'protobuf>=3.0.0'    # The paddle protobuf version
   ],
diff --git a/paddle/v2/data_set/config.py b/paddle/v2/data_set/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..69e96d65ef1ef868aff5d46ddf3af250ca11e641
--- /dev/null
+++ b/paddle/v2/data_set/config.py
@@ -0,0 +1,8 @@
+import os
+
+__all__ = ['DATA_HOME']
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle_data_set')
+
+if not os.path.exists(DATA_HOME):
+    os.makedirs(DATA_HOME)
diff --git a/paddle/v2/data_set/sentiment.py b/paddle/v2/data_set/sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..323fc214dd317a69a3f1df1d2ad0aace1c765340
--- /dev/null
+++ b/paddle/v2/data_set/sentiment.py
@@ -0,0 +1,87 @@
+import random
+import nltk
+import numpy as np
+from nltk.corpus import movie_reviews
+from config import DATA_HOME
+
+__all__ = ['train', 'test', 'get_label_dict', 'get_word_dict']
+# Samples before this index in the shuffled corpus form the training set,
+# the rest form the test set.
+SPLIT_NUM = 800
+# The movie_reviews corpus contains 1000 reviews per category ('neg', 'pos').
+TOTAL_DATASET_NUM = 1000
+
+
+def get_label_dict():
+    label_dict = {'neg': 0, 'pos': 1}
+    return label_dict
+
+
+def is_download_data():
+    # Download the movie_reviews corpus into DATA_HOME if it is not
+    # already available there.
+    try:
+        nltk.data.path.append(DATA_HOME)
+        movie_reviews.categories()
+    except LookupError:
+        print "Downloading movie_reviews corpus to %s ..." % DATA_HOME
+        nltk.download('movie_reviews', download_dir=DATA_HOME)
+        nltk.data.path.append(DATA_HOME)
+
+
+def get_word_dict():
+    # Return all corpus words sorted by descending frequency.
+    words_freq_sorted = list()
+    is_download_data()
+    words_freq = nltk.FreqDist(w.lower() for w in movie_reviews.words())
+    words_sort_list = words_freq.items()
+    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
+    for word, _ in words_sort_list:
+        words_freq_sorted.append(word)
+    return words_freq_sorted
+
+
+def load_sentiment_data():
+    # Each sample is ([corpus frequency of every word in a review], label).
+    label_dict = get_label_dict()
+    is_download_data()
+    words_freq = nltk.FreqDist(w.lower() for w in movie_reviews.words())
+    data_set = [([words_freq[word]
+                  for word in movie_reviews.words(fileid)], label_dict[category])
+                for category in movie_reviews.categories()
+                for fileid in movie_reviews.fileids(category)]
+    random.shuffle(data_set)
+    return data_set
+
+
+data_set = load_sentiment_data()
+
+
+def reader_creator(data_type):
+    if data_type == 'train':
+        for each in data_set[0:SPLIT_NUM]:
+            train_sentences = np.array(each[0], dtype=np.int32)
+            train_label = np.array(each[1], dtype=np.int8)
+            yield train_sentences, train_label
+    else:
+        for each in data_set[SPLIT_NUM:]:
+            test_sentences = np.array(each[0], dtype=np.int32)
+            test_label = np.array(each[1], dtype=np.int8)
+            yield test_sentences, test_label
+
+
+def train():
+    return reader_creator('train')
+
+
+def test():
+    return reader_creator('test')
+
+
+if __name__ == '__main__':
+    for train_sample in train():
+        print "train"
+        print train_sample
+    for test_sample in test():
+        print "test"
+        print test_sample
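
Usage sketch for the new readers, assuming the package installs so that the module is importable as paddle.v2.data_set.sentiment; the variable names below are illustrative only. train() and test() each return a generator yielding (sentence_array, label_array) pairs, so they can be iterated directly:

    # Importing the module triggers the corpus download (if needed) and the
    # in-memory load performed by the module-level load_sentiment_data() call.
    from paddle.v2.data_set import sentiment

    # Word list sorted by descending corpus frequency, via get_word_dict().
    word_dict = sentiment.get_word_dict()
    print "vocabulary size: %d" % len(word_dict)

    # Each sample is an int32 array of per-word corpus frequencies plus an
    # int8 label (0 = 'neg', 1 = 'pos', per get_label_dict()).
    for i, (sentences, label) in enumerate(sentiment.train()):
        print sentences.shape, label
        if i >= 2:  # inspect only the first few samples
            break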