diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py index 9a1eae3f82a7b45d482282a8afed8fb476ad6a6a..10bc33ac69c41444491a220a7c7fd664b2ffb7c2 100644 --- a/python/paddle/dataset/sentiment.py +++ b/python/paddle/dataset/sentiment.py @@ -28,6 +28,9 @@ from itertools import chain import nltk from nltk.corpus import movie_reviews +import ssl +ssl._create_default_https_context = ssl._create_unverified_context +from functools import cmp_to_key import paddle.dataset.common @@ -68,7 +71,7 @@ def get_word_dict(): for words in movie_reviews.words(field): word_freq_dict[words] += 1 words_sort_list = list(six.iteritems(word_freq_dict)) - words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) + words_sort_list.sort(key=cmp_to_key(lambda a, b: b[1] - a[1])) for index, word in enumerate(words_sort_list): words_freq_sorted.append((word[0], index)) return words_freq_sorted diff --git a/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py b/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..f92c1e5264b3dd4347dc6bc1d2f958408f5d7b3a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TestCases for Dataset, +including create, config, run, etc. 
+"""
+
+from __future__ import print_function
+import numpy as np
+import unittest
+import os
+import paddle
+import zipfile
+import paddle.dataset.common
+import paddle.dataset.sentiment  # referenced explicitly by the test below
+
+URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip"
+MD5 = '155de2b77c6834dd8eea7cbe88e93acb'
+
+
+class TestDatasetSentiment(unittest.TestCase):
+    """ TestCases for Sentiment. """
+
+    def setUp(self):
+        """ Download movie_reviews.zip and unpack it under DATA_HOME. """
+        paddle.dataset.common.download(
+            URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip')
+        path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora')
+        filename = os.path.join(path, 'movie_reviews.zip')
+        with zipfile.ZipFile(filename) as zip_file:  # closed even on error
+            zip_file.extractall(path)
+
+    def test_get_word_dict(self):
+        """ Testcase for get_word_dict. """
+        words_freq_sorted = paddle.dataset.sentiment.get_word_dict()
+        self.assertEqual(len(words_freq_sorted), 39768)
+
+
+if __name__ == '__main__':
+    unittest.main()