diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index d222739ba2c0fe1913d7625b209f8b7fbd9e4f39..7c999752af22a58aca4ee9a504a909d97c53498d 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import mnist import imikolov import imdb diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 77c54bd268b5d988b0802a3edca91605e56f730e..5c6f5d85567fa19f2835ee4f3951531b6dfd3209 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -1,6 +1,20 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html """ + import cPickle import itertools import numpy diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index fcf4437ffaf329f52cc5bc997eff45dee200873c..397c9e66d495431f412c22b9b1d19ee32257b2dd 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import requests import hashlib import os diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py new file mode 100644 index 0000000000000000000000000000000000000000..7874161a05968921a8e6789a7ff6a229d6cc6c01 --- /dev/null +++ b/python/paddle/v2/dataset/conll05.py @@ -0,0 +1,205 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.common +import tarfile +import gzip +import itertools + +__all__ = ['test, get_dict', 'get_embedding'] +""" +Conll 2005 dataset. Paddle semantic role labeling Book and demo use this +dataset as an example. Because Conll 2005 is not free in public, the default +downloaded URL is test set of Conll 2005 (which is public). Users can change +URL and MD5 to their Conll dataset. +""" + +DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' +DATA_MD5 = '387719152ae52d60422c016e92a742fc' +WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' +VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' +TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' +EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' + +UNK_IDX = 0 + + +def load_dict(filename): + d = dict() + with open(filename, 'r') as f: + for i, line in enumerate(f): + d[line.strip()] = i + return d + + +def corpus_reader(data_path, words_name, props_name): + """ + Read one corpus. It returns an iterator. Each element of + this iterator is a tuple including sentence and labels. The sentence is + consist of a list of word IDs. The labels include a list of label IDs. + :return: a iterator of data. + :rtype: iterator + """ + + def reader(): + tf = tarfile.open(data_path) + wf = tf.extractfile(words_name) + pf = tf.extractfile(props_name) + with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile( + fileobj=pf) as props_file: + sentences = [] + labels = [] + one_seg = [] + for word, label in itertools.izip(words_file, props_file): + word = word.strip() + label = label.strip().split() + + if len(label) == 0: # end of sentence + for i in xrange(len(one_seg[0])): + a_kind_lable = [x[i] for x in one_seg] + labels.append(a_kind_lable) + + if len(labels) >= 1: + verb_list = [] + for x in labels[0]: + if x != '-': + verb_list.append(x) + + for i, lbl in enumerate(labels[1:]): + cur_tag = 'O' + is_in_bracket = False + lbl_seq = [] + verb_word = '' + for l in lbl: + if l == '*' and is_in_bracket == False: + lbl_seq.append('O') + elif l == '*' and is_in_bracket == True: + lbl_seq.append('I-' + cur_tag) + elif l == '*)': + lbl_seq.append('I-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') != -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') == -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = True + else: + raise RuntimeError('Unexpected label: %s' % + l) + + yield sentences, verb_list[i], lbl_seq + + sentences = [] + labels = [] + one_seg = [] + else: + sentences.append(word) + one_seg.append(label) + + pf.close() + wf.close() + tf.close() + + return reader + + +def reader_creator(corpus_reader, + word_dict=None, + predicate_dict=None, + label_dict=None): + def reader(): + for sentence, predicate, labels in corpus_reader(): + + sen_len = len(sentence) + + verb_index = labels.index('B-V') + mark = [0] * len(labels) + if verb_index > 0: + mark[verb_index - 1] = 1 + ctx_n1 = sentence[verb_index - 1] + else: + ctx_n1 = 'bos' + + if verb_index > 1: + mark[verb_index - 2] = 1 + ctx_n2 = sentence[verb_index - 2] + else: + ctx_n2 = 'bos' + + mark[verb_index] = 1 + ctx_0 = sentence[verb_index] + + if verb_index < len(labels) - 1: + mark[verb_index + 1] = 1 + ctx_p1 = sentence[verb_index + 1] + else: + ctx_p1 = 'eos' + + if verb_index < len(labels) - 2: + mark[verb_index + 2] = 1 + ctx_p2 = sentence[verb_index + 2] + else: + ctx_p2 = 'eos' + + word_idx = [word_dict.get(w, UNK_IDX) for w in sentence] + pred_idx = [predicate_dict.get(predicate)] * sen_len + + ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len + ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len + ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + + label_idx = [label_dict.get(w) for w in labels] + + yield word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, \ + ctx_0_idx, ctx_p1_idx, ctx_p2_idx, mark, label_idx + + return reader() + + +def get_dict(): + word_dict = load_dict( + common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) + verb_dict = load_dict( + common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) + label_dict = load_dict( + common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) + return word_dict, verb_dict, label_dict + + +def get_embedding(): + return common.download(EMB_URL, 'conll05st', EMB_MD5) + + +def test(): + word_dict, verb_dict, label_dict = get_dict() + reader = corpus_reader( + common.download(DATA_URL, 'conll05st', DATA_MD5), + words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', + props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') + return reader_creator(reader, word_dict, verb_dict, label_dict) + + +if __name__ == '__main__': + print get_embedding() + for f in test(): + print f diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index 433e37380f840f5b7ff619a5f64b99d2ad724b17..ffd7d89049358e979db762af07701e010f40dc6e 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -1,6 +1,3 @@ -# /usr/bin/env python -# -*- coding:utf-8 -*- - # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,6 +14,7 @@ """ IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz """ + import paddle.v2.dataset.common import tarfile import Queue diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index b3791ddad66e588356338150fccadbcc8fa113ca..285d3eaca8317c78dc84e99b4d524a0f4872c687 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -1,3 +1,16 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/ """ diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index ebcdff78b317ceb4811048ac78982e072962fa9c..6a621a2aaad14bf9598b838ce7c2ebf297bb0d30 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -1,3 +1,16 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ MNIST dataset. """ diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index dcffcff2f58c63d451761d37f14127d730faf621..c22bcfa38b5f501732768dd4f62d8e088d57a7ff 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import zipfile from common import download import re diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py index a2af45ecf508462fe4b596b5d8d6401c5b974eff..e0e18229da7818be5752ee592e094a00da286ad9 100644 --- a/python/paddle/v2/dataset/tests/cifar_test.py +++ b/python/paddle/v2/dataset/tests/cifar_test.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.v2.dataset.cifar import unittest diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py index 7d8406171b8478e4a8331637c5e867c18d5eb3d8..5babcef0eb4345d243904877d323c37d4889a643 100644 --- a/python/paddle/v2/dataset/tests/common_test.py +++ b/python/paddle/v2/dataset/tests/common_test.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.v2.dataset.common import unittest import tempfile diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py index e887af16634d2db04b8cf5fa0269a69991d8baac..c4d82f26895d77d05c6e936bd636b1239e1a0cd8 100644 --- a/python/paddle/v2/dataset/tests/imdb_test.py +++ b/python/paddle/v2/dataset/tests/imdb_test.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.v2.dataset.imdb import unittest import re diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py index b4408cc2f590d4d8da4ce5e98213cf7b208cfc15..1d344cac3e7483a351033570fbec75a4d19f4a55 100644 --- a/python/paddle/v2/dataset/tests/mnist_test.py +++ b/python/paddle/v2/dataset/tests/mnist_test.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.v2.dataset.mnist import unittest