# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from paddle.trainer.PyDataProvider2 import * import collections import logging import pdb logging.basicConfig( format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', ) logger = logging.getLogger('paddle') logger.setLevel(logging.INFO) N = 5 # Ngram cutoff = 50 # select words with frequency > cutoff to dictionary def build_dict(ftrain, fdict): sentences = [] with open(ftrain) as fin: for line in fin: line = [''] + line.strip().split() + [''] sentences += line wordfreq = collections.Counter(sentences) wordfreq = filter(lambda x: x[1] > cutoff, wordfreq.items()) dictionary = sorted(wordfreq, key=lambda x: (-x[1], x[0])) words, _ = list(zip(*dictionary)) for word in words: print >> fdict, word word_idx = dict(zip(words, xrange(len(words)))) logger.info("Dictionary size=%s" % len(words)) return word_idx def initializer(settings, srcText, dictfile, **xargs): with open(dictfile, 'w') as fdict: settings.dicts = build_dict(srcText, fdict) input_types = [] for i in xrange(N): input_types.append(integer_value(len(settings.dicts))) settings.input_types = input_types @provider(init_hook=initializer) def process(settings, filename): UNKID = settings.dicts[''] with open(filename) as fin: for line in fin: line = [''] * (N - 1) + line.strip().split() + [''] line = [settings.dicts.get(w, UNKID) for w in line] for i in range(N, len(line) + 1): yield line[i - N:i]