# Copyright (c) 2016 Baidu, Inc. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from paddle.trainer_config_helpers import * try: import cPickle as pickle except ImportError: import pickle is_predict = get_config_arg('is_predict', bool, False) META_FILE = 'data/meta.bin' with open(META_FILE, 'rb') as f: # load meta file meta = pickle.load(f) settings(batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer()) def construct_feature(name): """ Construct movie/user features. This method read from meta data. Then convert feature to neural network due to feature type. The map relation as follow. * id: embedding => fc * embedding: is_sequence: embedding => context_projection => fc => pool not sequence: embedding => fc * one_hot_dense: fc => fc Then gather all features vector, and use a fc layer to combined them as return. :param name: 'movie' or 'user' :type name: basestring :return: combined feature output :rtype: LayerOutput """ __meta__ = meta[name]['__meta__']['raw_meta'] fusion = [] for each_meta in __meta__: type_name = each_meta['type'] slot_name = each_meta.get('name', '%s_id' % name) if type_name == 'id': slot_dim = each_meta['max'] embedding = embedding_layer(input=data_layer(slot_name, size=slot_dim), size=256, param_attr=ParamAttr( sparse_update=True)) fusion.append(fc_layer(input=embedding, size=256)) elif type_name == 'embedding': is_seq = each_meta['seq'] == 'sequence' slot_dim = len(each_meta['dict']) din = data_layer(slot_name, slot_dim) embedding = embedding_layer(input=din, size=256) if is_seq: fusion.append( text_conv_pool(input=embedding, context_len=5, hidden_size=256)) else: fusion.append(fc_layer(input=embedding, size=256)) elif type_name == 'one_hot_dense': slot_dim = len(each_meta['dict']) hidden = fc_layer(input=data_layer(slot_name, slot_dim), size=256) fusion.append(fc_layer(input=hidden, size=256)) return fc_layer(name="%s_fusion" % name, input=fusion, size=256) movie_feature = construct_feature("movie") user_feature = construct_feature("user") similarity = cos_sim(a=movie_feature, b=user_feature) if not is_predict: outputs(regression_cost(input=similarity, label=data_layer('rating', size=1))) define_py_data_sources2('data/train.list', 'data/test.list', module='dataprovider', obj='process', args={'meta': meta}) else: outputs(similarity)