diff --git a/fluid/PaddleRec/word2vec/cluster_train.sh b/fluid/PaddleRec/word2vec/cluster_train.sh
index 9a994a450a8d029e62c8f0b7d600d5f597c92e36..76e499598ce831f818e231aa3dc0958a16cad793 100644
--- a/fluid/PaddleRec/word2vec/cluster_train.sh
+++ b/fluid/PaddleRec/word2vec/cluster_train.sh
@@ -38,4 +38,5 @@ python train.py \
     --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
     --trainers 2 \
     --trainer_id 1 \
-    > trainer1.log 2>&1 &
\ No newline at end of file
+    > trainer1.log 2>&1 &
+
diff --git a/fluid/PaddleRec/word2vec/network_conf.py b/fluid/PaddleRec/word2vec/network_conf.py
index 05d9257178064bae8da45859308aa94ecf0ac88f..373681fe9c68f45857a9ae8ec6c337d70c1d692d 100644
--- a/fluid/PaddleRec/word2vec/network_conf.py
+++ b/fluid/PaddleRec/word2vec/network_conf.py
@@ -1,35 +1,65 @@
-import paddle.fluid as fluid
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+neural network for word2vec
+"""
+
+from __future__ import print_function
+
 import math
+import numpy as np
+import paddle.fluid as fluid
+
+def skip_gram_word2vec(dict_size, word_frequencys, embedding_size):
+    """Build the skip-gram word2vec graph and return (avg_cost, data_list)."""
+    def nce_layer(input, label, embedding_size, num_total_classes, num_neg_samples, sampler, custom_dist, sample_weight):
+        """Create the nce_w/nce_b parameters and return the fluid.layers.nce cost."""
+        # convert the custom sampling distribution (word frequencies) to a tensor
+        nid_freq_arr = np.array(custom_dist).astype('float32')
+        nid_freq_var = fluid.layers.assign(input=nid_freq_arr)
+
+        w_param_name = "nce_w"
+        b_param_name = "nce_b"
+        fluid.default_main_program().global_block().create_parameter(
+            shape=[num_total_classes, embedding_size], dtype='float32', name=w_param_name)
+        fluid.default_main_program().global_block().create_parameter(
+            shape=[num_total_classes, 1], dtype='float32', name=b_param_name)
+
+        cost = fluid.layers.nce(
+            input=input,
+            label=label,
+            num_total_classes=num_total_classes,
+            sampler=sampler,
+            custom_dist=nid_freq_var,
+            sample_weight=sample_weight,
+            param_attr=fluid.ParamAttr(name=w_param_name),
+            bias_attr=fluid.ParamAttr(name=b_param_name),
+            num_neg_samples=num_neg_samples)
+
+        return cost
 
-def skip_gram_word2vec(dict_size, embedding_size):
     input_word = fluid.layers.data(name="input_word", shape=[1], dtype='int64')
+    predict_word = fluid.layers.data(name='predict_word', shape=[1], dtype='int64')
+    data_list = [input_word, predict_word]
 
     emb = fluid.layers.embedding(
         input=input_word,
         size=[dict_size, embedding_size],
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
-            scale=1 / math.sqrt(dict_size))))
-
-    predict_word = fluid.layers.data(
-        name='predict_word', shape=[1], dtype='int64')
-
-    data_list = [input_word, predict_word]
-
-    w_param_name = "nce_w"
-    fluid.default_main_program().global_block().create_parameter(
-        shape=[dict_size, embedding_size], dtype='float32', name=w_param_name)
-
-    b_param_name = "nce_b"
-    fluid.default_main_program().global_block().create_parameter(
-        shape=[dict_size, 1], dtype='float32', name=b_param_name)
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1 / math.sqrt(dict_size))))
 
-    cost = fluid.layers.nce(input=emb,
-                            label=predict_word,
-                            num_total_classes=dict_size,
-                            param_attr=fluid.ParamAttr(name=w_param_name),
-                            bias_attr=fluid.ParamAttr(name=b_param_name),
-                            num_neg_samples=5)
+    cost = nce_layer(emb, predict_word, embedding_size, dict_size, 5, "uniform", word_frequencys, None)
 
     avg_cost = fluid.layers.reduce_mean(cost)
 
     return avg_cost, data_list
diff --git a/fluid/PaddleRec/word2vec/reader.py b/fluid/PaddleRec/word2vec/reader.py
index 5eb18f65a6a1138424023e2a64ce1b8c25f8a9d2..6f411283bd6133b90ae43e990de865c4084b4b7e 100644
--- a/fluid/PaddleRec/word2vec/reader.py
+++ b/fluid/PaddleRec/word2vec/reader.py
@@ -10,13 +10,22 @@ class Word2VecReader(object):
         self.data_path_ = data_path
         self.word_to_id_ = dict()
 
+        word_all_count = 0
+        word_counts = []
         word_id = 0
+
         with open(dict_path, 'r') as f:
             for line in f:
-                self.word_to_id_[line.split()[0]] = word_id
+                word, count = line.split()[0], int(line.split()[1])
+                self.word_to_id_[word] = word_id
                 word_id += 1
+                word_counts.append(count)
+                word_all_count += count
+
         self.dict_size = len(self.word_to_id_)
-        print("dict_size = " + str(self.dict_size))
+        # normalize the raw counts into per-word relative frequencies, indexed by word id
+        self.word_frequencys = [float(count) / word_all_count for count in word_counts]
+        print("dict_size = " + str(self.dict_size) + " word_all_count = " + str(word_all_count))
 
     def get_context_words(self, words, idx, window_size):
         """
diff --git a/fluid/PaddleRec/word2vec/train.py b/fluid/PaddleRec/word2vec/train.py
index a6c471320c378598aa371be07e953d736011e39e..8581a9c3303b737d3d052d2d17d87f0fa177939f 100644
--- a/fluid/PaddleRec/word2vec/train.py
+++ b/fluid/PaddleRec/word2vec/train.py
@@ -66,7 +66,7 @@ def parse_args():
         '--role',
         type=str,
         default='pserver',  # trainer or pserver
-        help='The path for model to store (default: models)')
+        help='The training role (trainer|pserver) (default: pserver)')
     parser.add_argument(
         '--endpoints',
         type=str,
@@ -76,12 +76,12 @@ def parse_args():
         '--current_endpoint',
         type=str,
         default='127.0.0.1:6000',
-        help='The path for model to store (default: 127.0.0.1:6000)')
+        help='The current pserver endpoint (default: 127.0.0.1:6000)')
     parser.add_argument(
         '--trainer_id',
         type=int,
         default=0,
-        help='The path for model to store (default: models)')
+        help='The current trainer id (default: 0)')
     parser.add_argument(
         '--trainers',
         type=int,
@@ -131,8 +131,11 @@ def train():
 
     word2vec_reader = reader.Word2VecReader(args.dict_path,
                                             args.train_data_path)
-    loss, data_list = skip_gram_word2vec(word2vec_reader.dict_size,
-                                         args.embedding_size)
+
+    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
+    logger.info("word_frequencys length: {}".format(len(word2vec_reader.word_frequencys)))
+
+    loss, data_list = skip_gram_word2vec(word2vec_reader.dict_size, word2vec_reader.word_frequencys, args.embedding_size)
 
     optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
     optimizer.minimize(loss)