Commit c173e64b authored by T tangwei12

add is_sparse and args optimize

Parent 873b5475
@@ -28,12 +28,10 @@ def skip_gram_word2vec(dict_size,
                        embedding_size,
                        max_code_length=None,
                        with_hsigmoid=False,
-                       with_nce=True):
+                       with_nce=True,
+                       is_sparse=False):
 
     def nce_layer(input, label, embedding_size, num_total_classes,
-                  num_neg_samples, sampler, custom_dist, sample_weight):
-        # convert word_frequencys to tensor
-        nid_freq_arr = np.array(word_frequencys).astype('float32')
-        nid_freq_var = fluid.layers.assign(input=nid_freq_arr)
+                  num_neg_samples, sampler, word_frequencys, sample_weight):
         w_param_name = "nce_w"
         b_param_name = "nce_b"
@@ -48,11 +46,11 @@ def skip_gram_word2vec(dict_size,
             label=label,
             num_total_classes=num_total_classes,
             sampler=sampler,
-            custom_dist=nid_freq_var,
+            custom_dist=word_frequencys,
             sample_weight=sample_weight,
             param_attr=fluid.ParamAttr(name=w_param_name),
             bias_attr=fluid.ParamAttr(name=b_param_name),
-            num_neg_samples=num_neg_samples)
+            num_neg_samples=num_neg_samples, is_sparse=is_sparse)
 
         return cost
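Note: taken together, the two hunks above leave the NCE branch reading roughly as below. word_frequencys is now handed to custom_dist directly (the np.array / fluid.layers.assign round-trip is gone), and is_sparse is threaded in from skip_gram_word2vec's scope. A sketch assembled from the hunks, not verbatim source:

    def nce_layer(input, label, embedding_size, num_total_classes,
                  num_neg_samples, sampler, word_frequencys, sample_weight):
        w_param_name = "nce_w"
        b_param_name = "nce_b"
        cost = fluid.layers.nce(
            input=input,
            label=label,
            num_total_classes=num_total_classes,
            sampler=sampler,
            custom_dist=word_frequencys,  # plain list, no tensor conversion
            sample_weight=sample_weight,
            param_attr=fluid.ParamAttr(name=w_param_name),
            bias_attr=fluid.ParamAttr(name=b_param_name),
            num_neg_samples=num_neg_samples,
            is_sparse=is_sparse)  # closed over from skip_gram_word2vec
        return cost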
@@ -76,8 +74,8 @@ def skip_gram_word2vec(dict_size,
         non_leaf_num = dict_size
         cost = fluid.layers.hsigmoid(
-            input=emb,
-            label=predict_word,
+            input=input,
+            label=label,
             non_leaf_num=non_leaf_num,
             ptable=ptable,
             pcode=pcode,
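Note: this hunk fixes a closure-capture bug. The hsigmoid call used emb and predict_word captured from the enclosing skip_gram_word2vec instead of the helper's own input/label parameters, so whatever the caller passed to the helper was silently ignored. The pitfall in miniature (names are illustrative):

    def outer():
        x = "outer-x"

        def helper_buggy(x_arg):
            return x  # ignores its argument; uses the captured outer x

        def helper_fixed(x_arg):
            return x_arg  # uses the argument it was given

        print(helper_buggy("arg"))  # prints: outer-x
        print(helper_fixed("arg"))  # prints: arg

    outer()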
@@ -86,13 +84,13 @@ def skip_gram_word2vec(dict_size,
         return cost
 
     input_word = fluid.layers.data(name="input_word", shape=[1], dtype='int64')
-    predict_word = fluid.layers.data(
-        name='predict_word', shape=[1], dtype='int64')
+    predict_word = fluid.layers.data(name='predict_word', shape=[1], dtype='int64')
     cost = None
     data_list = [input_word, predict_word]
 
     emb = fluid.layers.embedding(
         input=input_word,
+        is_sparse=is_sparse,
         size=[dict_size, embedding_size],
         param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
             scale=1 / math.sqrt(dict_size))))
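Note: is_sparse is the point of the commit. With is_sparse=True, the embedding's gradient is emitted as selected rows (only the rows actually looked up) rather than a dense dict_size x embedding_size matrix, which keeps updates cheap for large vocabularies and is what distributed parameter-server training relies on. A minimal standalone sketch (sizes are illustrative):

    import math
    import paddle.fluid as fluid

    dict_size, embedding_size = 10000, 64  # illustrative sizes

    word = fluid.layers.data(name="word", shape=[1], dtype='int64')
    emb = fluid.layers.embedding(
        input=word,
        is_sparse=True,  # gradient is sparse rows, not a dense matrix
        size=[dict_size, embedding_size],
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
            scale=1 / math.sqrt(dict_size))))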
...
@@ -5,7 +5,7 @@ import logging
 import os
 import time
 
 # disable gpu training for this example
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
 import paddle
@@ -57,6 +57,31 @@ def parse_args():
         default=64,
         help='sparse feature hashing space for index processing')
+    parser.add_argument(
+        '--with_hs',
+        action='store_true',
+        required=False,
+        default=False,
+        help='using hierarchical sigmoid, (default: False)')
+    parser.add_argument(
+        '--with_nce',
+        action='store_true',
+        required=False,
+        default=True,
+        help='using negtive sampling, (default: True)')
+    parser.add_argument(
+        '--max_code_length',
+        type=int,
+        default=40,
+        help='max code length used by hierarchical sigmoid, (default: 40)')
+    parser.add_argument(
+        '--is_sparse',
+        type=bool,
+        default=False,
+        help='embedding and nce will use sparse or not, (default: False)')
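Note two argparse pitfalls in the flags added above: --with_nce pairs action='store_true' with default=True, so NCE can never be switched off from the command line; and --is_sparse uses type=bool, which argparse applies to the raw string, so --is_sparse False still yields True (bool('False') is truthy). A common workaround is an explicit converter; sketch below, with str2bool as an illustrative helper name, not part of this commit:

    import argparse

    def str2bool(v):
        # argparse's type=bool treats any non-empty string as True,
        # so parse the text explicitly instead.
        if v.lower() in ('yes', 'true', 't', '1'):
            return True
        if v.lower() in ('no', 'false', 'f', '0'):
            return False
        raise argparse.ArgumentTypeError('Boolean value expected.')

    parser = argparse.ArgumentParser()
    parser.add_argument('--is_sparse', type=str2bool, default=False)
    parser.add_argument('--with_nce', type=str2bool, default=True)
    args = parser.parse_args(['--is_sparse', 'True', '--with_nce', 'False'])
    print(args.is_sparse, args.with_nce)  # True False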
     parser.add_argument(
         '--is_local',
         type=int,
@@ -88,21 +113,6 @@ def parse_args():
         type=int,
         default=1,
         help='The num of trianers, (default: 1)')
-    parser.add_argument(
-        '--with_hs',
-        type=int,
-        default=0,
-        help='using hierarchical sigmoid, (default: 0)')
-    parser.add_argument(
-        '--with_nce',
-        type=int,
-        default=1,
-        help='using negtive sampling, (default: 1)')
-    parser.add_argument(
-        '--max_code_length',
-        type=int,
-        default=40,
-        help='max code length used by hierarchical sigmoid, (default: 40)')
 
     return parser.parse_args()
@@ -142,8 +152,7 @@ def train_loop(args, train_program, reader, data_list, loss, trainer_num,
                     [loss], exe)
             model_dir = args.model_output_dir + '/pass-' + str(pass_id)
             if args.trainer_id == 0:
-                fluid.io.save_inference_model(model_dir, data_name_list,
-                                              [loss], exe)
+                fluid.io.save_inference_model(model_dir, data_name_list, [loss], exe)
 
 
 def train():
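Note: a model directory written by save_inference_model can be reloaded with the matching load call; a minimal sketch (the pass-0 path follows the model_dir pattern above and is hypothetical):

    import paddle.fluid as fluid

    exe = fluid.Executor(fluid.CPUPlace())
    model_dir = './models/pass-0'  # hypothetical path written by train_loop
    [inference_program, feed_names, fetch_targets] = fluid.io.load_inference_model(
        model_dir, exe)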
@@ -156,12 +165,12 @@ def train():
                                    args.train_data_path)
     logger.info("dict_size: {}".format(word2vec_reader.dict_size))
+    logger.info("word_frequencys length: {}".format(
+        len(word2vec_reader.word_frequencys)))
 
     loss, data_list = skip_gram_word2vec(
         word2vec_reader.dict_size, word2vec_reader.word_frequencys,
-        args.embedding_size, args.max_code_length, args.with_hs, args.with_nce)
+        args.embedding_size, args.max_code_length,
+        args.with_hs, args.with_nce, is_sparse=args.is_sparse)
 
     optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
     optimizer.minimize(loss)
...