提交 72ed55fb 编写于 作者: J JiabinYang

fix api to fit new hs op

上级 22d433f1
......@@ -62,10 +62,10 @@ def skip_gram_word2vec(dict_size,
cost = fluid.layers.hsigmoid(
input=input,
label=label,
non_leaf_num=non_leaf_num,
ptable=ptable,
pcode=pcode,
is_costum=True,
num_classes=non_leaf_num,
path_table=ptable,
path_code=pcode,
is_custom=True,
is_sparse=is_sparse)
return cost
......@@ -109,7 +109,8 @@ def skip_gram_word2vec(dict_size,
"uniform", word_frequencys, None)
cost = cost_nce
if with_hsigmoid:
cost_hs = hsigmoid_layer(emb, words[1], words[2], words[3], dict_size)
cost_hs = hsigmoid_layer(emb, words[1], words[2], words[3], dict_size,
is_sparse)
cost = cost_hs
if with_nce and with_hsigmoid:
cost = fluid.layers.elementwise_add(cost_nce, cost_hs)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
train for word2vec
"""
from __future__ import print_function
import argparse
......@@ -122,15 +105,17 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
start = time.clock()
exec_strategy = fluid.ExecutionStrategy()
if os.getenv("NUM_THREADS", ""):
exec_strategy.num_threads = int(os.getenv("NUM_THREADS"))
#if os.getenv("NUM_THREADS", ""):
# exec_strategy.num_threads = int(os.getenv("NUM_THREADS"))
print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
exec_strategy.num_threads = int(os.getenv("CPU_NUM"))
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
if int(os.getenv("CPU_NUM")) > 1:
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
train_exe = fluid.ParallelExecutor(
use_cuda=False,
......@@ -148,6 +133,7 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
epoch_start = time.time()
py_reader.start()
batch_id = 0
start = time.clock()
try:
while True:
......@@ -164,18 +150,26 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
profiler_step += 1
if batch_id % 10 == 0:
logger.info("TRAIN --> pass: {} batch: {} loss: {}".format(
pass_id, batch_id, loss_val.mean() / args.batch_size))
logger.info(
"TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
format(pass_id, batch_id,
loss_val.mean() / args.batch_size,
py_reader.queue.size()))
if batch_id % 100 == 0 and batch_id != 0:
elapsed = (time.clock() - start)
logger.info("Time used: {}".format(elapsed))
if batch_id % 1000 == 0 and batch_id != 0:
model_dir = args.model_output_dir + '/batch-' + str(
batch_id)
if trainer_id == 0:
fluid.io.save_inference_model(model_dir, data_name_list,
[loss], exe)
start = time.clock()
samples = 101 * args.batch_size * int(os.getenv("CPU_NUM"))
logger.info("Time used: {}, Samples/Sec: {}".format(
elapsed, samples / elapsed))
# elapsed = (time.clock() - start)
# start = time.clock()
# samples = 101 * args.batch_size * int(os.getenv("CPU_NUM"))
# logger.info("Time used: {}, Samples/Sec: {}".format(elapsed, samples/elapsed))
#if batch_id % 1000 == 0 and batch_id != 0:
# model_dir = args.model_output_dir + '/batch-' + str(batch_id)
# if trainer_id == 0:
# fluid.io.save_inference_model(model_dir, data_name_list, [loss], exe)
batch_id += 1
except fluid.core.EOFException:
......@@ -184,18 +178,14 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
print("Epoch: {0}, Train total expend: {1} ".format(
pass_id, epoch_end - epoch_start))
model_dir = args.model_output_dir + '/pass-' + str(pass_id)
if trainer_id == 0:
fluid.io.save_inference_model(model_dir, data_name_list,
[loss], exe)
#model_dir = args.model_output_dir + '/pass-' + str(pass_id)
#if trainer_id == 0:
# fluid.io.save_inference_model(model_dir, data_name_list, [loss], exe)
def train():
args = parse_args()
if not args.with_nce and not args.with_hs:
logger.error("with_nce or with_hs must choose one")
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
......@@ -213,12 +203,17 @@ def train():
args.with_nce,
is_sparse=args.is_sparse)
optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
#optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
optimizer.minimize(loss)
if os.getenv("PADDLE_IS_LOCAL", "1") == "1":
logger.info("run local training")
main_program = fluid.default_main_program()
with open("local.main.proto", "w") as f:
f.write(str(main_program))
train_loop(args, main_program, word2vec_reader, py_reader, loss, 0)
else:
logger.info("run dist training")
......@@ -227,15 +222,13 @@ def train():
trainers = int(os.environ["PADDLE_TRAINERS"])
training_role = os.environ["PADDLE_TRAINING_ROLE"]
ports = os.getenv("PADDLE_PSERVER_PORTS", "6174")
pserver_ip = os.getenv("PADDLE_IP", "")
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
eplist = []
for port in ports.split(","):
eplist.append(':'.join([pserver_ip, port]))
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist)
current_endpoint = pserver_ip + ":" + os.getenv("CUR_PORT", "2333")
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = False
......@@ -270,5 +263,31 @@ def train():
trainer_id)
def env_declar():
print("******** Rename Cluster Env to PaddleFluid Env ********")
print("Content-Type: text/plain\n\n")
for key in os.environ.keys():
print("%30s %s \n" % (key, os.environ[key]))
if os.environ["TRAINING_ROLE"] == "PSERVER" or os.environ[
"PADDLE_IS_LOCAL"] == "0":
os.environ["PADDLE_TRAINING_ROLE"] = os.environ["TRAINING_ROLE"]
os.environ["PADDLE_PSERVER_PORT"] = os.environ["PADDLE_PORT"]
os.environ["PADDLE_PSERVER_IPS"] = os.environ["PADDLE_PSERVERS"]
os.environ["PADDLE_TRAINERS"] = os.environ["PADDLE_TRAINERS_NUM"]
os.environ["PADDLE_CURRENT_IP"] = os.environ["POD_IP"]
os.environ["PADDLE_TRAINER_ID"] = os.environ["PADDLE_TRAINER_ID"]
os.environ["CPU_NUM"] = "12"
os.environ["NUM_THREADS"] = "12"
print("Content-Type: text/plain\n\n")
for key in os.environ.keys():
print("%30s %s \n" % (key, os.environ[key]))
print("****** Rename Cluster Env to PaddleFluid Env END ******")
if __name__ == '__main__':
#`env_declar()
train()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册