Commit 772fd131 authored by Yibing Liu

Some improvements in DAM config

Parent d65c9edf
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+export CUDA_VISIBLE_DEVICES=0
 python -u ../test_and_evaluate.py --use_cuda \
     --ext_eval \
     --data_path ./data/data.pkl \
-    --save_path ./eval_10000 \
-    --model_path models/step_10000 \
-    --batch_size 100 \
+    --save_path ./eval_3900 \
+    --model_path models/step_3900 \
+    --channel1_num 16 \
+    --batch_size 200 \
     --vocab_size 172130 \
     --emb_size 200 \
     --_EOS_ 1
......
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+export CUDA_VISIBLE_DEVICES=0
+export FLAGS_eager_delete_tensor_gb=0.0
 python -u ../train_and_evaluate.py --use_cuda \
     --data_path ./data/data.pkl \
     --ext_eval \
     --word_emb_init ./data/word_embedding.pkl \
     --save_path ./models \
-    --batch_size 100 \
+    --batch_size 256 \
     --vocab_size 172130 \
+    --channel1_num 16 \
     --emb_size 200 \
     --_EOS_ 1
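(Note, not part of this commit: the two exports above can equally be set from Python before launching. Setting FLAGS_eager_delete_tensor_gb to 0.0 asks Paddle to free intermediate tensors eagerly, which helps fit the larger batch size on a single GPU. The launcher below is only an illustrative sketch; the subprocess wrapper is not part of the repo, and the flag values mirror the updated Douban training script.)

    # Illustrative launcher: same settings as the updated Douban training script.
    import os
    import subprocess

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"            # single visible GPU
    os.environ["FLAGS_eager_delete_tensor_gb"] = "0.0"  # eager tensor deletion

    subprocess.check_call([
        "python", "-u", "../train_and_evaluate.py", "--use_cuda",
        "--data_path", "./data/data.pkl",
        "--ext_eval",
        "--word_emb_init", "./data/word_embedding.pkl",
        "--save_path", "./models",
        "--batch_size", "256",
        "--vocab_size", "172130",
        "--channel1_num", "16",
        "--emb_size", "200",
        "--_EOS_", "1",
    ])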
@@ -6,18 +6,25 @@ import utils.layers as layers

 class Net(object):
     def __init__(self, max_turn_num, max_turn_len, vocab_size, emb_size,
-                 stack_num):
+                 stack_num, channel1_num, channel2_num):
         self._max_turn_num = max_turn_num
         self._max_turn_len = max_turn_len
         self._vocab_size = vocab_size
         self._emb_size = emb_size
         self._stack_num = stack_num
+        self._channel1_num = channel1_num
+        self._channel2_num = channel2_num

         self.word_emb_name = "shared_word_emb"

         self.use_stack_op = True
         self.use_mask_cache = True
         self.use_sparse_embedding = True

+    def set_word_embedding(self, word_emb, place):
+        word_emb_param = fluid.global_scope().find_var(
+            self.word_emb_name).get_tensor()
+        word_emb_param.set(word_emb, place)
+
     def create_network(self):
         mask_cache = dict() if self.use_mask_cache else None
@@ -136,7 +143,7 @@ class Net(object):
             t_a_r = fluid.layers.concat(input=t_a_r_stack, axis=1)
             r_a_t = fluid.layers.concat(input=r_a_t_stack, axis=1)

-            # sim shape: [batch_size, 2*(stack_num+2), max_turn_len, max_turn_len]
+            # sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len]
             sim = fluid.layers.matmul(
                 x=t_a_r, y=r_a_t, transpose_y=True, alpha=1 / np.sqrt(200.0))
             sim_turns.append(sim)
@@ -147,10 +154,9 @@ class Net(object):
         for index in xrange(len(sim_turns)):
             sim_turns[index] = fluid.layers.unsqueeze(
                 input=sim_turns[index], axes=[2])
-        # sim shape: [batch_size, 2*(stack_num+2), max_turn_num, max_turn_len, max_turn_len]
+        # sim shape: [batch_size, 2*(stack_num+1), max_turn_num, max_turn_len, max_turn_len]
         sim = fluid.layers.concat(input=sim_turns, axis=2)
-        # for douban
-        final_info = layers.cnn_3d(sim, 32, 16)
+        final_info = layers.cnn_3d(sim, self._channel1_num, self._channel2_num)

         loss, logits = layers.loss(final_info, label)
         return loss, logits
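(Note, not part of the diff: the corrected shape comments follow from the default stack_num=5 used by both entry scripts. A quick sanity check of the channel count fed into layers.cnn_3d:)

    # Each turn contributes the raw embeddings plus the stack_num attention
    # stacks, matched in both directions, so the 3D CNN sees
    # 2 * (stack_num + 1) input channels.
    stack_num = 5                       # default from parse_args()
    sim_channels = 2 * (stack_num + 1)
    assert sim_channels == 12
    # The new channel1_num / channel2_num arguments (defaults 32 and 16, the
    # values previously hardcoded as layers.cnn_3d(sim, 32, 16)) size the
    # outputs of the two conv3d layers.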
@@ -88,6 +88,16 @@ def parse_args():
         type=int,
         default=5,
         help='The number of stacked attentive modules in network.')
+    parser.add_argument(
+        '--channel1_num',
+        type=int,
+        default=32,
+        help="The channels' number of the 1st conv3d layer's output.")
+    parser.add_argument(
+        '--channel2_num',
+        type=int,
+        default=16,
+        help="The channels' number of the 2nd conv3d layer's output.")

     args = parser.parse_args()
     return args
@@ -109,7 +119,8 @@ def test(args):
     }

     dam = Net(args.max_turn_num, args.max_turn_len, args.vocab_size,
-              args.emb_size, args.stack_num)
+              args.emb_size, args.stack_num, args.channel1_num,
+              args.channel2_num)
     loss, logits = dam.create_network()

     loss.persistable = True
......
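(Note, not part of the diff: the conv3d parameter shapes now depend on channel1_num/channel2_num, so an evaluation run must pass the same values the checkpoint was trained with; the Douban scripts above pass --channel1_num 16 in both places. A condensed sketch of the updated test-side wiring, with illustrative values and an assumed import path:)

    # Condensed from the updated test(args) flow; max_turn_num/max_turn_len are
    # assumed defaults, the remaining values mirror the Douban evaluation script.
    import paddle.fluid as fluid
    from net import Net  # module name is an assumption

    dam = Net(max_turn_num=9, max_turn_len=50, vocab_size=172130, emb_size=200,
              stack_num=5, channel1_num=16, channel2_num=16)
    loss, logits = dam.create_network()

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # Load the persistable parameters saved at the matching training step.
    fluid.io.load_persistables(executor=exe, dirname="models/step_3900")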
@@ -83,6 +83,16 @@ def parse_args():
         type=int,
         default=5,
         help='The number of stacked attentive modules in network.')
+    parser.add_argument(
+        '--channel1_num',
+        type=int,
+        default=32,
+        help="The channels' number of the 1st conv3d layer's output.")
+    parser.add_argument(
+        '--channel2_num',
+        type=int,
+        default=16,
+        help="The channels' number of the 2nd conv3d layer's output.")

     args = parser.parse_args()
     return args
@@ -100,7 +110,8 @@ def train(args):
     }

     dam = Net(args.max_turn_num, args.max_turn_len, args.vocab_size,
-              args.emb_size, args.stack_num)
+              args.emb_size, args.stack_num, args.channel1_num,
+              args.channel2_num)
     loss, logits = dam.create_network()

     loss.persistable = True
@@ -131,6 +142,9 @@ def train(args):
         dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

     print("device count %d" % dev_count)
+    print("theoretical memory usage: ")
+    print(fluid.contrib.memory_usage(
+        program=train_program, batch_size=args.batch_size))

     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -152,7 +166,8 @@ def train(args):
         print("start loading word embedding init ...")
         word_emb = np.array(pickle.load(open(args.word_emb_init, 'rb'))).astype(
             'float32')
-        print("finish loading word embedding init ...")
+        dam.set_word_embedding(word_emb, place)
+        print("finish init word embedding ...")

     print("start loading data ...")
     train_data, val_data, test_data = pickle.load(open(args.data_path, 'rb'))
@@ -166,8 +181,6 @@ def train(args):
     print_step = max(1, batch_num / (dev_count * 100))
     save_step = max(1, batch_num / (dev_count * 10))

-    word_emb_inited = False
-
     print("begin model training ...")
     print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
@@ -182,12 +195,8 @@ def train(args):
            for dev in xrange(dev_count):
                index = it * dev_count + dev
                feed_dict = reader.make_one_batch_input(train_batches, index)
-               if word_emb_inited is False and args.word_emb_init is not None:
-                   feed_dict[dam.word_emb_name] = word_emb
                feed_list.append(feed_dict)

-           word_emb_inited = True
-
            cost = train_exe.run(feed=feed_list, fetch_list=[loss.name])

            ave_cost += np.array(cost[0]).mean()
......
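(Note, not part of the diff: pulled together, the training-side changes run in this order — the memory estimate is printed before the executor is built, and the pre-trained embedding is written into shared_word_emb once, right after the startup program runs, replacing the old word_emb_inited/feed_dict mechanism. The sketch below takes its names from the diff; train_program, args, place, and dam stand for the objects built around dam.create_network() and parse_args() and are not defined here.)

    # Condensed from the updated train(args) flow (illustrative, not verbatim).
    import pickle
    import numpy as np
    import paddle.fluid as fluid

    print("theoretical memory usage: ")
    print(fluid.contrib.memory_usage(
        program=train_program, batch_size=args.batch_size))

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if args.word_emb_init is not None:
        word_emb = np.array(pickle.load(open(args.word_emb_init, 'rb'))).astype('float32')
        # One-off initialization of the "shared_word_emb" parameter; the embedding
        # is no longer injected through feed_dict on the first batch.
        dam.set_word_embedding(word_emb, place)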
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+export CUDA_VISIBLE_DEVICES=0
 python -u ../test_and_evaluate.py --use_cuda \
     --data_path ./data/data.pkl \
-    --save_path ./ \
-    --model_path models/step_10000 \
-    --batch_size 100 \
+    --save_path ./step_3900 \
+    --model_path ./models/step_3900 \
+    --batch_size 200 \
     --vocab_size 434512 \
     --emb_size 200 \
     --_EOS_ 28270
......
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+export CUDA_VISIBLE_DEVICES=0
+export FLAGS_eager_delete_tensor_gb=0.0
 python -u ../train_and_evaluate.py --use_cuda \
     --data_path ./data/data.pkl \
     --word_emb_init ./data/word_embedding.pkl \
     --save_path ./models \
-    --batch_size 100 \
+    --batch_size 256 \
     --vocab_size 434512 \
     --emb_size 200 \
     --_EOS_ 28270
......