Unverified · Commit ff60ed86 · Authored by Yibing Liu · Committed by GitHub

Merge pull request #1284 from kuke/refine_dam

Some improvements in DAM config
......@@ -55,6 +55,12 @@ for more detailed explanation about the arguments, please run
python ../train_and_evaluate.py --help
```
By default, training runs on a single GPU. It can be switched to multi-GPU mode by resetting the visible devices in `train.sh`, e.g.,
```
export CUDA_VISIBLE_DEVICES=0,1,2,3
```
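To double-check that the switch takes effect, the following minimal sketch (assuming a GPU build of PaddlePaddle Fluid on a machine that actually exposes four GPUs) prints how many devices Fluid can see:
```
# Illustrative check only: set CUDA_VISIBLE_DEVICES before any CUDA context is created,
# otherwise the setting is ignored.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"  # same value as in train.sh above

import paddle.fluid as fluid
print("visible GPU count:", fluid.core.get_cuda_device_count())  # 4 on a 4-GPU machine
```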
4) Run test by
```
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
python -u ../test_and_evaluate.py --use_cuda \
--ext_eval \
--data_path ./data/data.pkl \
--save_path ./eval_10000 \
--model_path models/step_10000 \
--batch_size 100 \
--save_path ./eval_3900 \
--model_path models/step_3900 \
--channel1_num 16 \
--batch_size 200 \
--vocab_size 172130 \
--emb_size 200 \
--_EOS_ 1
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
export FLAGS_eager_delete_tensor_gb=0.0
python -u ../train_and_evaluate.py --use_cuda \
--data_path ./data/data.pkl \
--ext_eval \
--word_emb_init ./data/word_embedding.pkl \
--save_path ./models \
--batch_size 100 \
--batch_size 256 \
--vocab_size 172130 \
--channel1_num 16 \
--emb_size 200 \
--_EOS_ 1
......@@ -6,18 +6,25 @@ import utils.layers as layers
class Net(object):
    def __init__(self, max_turn_num, max_turn_len, vocab_size, emb_size,
                 stack_num):
                 stack_num, channel1_num, channel2_num):
        self._max_turn_num = max_turn_num
        self._max_turn_len = max_turn_len
        self._vocab_size = vocab_size
        self._emb_size = emb_size
        self._stack_num = stack_num
        self._channel1_num = channel1_num
        self._channel2_num = channel2_num
        self.word_emb_name = "shared_word_emb"
        self.use_stack_op = True
        self.use_mask_cache = True
        self.use_sparse_embedding = True

    def set_word_embedding(self, word_emb, place):
        word_emb_param = fluid.global_scope().find_var(
            self.word_emb_name).get_tensor()
        word_emb_param.set(word_emb, place)

    def create_network(self):
        mask_cache = dict() if self.use_mask_cache else None
......@@ -136,7 +143,7 @@ class Net(object):
            t_a_r = fluid.layers.concat(input=t_a_r_stack, axis=1)
            r_a_t = fluid.layers.concat(input=r_a_t_stack, axis=1)

            # sim shape: [batch_size, 2*(stack_num+2), max_turn_len, max_turn_len]
            # sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len]
            sim = fluid.layers.matmul(
                x=t_a_r, y=r_a_t, transpose_y=True, alpha=1 / np.sqrt(200.0))
            sim_turns.append(sim)
......@@ -147,10 +154,9 @@ class Net(object):
        for index in xrange(len(sim_turns)):
            sim_turns[index] = fluid.layers.unsqueeze(
                input=sim_turns[index], axes=[2])
        # sim shape: [batch_size, 2*(stack_num+2), max_turn_num, max_turn_len, max_turn_len]
        # sim shape: [batch_size, 2*(stack_num+1), max_turn_num, max_turn_len, max_turn_len]
        sim = fluid.layers.concat(input=sim_turns, axis=2)

        # for douban
        final_info = layers.cnn_3d(sim, 32, 16)
        final_info = layers.cnn_3d(sim, self._channel1_num, self._channel2_num)
        loss, logits = layers.loss(final_info, label)
        return loss, logits
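For context, a minimal sketch of how the refactored `Net` is expected to be wired up after this change; the turn settings and vocabulary size are only illustrative (they mirror the defaults and the douban scripts), and the `model` import path is an assumption about the repository layout:
```
# Hedged sketch, not part of the commit: build the network with the two new channel
# arguments and set the shared word embedding once via the new helper, instead of
# feeding it through feed_dict on every batch.
import numpy as np
import paddle.fluid as fluid
from model import Net  # assumed module path

dam = Net(max_turn_num=9, max_turn_len=50, vocab_size=172130, emb_size=200,
          stack_num=5, channel1_num=32, channel2_num=16)
loss, logits = dam.create_network()

place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Stand-in for ./data/word_embedding.pkl; real usage loads the pickle instead.
word_emb = np.random.normal(size=(172130, 200)).astype("float32")
dam.set_word_embedding(word_emb, place)
```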
......@@ -88,6 +88,16 @@ def parse_args():
        type=int,
        default=5,
        help='The number of stacked attentive modules in the network.')
    parser.add_argument(
        '--channel1_num',
        type=int,
        default=32,
        help="The number of output channels of the 1st conv3d layer.")
    parser.add_argument(
        '--channel2_num',
        type=int,
        default=16,
        help="The number of output channels of the 2nd conv3d layer.")

    args = parser.parse_args()
    return args
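The two flags added above default to 32 and 16, matching the previously hardcoded `cnn_3d(sim, 32, 16)` call, and the douban scripts override `--channel1_num` to 16. A stripped-down, self-contained sketch of just that behavior (only these two arguments are reproduced here):
```
# Hedged sketch: minimal parser with only the two new options, showing the douban override.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--channel1_num', type=int, default=32)
parser.add_argument('--channel2_num', type=int, default=16)

args = parser.parse_args(['--channel1_num', '16'])  # as passed in the douban scripts
print(args.channel1_num, args.channel2_num)  # -> 16 16
```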
......@@ -109,7 +119,8 @@ def test(args):
    }

    dam = Net(args.max_turn_num, args.max_turn_len, args.vocab_size,
              args.emb_size, args.stack_num)
              args.emb_size, args.stack_num, args.channel1_num,
              args.channel2_num)
    loss, logits = dam.create_network()
    loss.persistable = True
......
......@@ -83,6 +83,16 @@ def parse_args():
        type=int,
        default=5,
        help='The number of stacked attentive modules in the network.')
    parser.add_argument(
        '--channel1_num',
        type=int,
        default=32,
        help="The number of output channels of the 1st conv3d layer.")
    parser.add_argument(
        '--channel2_num',
        type=int,
        default=16,
        help="The number of output channels of the 2nd conv3d layer.")

    args = parser.parse_args()
    return args
......@@ -100,7 +110,8 @@ def train(args):
    }

    dam = Net(args.max_turn_num, args.max_turn_len, args.vocab_size,
              args.emb_size, args.stack_num)
              args.emb_size, args.stack_num, args.channel1_num,
              args.channel2_num)
    loss, logits = dam.create_network()
    loss.persistable = True
......@@ -131,6 +142,9 @@ def train(args):
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("device count %d" % dev_count)
    print("theoretical memory usage: ")
    print(fluid.contrib.memory_usage(
        program=train_program, batch_size=args.batch_size))

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
......@@ -152,7 +166,8 @@ def train(args):
print("start loading word embedding init ...")
word_emb = np.array(pickle.load(open(args.word_emb_init, 'rb'))).astype(
'float32')
print("finish loading word embedding init ...")
dam.set_word_embedding(word_emb, place)
print("finish init word embedding ...")
print("start loading data ...")
train_data, val_data, test_data = pickle.load(open(args.data_path, 'rb'))
......@@ -166,8 +181,6 @@ def train(args):
    print_step = max(1, batch_num / (dev_count * 100))
    save_step = max(1, batch_num / (dev_count * 10))

    word_emb_inited = False

    print("begin model training ...")
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
......@@ -182,12 +195,8 @@ def train(args):
                for dev in xrange(dev_count):
                    index = it * dev_count + dev
                    feed_dict = reader.make_one_batch_input(train_batches, index)
                    if word_emb_inited is False and args.word_emb_init is not None:
                        feed_dict[dam.word_emb_name] = word_emb
                    feed_list.append(feed_dict)
                word_emb_inited = True

                cost = train_exe.run(feed=feed_list, fetch_list=[loss.name])
                ave_cost += np.array(cost[0]).mean()
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
python -u ../test_and_evaluate.py --use_cuda \
--data_path ./data/data.pkl \
--save_path ./ \
--model_path models/step_10000 \
--batch_size 100 \
--save_path ./step_3900 \
--model_path ./models/step_3900 \
--batch_size 200 \
--vocab_size 434512 \
--emb_size 200 \
--_EOS_ 28270
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
export FLAGS_eager_delete_tensor_gb=0.0
python -u ../train_and_evaluate.py --use_cuda \
--data_path ./data/data.pkl \
--word_emb_init ./data/word_embedding.pkl \
--save_path ./models \
--batch_size 100 \
--batch_size 256 \
--vocab_size 434512 \
--emb_size 200 \
--_EOS_ 28270
......