Commit f39ed679 authored by James Qin, committed by TensorFlower Gardener

Return correct dynamic shape from CuDNN RNN ParamsToCanonical kernel

PiperOrigin-RevId: 165389504
Parent 657c4432
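Editor's note (a sketch, not part of the commit): before this change, ParamsToCanonical returned a flat slice view of the opaque params buffer whenever the region happened to be aligned, so the op's outputs could come back flat, and the copied weight outputs used a transposed {width, height} shape rather than the canonical {height, width} matrix. The kernel now always allocates a correctly shaped output and copies the region into it. A minimal Python sketch of how a caller consumes the fixed outputs, mirroring the test added below (`lstm` and `cu_params` are assumed to be built as in that test):

    # Sketch: `lstm` is a cuDNN LSTM model, `cu_params` its opaque params blob.
    weights, biases = lstm.params_to_canonical(cu_params)
    # Each weight now arrives with its canonical 2-D shape, e.g.
    # [num_units, input_size] for input weights, so a plain transpose
    # (instead of reshape-then-transpose) prepares it for matmul(x, w).
    weights = [array_ops.transpose(w) for w in weights]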
@@ -100,13 +100,13 @@ enum class TFRNNInputMode {
 };
 
 namespace {
-using perftools::gputools::dnn::RnnMode;
-using perftools::gputools::dnn::RnnInputMode;
-using perftools::gputools::dnn::RnnDirectionMode;
-using perftools::gputools::dnn::ToDataType;
 using perftools::gputools::DeviceMemory;
 using perftools::gputools::DeviceMemoryBase;
 using perftools::gputools::ScratchAllocator;
+using perftools::gputools::dnn::RnnDirectionMode;
+using perftools::gputools::dnn::RnnInputMode;
+using perftools::gputools::dnn::RnnMode;
+using perftools::gputools::dnn::ToDataType;
 using perftools::gputools::port::StatusOr;
 
 Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
@@ -668,25 +668,13 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
       }
      CHECK(size == width * height) << "Params size mismatch. Expected "
                                    << width * height << ", got " << size;
-      // If data is aligned, use slice view to avoid expensive memcpy.
-      bool start_aligned =
-          rnn_desc->ParamsWeightRegions()[i].offset % EIGEN_MAX_ALIGN_BYTES ==
-          0;
-      bool size_aligned = size_in_bytes % EIGEN_MAX_ALIGN_BYTES == 0;
-      if (start_aligned && size_aligned) {
-        int start = rnn_desc->ParamsWeightRegions()[i].offset / sizeof(T);
-        int end = start + size_in_bytes / sizeof(T);
-        context->set_output(i, input.Slice(start, end));
-      } else {
-        Tensor* output = nullptr;
-        OP_REQUIRES_OK(context, context->allocate_output(
-                                    i, TensorShape({width, height}), &output));
-        DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
-            input_ptr, rnn_desc->ParamsWeightRegions()[i].offset,
-            size_in_bytes);
-        auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
-        stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
-      }
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context, context->allocate_output(
+                                  i, TensorShape({height, width}), &output));
+      DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
+          input_ptr, rnn_desc->ParamsWeightRegions()[i].offset, size_in_bytes);
+      auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
+      stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
     }
 
     OP_REQUIRES(context, num_params_ == rnn_desc->ParamsBiasRegions().size(),
@@ -700,24 +688,14 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
                   errors::InvalidArgument("Params size mismatch. Expected ",
                                           num_units, ", got ", size));
-      // If data is aligned, use slice view to avoid expensive memcpy.
-      bool start_aligned =
-          rnn_desc->ParamsBiasRegions()[i].offset % EIGEN_MAX_ALIGN_BYTES == 0;
-      bool size_aligned = size_in_bytes % EIGEN_MAX_ALIGN_BYTES == 0;
-      if (start_aligned && size_aligned) {
-        int start = rnn_desc->ParamsBiasRegions()[i].offset / sizeof(T);
-        int end = start + size_in_bytes / sizeof(T);
-        context->set_output(num_params_ + i, input.Slice(start, end));
-      } else {
-        Tensor* output = nullptr;
-        OP_REQUIRES_OK(context,
-                       context->allocate_output(num_params_ + i,
-                                                TensorShape({size}), &output));
-        DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
-            input_ptr, rnn_desc->ParamsBiasRegions()[i].offset, size_in_bytes);
-        auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
-        stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
-      }
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(num_params_ + i,
+                                              TensorShape({size}), &output));
+      DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
+          input_ptr, rnn_desc->ParamsBiasRegions()[i].offset, size_in_bytes);
+      auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
+      stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
     }
   }
......
+# -*- coding: utf-8 -*-
 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -942,6 +943,78 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
                                 dir_count, dropout, dtype, delta, tolerance)
 
 
+class CudnnRNNTestParamsToCanonical(TensorFlowTestCase):
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testLSTMParamsToCanonical(self):
+    """Test ParamsToCanonical kernel returns valid canonical weights."""
+    num_layers = 1
+    dir_count = 1
+    num_units = 2
+    input_size = 4
+    batch_size = 3
+    lstm = _CreateModel(
+        rnn_mode="lstm",
+        num_layers=num_layers,
+        num_units=num_units,
+        input_size=input_size,
+        input_mode="linear_input",
+        direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
+    params_size_t = lstm.params_size()
+    input_data = random_ops.random_uniform([1, batch_size, input_size])
+    input_h = random_ops.random_uniform([num_layers * dir_count, batch_size,
+                                         num_units])
+    input_c = random_ops.random_uniform([num_layers * dir_count, batch_size,
+                                         num_units])
+    cu_params = vs.get_variable(
+        "cu_params", initializer=random_ops.random_uniform([params_size_t]),
+        validate_shape=False)
+    output, _, output_c = lstm(
+        input_data=input_data,
+        input_h=input_h,
+        input_c=input_c,
+        params=cu_params,
+        is_training=False)
+    total_sum = math_ops.reduce_sum(output) + math_ops.reduce_sum(output_c)
+
+    # Subgraph manually computing the LSTM:
+    #   i_t = σ(w_i * x_t + r_i * h_(t-1) + b_wi + b_ri)
+    #   f_t = σ(w_f * x_t + r_f * h_(t-1) + b_wf + b_rf)
+    #   o_t = σ(w_o * x_t + r_o * h_(t-1) + b_wo + b_ro)
+    #   c'_t = tanh(w_c * x_t + r_c * h_(t-1) + b_wc + b_rc)
+    #   c_t = f_t ◦ c_(t-1) + i_t ◦ c'_t
+    #   h_t = o_t ◦ tanh(c_t)
+    wt, bs = lstm.params_to_canonical(cu_params)
+    # The kernel returns transposed weights.
+    wt = [array_ops.transpose(w) for w in wt]
+    wi, wf, wc, wo, ri, rf, rc, ro = wt
+    b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro = bs
+
+    x = array_ops.squeeze(input_data, 0)
+    h = array_ops.squeeze(input_h, 0)
+    c = array_ops.squeeze(input_c, 0)
+
+    i_g = math_ops.sigmoid(
+        math_ops.matmul(x, wi) + math_ops.matmul(h, ri) + b_wi + b_ri)
+    f_g = math_ops.sigmoid(
+        math_ops.matmul(x, wf) + math_ops.matmul(h, rf) + b_wf + b_rf)
+    c_g = math_ops.tanh(
+        math_ops.matmul(x, wc) + math_ops.matmul(h, rc) + b_wc + b_rc)
+    o_g = math_ops.sigmoid(
+        math_ops.matmul(x, wo) + math_ops.matmul(h, ro) + b_wo + b_ro)
+    c = f_g * c + i_g * c_g
+    h = o_g * math_ops.tanh(c)
+    actual_total_sum = math_ops.reduce_sum(h) + math_ops.reduce_sum(c)
+
+    with self.test_session(use_gpu=True) as sess:
+      variables.global_variables_initializer().run()
+      total_sum_v, actual_total_sum_v = sess.run(
+          [total_sum, actual_total_sum])
+      self.assertAllClose(total_sum_v, actual_total_sum_v)
+
+
 class CudnnRNNTestBidirectional(TensorFlowTestCase):
 
   # TODO(jamesqin): Test multi-layer bi-Cudnn.
......
@@ -38,10 +38,6 @@ _cudnn_rnn_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so"))
 
 _flatten_transpose = lambda t: array_ops.reshape(array_ops.transpose(t), [-1])
-# pylint: disable=g-long-lambda
-_transpose_reshape = lambda t, shape: array_ops.transpose(
-    array_ops.reshape(t, shape))
-# pylint: enable=g-long-lambda
 
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
 CUDNN_RNN_BIDIRECTION = "bidirectional"
@@ -242,8 +238,6 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     transformed_weights, transformed_biases = [], []
     for i in range(self._cudnn_rnn.num_layers):
       base_idx = i * 8
-      num_units = self._cudnn_rnn.num_units
-      input_size = self._cudnn_rnn.input_size if i == 0 else num_units
       # cuDNN tensor shapes per time_step:
       # input.shape: [batch_size, input_size],
       # input_weights.shape: [num_units, input_size] (first layer)
@@ -260,16 +254,11 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
       # Stitch weights together in this layer.
       stitched_w = []
       for j in range(4):
-        stitched_w.append(
-            array_ops.concat(
-                [
-                    array_ops.reshape(weights[base_idx + j],
-                                      [num_units, input_size]),
-                    array_ops.reshape(weights[base_idx + j + 4],
-                                      [num_units, num_units])
-                ],
-                axis=1))
+        stitched_w.append(
+            array_ops.concat(
+                [weights[base_idx + j], weights[base_idx + j + 4]], axis=1))
       # cuDNN weights are in ifco order, convert to icfo order.
       self._switch_inner(stitched_w, 0)
       transformed_weights.append(
@@ -307,8 +296,6 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     transformed_weights, transformed_biases = [], []
     for i in range(self._cudnn_rnn.num_layers):
      base_idx = i * 6
-      num_units = self._cudnn_rnn.num_units
-      input_size = self._cudnn_rnn.input_size if i == 0 else num_units
       # cuDNN tensor shapes per time_step:
       # input.shape: [batch_size, input_size],
       # input_weights.shape: [num_units, input_size] (first layer)
@@ -335,19 +322,14 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
         stitched_w.append(
             array_ops.concat(
                 [
-                    array_ops.reshape(weights[base_idx + j],
-                                      [num_units, input_size]),
-                    array_ops.reshape(weights[base_idx + j + 3],
-                                      [num_units, num_units])
-                ],
-                axis=1))
+                    weights[base_idx + j],
+                    weights[base_idx + j + 3],
+                ], axis=1))
       transformed_weights.append(
           array_ops.transpose(array_ops.concat(stitched_w[:2], axis=0)))
       # weights for new memory gate are kept separate.
-      transformed_weights.append(
-          _transpose_reshape(weights[base_idx + 2], [num_units, input_size]))
-      transformed_weights.append(
-          _transpose_reshape(weights[base_idx + 5], [num_units, num_units]))
+      transformed_weights.append(array_ops.transpose(weights[base_idx + 2]))
+      transformed_weights.append(array_ops.transpose(weights[base_idx + 5]))
 
       # Bias for reset and update gates.
       b_r = array_ops.concat(biases[base_idx:base_idx + 2], axis=0)
......
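Editor's note (a sketch, not part of the commit): the RNNParamsSaveable simplifications above follow directly from the kernel fix. Because each canonical weight now already carries its 2-D shape ([num_units, input_size] for input weights, [num_units, num_units] for recurrent ones), the explicit array_ops.reshape calls become redundant and _transpose_reshape collapses to a plain array_ops.transpose. A hedged before/after sketch using the names from the GRU code above:

    # Before the fix: outputs were effectively flat, so an explicit reshape
    # was needed to recover the matrix layout.
    w_old = array_ops.reshape(weights[base_idx + j], [num_units, input_size])
    # After the fix: the kernel output is already [num_units, input_size],
    # so the tensor can be concatenated or transposed directly.
    w_new = weights[base_idx + j]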