Commit 773c6557 authored by _青葱

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into ini

......@@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad
From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
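The loop structure mentioned above can be illustrated with a minimal sketch of Fluid's `While` block (this is an illustrative example assuming the `fluid.layers` API of this era; the names `i`, `limit`, and `cond` are placeholders, not taken from the diff):

```python
import paddle.fluid as fluid

# Counter and loop bound live in the program as 1-D tensors.
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)

# The loop condition is itself a tensor that the body must refresh.
cond = fluid.layers.less_than(x=i, y=limit)
while_op = fluid.layers.While(cond=cond)
with while_op.block():
    i = fluid.layers.increment(x=i, value=1, in_place=True)
    fluid.layers.less_than(x=i, y=limit, cond=cond)  # update the condition
```

Because the `with` block only records operators into the program description, the loop body is executed by the C++ `While` operator at run time rather than by the Python interpreter, which is why it can run much faster than a plain Python loop.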
## Turing Completeness
......
......@@ -114,23 +114,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
const auto *bias = ctx.Input<Tensor>("Bias");
auto *y = ctx.Output<Tensor>("Y");
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
// alloc memory
y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
auto handle = dev_ctx.cudnn_handle();
......@@ -159,6 +147,21 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
// Run training mode.
// obtain running mean and running inv var, and see if we need to
// initialize them.
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
double this_factor = 1. - momentum;
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
......
......@@ -288,9 +288,14 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
......@@ -310,9 +315,13 @@ void batched_gemm<platform::CUDADeviceContext, float>(
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
......@@ -332,9 +341,13 @@ void batched_gemm<platform::CUDADeviceContext, double>(
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
......
......@@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
mkldnn::memory({src_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(input_data)));
auto dst_memory =
mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
mkldnn::memory({dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(output_data)));
auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
*workspace_memory);
......@@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pool_bwd_desc, mkldnn_engine, *pool_pd);
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
mkldnn::memory({diff_src_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(in_x_grad_data)));
auto diff_dst_memory =
mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
mkldnn::memory({diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(out_grad_data)));
auto bwd_prim = mkldnn::pooling_backward(
pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prelu_op.h"
#include <string>
namespace paddle {
......
......@@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
bool flip = ctx->Attrs().Get<bool>("flip");
std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
if (max_sizes.size() > 0) {
......
......@@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/transform.h"
......@@ -22,23 +24,23 @@ namespace operators {
inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
bool flip,
std::vector<float>& output_aspect_ratior) {
std::vector<float>* output_aspect_ratior) {
constexpr float epsilon = 1e-6;
output_aspect_ratior.clear();
output_aspect_ratior.push_back(1.0f);
output_aspect_ratior->clear();
output_aspect_ratior->push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i];
bool already_exist = false;
for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
already_exist = true;
break;
}
}
if (!already_exist) {
output_aspect_ratior.push_back(ar);
output_aspect_ratior->push_back(ar);
if (flip) {
output_aspect_ratior.push_back(1.0f / ar);
output_aspect_ratior->push_back(1.0f / ar);
}
}
}
......@@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/rank_loss_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
......@@ -19,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride,
const framework::DDim& dst_dim,
const framework::DDim& dst_stride, T* dst) {
using namespace detail;
StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
paddle::operators::detail::StridedCopyDimVisitor<T> func(
dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim);
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
namespace paddle {
......@@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* src,
bool& firstStep, bool& is_empty,
Pair<T>& max, int dim,
bool* firstStep, bool* is_empty,
Pair<T>* max, int dim,
const int tid) {
if (beam > 0) {
int length = beam < beam_size ? beam : beam_size;
if (firstStep) {
firstStep = false;
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, src, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - beam) {
topk[k] = topk[k + beam];
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-INFINITY, -1);
}
}
if (!is_empty) {
GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
length);
}
}
max = topk[MaxLength - 1];
if (max.v == -1) is_empty = true;
beam = 0;
*max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* val,
int* col, bool& firstStep,
bool& is_empty, Pair<T>& max,
int* col, bool* firstStep,
bool* is_empty, Pair<T>* max,
int dim, const int tid) {
if (beam > 0) {
int length = beam < beam_size ? beam : beam_size;
if (firstStep) {
firstStep = false;
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - beam) {
topk[k] = topk[k + beam];
if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-INFINITY, -1);
}
}
if (!is_empty) {
GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
length);
}
}
max = topk[MaxLength - 1];
if (max.v == -1) is_empty = true;
beam = 0;
*max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
Pair<T> topk[], T** topVal,
int64_t** topIds, int& beam, int& k,
int64_t** topIds, int* beam, int* k,
const int tid, const int warp) {
while (true) {
__syncthreads();
......@@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
(*topVal)++;
(*topIds)++;
}
if (tid == maxid[0]) beam++;
if (--k == 0) break;
if (tid == maxid[0]) (*beam)++;
if (--(*k) == 0) break;
__syncthreads();
if (tid == maxid[0]) {
if (beam < MaxLength) {
sh_topk[tid] = topk[beam];
if (*beam < MaxLength) {
sh_topk[tid] = topk[*beam];
}
}
if (maxid[0] / 32 == warp) {
if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break;
}
}
}
......@@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
topk[k].set(-INFINITY, -1);
}
while (k) {
ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k,
src + blockIdx.x * lds, firststep,
is_empty, max, dim, tid);
ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
src + blockIdx.x * lds, &firststep,
&is_empty, &max, dim, tid);
sh_topk[tid] = topk[0];
BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
&indices, beam, k, tid, warp);
&indices, &beam, &k, tid, warp);
}
}
......@@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
KeMatrixTopK<T, 5, 256><<<
grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
.stream()>>>(output_data, output->dims()[1],
indices_data, input_data,
input_width, input_width, int(k));
.stream()>>>(
output_data, output->dims()[1], indices_data, input_data, input_width,
input_width, static_cast<int>(k));
}
};
......
......@@ -37,7 +37,7 @@ depth = 8
mix_hidden_lr = 1e-3
IS_SPARSE = True
PASS_NUM = 10
PASS_NUM = 100
BATCH_SIZE = 10
embedding_name = 'emb'
......@@ -77,7 +77,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
emb_layers.append(mark_embedding)
hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
for emb in emb_layers
]
hidden_0 = fluid.layers.sums(input=hidden_0_layers)
......@@ -94,8 +95,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
])
lstm = fluid.layers.dynamic_lstm(
......@@ -109,8 +110,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
])
return feature_out
......@@ -171,7 +172,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
# check other optimizers and check why out will be NAN
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.0001,
learning_rate=0.01,
decay_steps=100000,
decay_rate=0.5,
staircase=True))
......@@ -233,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
print("second per batch: " + str((time.time(
) - start_time) / batch_id))
# Set the threshold low to speed up the CI test
if float(pass_precision) > 0.05:
if float(pass_precision) > 0.01:
if save_dirname is not None:
# TODO(liuyiqun): Change the target to crf_decode
fluid.io.save_inference_model(save_dirname, [
......