Commit 773c6557, authored by _青葱

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into ini

@@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad
 From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
-We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
 ## Turing Completeness
......
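The `with`-wrapped loop mentioned in the doc above is compiled into a single `While` operator and executed inside the C++ runtime, rather than being interpreted step by step in Python. A minimal sketch, assuming the `fluid.layers.While` interface of this era:

```python
import paddle.fluid as fluid

# Counter loop expressed as one Fluid While op; the body is traced once
# at program-build time and then runs entirely inside the runtime.
i = fluid.layers.zeros(shape=[1], dtype='int64')
limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
cond = fluid.layers.less_than(x=i, y=limit)

while_op = fluid.layers.While(cond=cond)
with while_op.block():
    i = fluid.layers.increment(x=i, in_place=True)
    fluid.layers.less_than(x=i, y=limit, cond=cond)  # refresh loop condition
```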
@@ -114,23 +114,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     const auto *bias = ctx.Input<Tensor>("Bias");
     auto *y = ctx.Output<Tensor>("Y");
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
     // alloc memory
     y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-        functor;
-    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
-    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
     auto handle = dev_ctx.cudnn_handle();
@@ -159,6 +147,21 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       // Run training mode.
       // obtain running mean and running inv var, and see if we need to
       // initialize them.
+      auto *mean_out = ctx.Output<Tensor>("MeanOut");
+      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+      mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+      auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+      saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+          functor;
+      functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+      functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
       double this_factor = 1. - momentum;
       CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
......
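The hunk above defers allocation (and zero-filling) of the four training-only outputs — `MeanOut`, `VarianceOut`, `SavedMean`, `SavedVariance` — into the training branch, so the inference path no longer pays for buffers it never writes. A standalone sketch of the same pattern, with hypothetical stand-in types rather than the Paddle kernel:

```cpp
#include <cstdio>
#include <vector>

// Training-only buffers are created inside the branch that writes them,
// mirroring the hunk above.
void BatchNormForward(bool is_test) {
  std::vector<float> y(16);  // "Y" is produced on both paths
  if (is_test) {
    std::puts("inference: only Y allocated");
  } else {
    std::vector<float> saved_mean(4, 0.f);      // stands in for SavedMean
    std::vector<float> saved_variance(4, 0.f);  // stands in for SavedVariance
    std::puts("training: statistics buffers allocated and zeroed");
  }
}

int main() {
  BatchNormForward(/*is_test=*/true);
  BatchNormForward(/*is_test=*/false);
}
```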
@@ -288,9 +288,14 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
   // TODO(kexinzhao): add processing code for compute capability < 53 case
   PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
                     "cublas Hgemm requires GPU compute capability >= 53");
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
       strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }
 template <>
@@ -310,9 +315,13 @@ void batched_gemm<platform::CUDADeviceContext, float>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }
 template <>
@@ -332,9 +341,13 @@ void batched_gemm<platform::CUDADeviceContext, double>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }
 template <>
......
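All three hunks above wrap the `cublas*gemmStridedBatched` calls in the same compile-time gate: the strided-batched GEMM entry points first shipped with the cuBLAS of CUDA 8.0, so builds against older toolkits should fail with a clear message instead of failing to link. A minimal, CUDA-free sketch of the gate; the fallback `CUDA_VERSION` define below is only so the sketch compiles without the toolkit:

```cpp
#include <cstdio>

// cuda.h normally defines CUDA_VERSION (e.g. 8000 for CUDA 8.0); this
// fallback is just for demonstration outside a CUDA build.
#ifndef CUDA_VERSION
#define CUDA_VERSION 7050  // pretend we build against CUDA 7.5
#endif

int main() {
#if CUDA_VERSION >= 8000
  std::puts("dispatching to cublas*gemmStridedBatched");
#else
  std::puts("strided-batched GEMM unavailable: CUDA >= 8.0 required");
#endif
}
```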
@@ -14,6 +14,8 @@ limitations under the License. */
 #pragma once
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
......
@@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
     auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
+        mkldnn::memory({src_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(input_data)));
     auto dst_memory =
-        mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
+        mkldnn::memory({dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(output_data)));
     auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
                                              *workspace_memory);
@@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         pool_bwd_desc, mkldnn_engine, *pool_pd);
     auto diff_src_memory =
-        mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
+        mkldnn::memory({diff_src_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(in_x_grad_data)));
     auto diff_dst_memory =
-        mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
+        mkldnn::memory({diff_dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(out_grad_data)));
     auto bwd_prim = mkldnn::pooling_backward(
         pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
......
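The cast change above trades the C-style `(void*)` cast, which silently discards `const`, for an explicit `const_cast` wrapped in a `static_cast`, so the const-stripping is visible and grep-able. A tiny self-contained illustration:

```cpp
#include <iostream>

int main() {
  const float data[2] = {1.f, 2.f};
  // void* p = (void*)data;  // legal, but hides that const is cast away
  void* p = static_cast<void*>(const_cast<float*>(data));  // intent explicit
  std::cout << *static_cast<float*>(p) << "\n";
}
```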
@@ -14,6 +14,8 @@ limitations under the License. */
 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
......
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
......
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/prelu_op.h"
 #include <string>
 namespace paddle {
......
@@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     bool flip = ctx->Attrs().Get<bool>("flip");
     std::vector<float> aspect_ratios_vec;
-    ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
+    ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
     size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
     if (max_sizes.size() > 0) {
......
@@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     auto clip = ctx.Attr<bool>("clip");
     std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
     T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
     T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/transform.h"
@@ -22,23 +24,23 @@ namespace operators {
 inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
                                bool flip,
-                               std::vector<float>& output_aspect_ratior) {
+                               std::vector<float>* output_aspect_ratior) {
   constexpr float epsilon = 1e-6;
-  output_aspect_ratior.clear();
-  output_aspect_ratior.push_back(1.0f);
+  output_aspect_ratior->clear();
+  output_aspect_ratior->push_back(1.0f);
   for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
     float ar = input_aspect_ratior[i];
     bool already_exist = false;
-    for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
-      if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
+    for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
+      if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
         already_exist = true;
         break;
       }
     }
     if (!already_exist) {
-      output_aspect_ratior.push_back(ar);
+      output_aspect_ratior->push_back(ar);
       if (flip) {
-        output_aspect_ratior.push_back(1.0f / ar);
+        output_aspect_ratior->push_back(1.0f / ar);
       }
     }
   }
@@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     auto clip = ctx.Attr<bool>("clip");
     std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
     T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
     T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
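The refactor above follows the Google C++ style rule that output parameters be pointers rather than non-const references, so mutation is visible at every call site (`&aspect_ratios` instead of a bare `aspect_ratios`). A standalone copy of the refactored helper, runnable outside Paddle:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Post-refactor helper: the output vector is a pointer parameter.
void ExpandAspectRatios(const std::vector<float>& input, bool flip,
                        std::vector<float>* output) {
  constexpr float epsilon = 1e-6f;
  output->clear();
  output->push_back(1.0f);
  for (float ar : input) {
    bool already_exist = false;
    for (float v : *output) {
      if (std::fabs(ar - v) < epsilon) {
        already_exist = true;
        break;
      }
    }
    if (!already_exist) {
      output->push_back(ar);
      if (flip) output->push_back(1.0f / ar);  // also keep the flipped ratio
    }
  }
}

int main() {
  std::vector<float> ratios;
  ExpandAspectRatios({2.0f, 3.0f}, /*flip=*/true, &ratios);
  for (float r : ratios) std::printf("%g ", r);  // 1 2 0.5 3 0.333333
  std::printf("\n");
}
```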
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/rank_loss_op.h"
+#include <string>
 namespace paddle {
 namespace operators {
......
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <future>  // NOLINT
 #include <ostream>
 #include "paddle/fluid/framework/data_type.h"
@@ -19,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
 namespace paddle {
......
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <algorithm>
+#include <limits>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
......
@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
                           const framework::DDim& src_stride,
                           const framework::DDim& dst_dim,
                           const framework::DDim& dst_stride, T* dst) {
-  using namespace detail;
-  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
+  paddle::operators::detail::StridedCopyDimVisitor<T> func(
+      dev_ctx, src, src_stride, dst_stride, dst);
   boost::apply_visitor(func, dst_dim);
 }
......
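Dropping `using namespace detail;` matters because `StridedMemcpy` lives in a header: a using-directive there injects every `detail::` name into each translation unit that includes it. Fully qualifying the one name actually used keeps the header hygienic. A compact illustration with hypothetical names:

```cpp
#include <cstdio>

namespace detail {
inline int Copy() { return 42; }
}

// Prefer the fully qualified call below; a `using namespace detail;` in a
// header would also export detail::Copy (and everything else in detail)
// to every file that includes the header.
inline int StridedMemcpyLike() { return detail::Copy(); }

int main() { std::printf("%d\n", StridedMemcpyLike()); }
```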
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"
 namespace paddle {
@@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
 }
 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* src,
-                                              bool& firstStep, bool& is_empty,
-                                              Pair<T>& max, int dim,
+                                              bool* firstStep, bool* is_empty,
+                                              Pair<T>* max, int dim,
                                               const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, src, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - (*beam)) {
+          topk[k] = topk[k + *beam];
         } else {
          topk[k].set(-INFINITY, -1);
        }
      }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
                               length);
      }
    }
-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
  }
 }
 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* val,
-                                              int* col, bool& firstStep,
-                                              bool& is_empty, Pair<T>& max,
+                                              int* col, bool* firstStep,
+                                              bool* is_empty, Pair<T>* max,
                                               int dim, const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - *beam) {
+          topk[k] = topk[k + *beam];
         } else {
          topk[k].set(-INFINITY, -1);
        }
      }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
                               length);
      }
    }
-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
  }
 }
 template <typename T, int MaxLength, int BlockSize>
 __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
                                             Pair<T> topk[], T** topVal,
-                                            int64_t** topIds, int& beam, int& k,
+                                            int64_t** topIds, int* beam, int* k,
                                             const int tid, const int warp) {
   while (true) {
     __syncthreads();
@@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
       (*topVal)++;
       (*topIds)++;
     }
-    if (tid == maxid[0]) beam++;
-    if (--k == 0) break;
+    if (tid == maxid[0]) (*beam)++;
+    if (--(*k) == 0) break;
     __syncthreads();
     if (tid == maxid[0]) {
-      if (beam < MaxLength) {
-        sh_topk[tid] = topk[beam];
+      if (*beam < MaxLength) {
+        sh_topk[tid] = topk[*beam];
       }
     }
     if (maxid[0] / 32 == warp) {
-      if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
+      if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break;
     }
   }
 }
@@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
     topk[k].set(-INFINITY, -1);
   }
   while (k) {
-    ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k,
-                                           src + blockIdx.x * lds, firststep,
-                                           is_empty, max, dim, tid);
+    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
+                                           src + blockIdx.x * lds, &firststep,
+                                           &is_empty, &max, dim, tid);
     sh_topk[tid] = topk[0];
     BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
-                                         &indices, beam, k, tid, warp);
+                                         &indices, &beam, &k, tid, warp);
   }
 }
@@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     KeMatrixTopK<T, 5, 256><<<
         grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               ctx.device_context())
-                              .stream()>>>(output_data, output->dims()[1],
-                                           indices_data, input_data,
-                                           input_width, input_width, int(k));
+                              .stream()>>>(
+        output_data, output->dims()[1], indices_data, input_data, input_width,
+        input_width, static_cast<int>(k));
   }
 };
......
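This hunk converts the device helpers' in-out parameters from references (`int& beam`, `bool& firstStep`, `Pair<T>& max`) to pointers, again so every mutation is spelled out at the call site (`&beam`, `&firststep`, `&max`) and inside the function (`*beam`, `*firstStep`). A host-side sketch of the same conversion, with a hypothetical helper rather than the kernel:

```cpp
#include <cstdio>

// Before: void Step(int& beam, bool& first_step);
// After: pointer parameters make the writes visible as *beam, *first_step.
void Step(int* beam, bool* first_step) {
  if (*first_step) {
    *first_step = false;  // consume the first-iteration flag
  }
  *beam = 0;  // reset the beam counter, as ThreadGetTopK does
}

int main() {
  int beam = 3;
  bool first_step = true;
  Step(&beam, &first_step);  // the & signals mutation at the call site
  std::printf("beam=%d first_step=%d\n", beam, first_step);
}
```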
@@ -37,7 +37,7 @@ depth = 8
 mix_hidden_lr = 1e-3
 IS_SPARSE = True
-PASS_NUM = 10
+PASS_NUM = 100
 BATCH_SIZE = 10
 embedding_name = 'emb'
@@ -77,7 +77,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
     emb_layers.append(mark_embedding)
     hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
+        for emb in emb_layers
     ]
     hidden_0 = fluid.layers.sums(input=hidden_0_layers)
@@ -94,8 +95,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
     for i in range(1, depth):
         mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
-            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
         ])
         lstm = fluid.layers.dynamic_lstm(
@@ -109,8 +110,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
         input_tmp = [mix_hidden, lstm]
     feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
-        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
     ])
     return feature_out
@@ -171,7 +172,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
     # check other optimizers and check why out will be NAN
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.0001,
+            learning_rate=0.01,
             decay_steps=100000,
             decay_rate=0.5,
             staircase=True))
@@ -233,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
             print("second per batch: " + str((time.time(
             ) - start_time) / batch_id))
             # Set the threshold low to speed up the CI test
-            if float(pass_precision) > 0.05:
+            if float(pass_precision) > 0.01:
                 if save_dirname is not None:
                     # TODO(liuyiqun): Change the target to crf_decode
                     fluid.io.save_inference_model(save_dirname, [
......
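The key change in this test is adding `act='tanh'` to every `fluid.layers.fc` call: without an `act` argument the layer is purely affine, so summing several `fc` outputs still yields a linear map and the stacked SRL model loses its nonlinearity. A minimal sketch of the difference, with hypothetical input shapes:

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[32], dtype='float32')

linear = fluid.layers.fc(input=x, size=64)                 # affine only: Wx + b
nonlinear = fluid.layers.fc(input=x, size=64, act='tanh')  # tanh(Wx + b)
```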