提交 814a7590 编写于 作者: S sneaxiy

merge develop

test=develop
...@@ -11,7 +11,7 @@ paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, ...@@ -11,7 +11,7 @@ paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None,
paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6')) paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6'))
paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2')) paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '78e512cabeda9c7f42cb7c7e88967ae7')) paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45')) paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45'))
paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0')) paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2')) paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))
......
...@@ -38,10 +38,10 @@ if(WITH_GPU) ...@@ -38,10 +38,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util) add_dependencies(tensor tensor_util)
else() else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
endif(WIN32) endif(WIN32)
else() else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler) cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
endif() endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -76,11 +77,11 @@ struct BuildStrategy { ...@@ -76,11 +77,11 @@ struct BuildStrategy {
bool fuse_relu_depthwise_conv_{false}; bool fuse_relu_depthwise_conv_{false};
bool memory_optimize_{false}; bool memory_optimize_{true};
// TODO(dzhwinter): // TODO(dzhwinter):
// make enable_inplace, memory_optimize_ // make enable_inplace, memory_optimize_
// memory_early_delete_ true by default // memory_early_delete_ true by default
bool enable_inplace_{false}; bool enable_inplace_{true};
bool enable_sequential_execution_{false}; bool enable_sequential_execution_{false};
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -138,19 +137,16 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -138,19 +137,16 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->CPU");
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place); auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place); auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) && } else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CPU->GPU");
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place); auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) && } else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->GPU");
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(3) << "Skip copy the same data from " << src_place << " to " VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place; << dst_place;
...@@ -161,7 +157,6 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -161,7 +157,6 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cuda_pinned_place(src_place) && } else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place); auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
......
add_subdirectory(detail) add_subdirectory(detail)
add_subdirectory(allocation) add_subdirectory(allocation)
cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler) cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(memory cc_library(memory
......
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include <cstring> // for memcpy #include <cstring> // for memcpy
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -30,23 +29,14 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst, ...@@ -30,23 +29,14 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
// NOTE(zcd): Do not use GpuMemcpySync as much as possible.
// because GpuMemcpySync issues the copying command to the default stream,
// which will make two commands from different streams cannot run concurrently.
// Reference:
// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
template <> template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>( void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) { const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
// FIXME(zjl): do we really need it? // FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) { if (num <= kMaxGpuAsyncCopyBytes) {
...@@ -61,10 +51,8 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>( ...@@ -61,10 +51,8 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
const void* src, size_t num, cudaStream_t stream) { const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
// FIXME(zjl): do we really need it? // FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) { if (num <= kMaxGpuAsyncCopyBytes) {
...@@ -80,19 +68,15 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>( ...@@ -80,19 +68,15 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (dst_place == src_place) { if (dst_place == src_place) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
} }
} else { } else {
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, stream); num, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num); num);
} }
...@@ -127,10 +111,8 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>( ...@@ -127,10 +111,8 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
cudaStream_t stream) { cudaStream_t stream) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
} }
} }
...@@ -142,10 +124,8 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>( ...@@ -142,10 +124,8 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
cudaStream_t stream) { cudaStream_t stream) {
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
} }
} }
......
...@@ -127,6 +127,12 @@ void Conv2DTransposeOpMaker::Make() { ...@@ -127,6 +127,12 @@ void Conv2DTransposeOpMaker::Make() {
"output feature channels," "output feature channels,"
"H is the height of the filter, and W is the width of the filter. " "H is the height of the filter, and W is the width of the filter. "
"We enforce groups number == 1 in the convolution transpose scenario."); "We enforce groups number == 1 in the convolution transpose scenario.");
AddInput("Bias",
"(Tensor) Bias to be added to each output of filter application."
"The format of output tensor is X (one-dimensional) of size equal"
"to the number of output channels. Only used with MKL-DNN.")
.AsDispensable();
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator. " "(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is also NCHW."); "The format of output tensor is also NCHW.");
......
...@@ -55,4 +55,4 @@ void BuildTanhGradNode( ...@@ -55,4 +55,4 @@ void BuildTanhGradNode(
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(relu_grad, BuildReluGradNode); REGISTER_NG_OP(relu_grad, BuildReluGradNode);
REGISTER_NG_OP(than_grad, BuildTanhGradNode); REGISTER_NG_OP(tanh_grad, BuildTanhGradNode);
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace reader { namespace reader {
...@@ -51,10 +50,9 @@ BufferedReader::BufferedReader( ...@@ -51,10 +50,9 @@ BufferedReader::BufferedReader(
.Get(place_))) .Get(place_)))
->stream(); ->stream();
events.resize(buffer_size); events.resize(buffer_size);
PADDLE_ENFORCE(cudaStreamCreate(&stream)); for (auto &event : events)
for (auto &event : events) {
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
} PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
} }
#endif #endif
cpu_buffer_.resize(buffer_size); cpu_buffer_.resize(buffer_size);
...@@ -86,15 +84,12 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -86,15 +84,12 @@ void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// NOTE(liangdun): using async copy instead of TensorCopySync // NOTE(liangdun): using async copy instead of TensorCopySync
// TensorCopySync would block other stream, because TensorCopySync // TensorCopySync would block other stream
// issues the copying command to the default stream, it will make two
// commands from different streams cannot run concurrently.
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device); platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0)); PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
TensorVec &gpu = gpu_buffer_[i]; TensorVec &gpu = gpu_buffer_[i];
gpu.resize(cpu.size()); gpu.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
for (size_t i = 0; i < cpu.size(); ++i) { for (size_t i = 0; i < cpu.size(); ++i) {
gpu[i].Resize(cpu[i].dims()); gpu[i].Resize(cpu[i].dims());
gpu[i].set_layout(cpu[i].layout()); gpu[i].set_layout(cpu[i].layout());
...@@ -103,19 +98,20 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -103,19 +98,20 @@ void BufferedReader::ReadAsync(size_t i) {
auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
auto size = auto size =
cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
if (platform::is_cuda_pinned_place(cpu_place)) { if (platform::is_cuda_pinned_place(cpu_place))
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPinnedPlace>(cpu_place), boost::get<platform::CUDAPinnedPlace>(cpu_place),
cpu_ptr, size, stream); cpu_ptr, size, stream);
} else if ((platform::is_gpu_place(cpu_place))) { else if ((platform::is_gpu_place(cpu_place)))
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr, boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
size, stream); size, stream);
} else { else
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size, boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
stream); 0);
}
gpu[i].set_lod(cpu[i].lod()); gpu[i].set_lod(cpu[i].lod());
} }
PADDLE_ENFORCE(cudaStreamSynchronize(stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream));
......
...@@ -64,8 +64,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> { ...@@ -64,8 +64,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<LoDTensor>("Out"); auto* out = ctx.Output<LoDTensor>("Out");
auto lod = in->lod(); auto lod = in->lod();
PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(),
PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
"The actual size mismatches with the LoD information."); "The actual size mismatches with the LoD information.");
auto tokens = ctx.Attr<std::vector<int>>("tokens"); auto tokens = ctx.Attr<std::vector<int>>("tokens");
auto in_len = in->numel(); auto in_len = in->numel();
...@@ -85,10 +84,9 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> { ...@@ -85,10 +84,9 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
num_erased.begin() + 1); num_erased.begin() + 1);
// Copy LoD to GPU // Copy LoD to GPU
auto lod0 = lod[0]; auto last_lod = lod[lod.size() - 1];
auto lod_len = lod0.size(); auto lod_len = last_lod.size();
const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace()); const size_t* dev_in_lod_ptr = last_lod.CUDAData(ctx.GetPlace());
// Calc output LoD // Calc output LoD
thrust::device_vector<size_t> dev_out_lod(lod_len); thrust::device_vector<size_t> dev_out_lod(lod_len);
size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
...@@ -96,13 +94,16 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> { ...@@ -96,13 +94,16 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_CUDA_NUM_THREADS, 0, stream>>>( PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
// Set LoD for output // Set LoD for output
std::vector<size_t> out_lod0(dev_out_lod.begin(), dev_out_lod.end()); std::vector<size_t> out_last_lod(dev_out_lod.begin(), dev_out_lod.end());
framework::LoD out_lod; framework::LoD out_lod;
out_lod.push_back(out_lod0); for (size_t i = 0; i < lod.size() - 1; ++i) {
out_lod.push_back(lod[i]);
}
out_lod.push_back(out_last_lod);
out->set_lod(out_lod); out->set_lod(out_lod);
// Set output // Set output
out->Resize({static_cast<int64_t>(out_lod0.back()), 1}); out->Resize({static_cast<int64_t>(out_last_lod.back()), 1});
auto out_dat = out->mutable_data<T>(ctx.GetPlace()); auto out_dat = out->mutable_data<T>(ctx.GetPlace());
SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len,
......
...@@ -28,19 +28,18 @@ class SequenceEraseKernel : public framework::OpKernel<T> { ...@@ -28,19 +28,18 @@ class SequenceEraseKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<framework::LoDTensor>("Out"); auto* out = ctx.Output<framework::LoDTensor>("Out");
auto lod = in->lod(); auto lod = in->lod();
PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(),
PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
"The actual size mismatches with the LoD information."); "The actual size mismatches with the LoD information.");
auto tokens = ctx.Attr<std::vector<int>>("tokens"); auto tokens = ctx.Attr<std::vector<int>>("tokens");
auto in_len = in->numel(); auto in_len = in->numel();
auto in_dat = in->data<T>(); auto in_dat = in->data<T>();
auto lod0 = lod[0]; auto last_lod = lod[lod.size() - 1];
std::vector<size_t> num_erased(in_len + 1, 0); std::vector<size_t> num_erased(in_len + 1, 0);
std::vector<size_t> out_lod0(1, 0); std::vector<size_t> out_last_lod(1, 0);
for (size_t i = 0; i < lod0.size() - 1; ++i) { for (size_t i = 0; i < last_lod.size() - 1; ++i) {
size_t num_out = 0; size_t num_out = 0;
for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) { for (auto j = last_lod[i] + 1; j <= last_lod[i + 1]; ++j) {
num_erased[j] = num_erased[j - 1]; num_erased[j] = num_erased[j - 1];
if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) != if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) !=
tokens.end()) { tokens.end()) {
...@@ -49,7 +48,7 @@ class SequenceEraseKernel : public framework::OpKernel<T> { ...@@ -49,7 +48,7 @@ class SequenceEraseKernel : public framework::OpKernel<T> {
num_out += 1; num_out += 1;
} }
} }
out_lod0.push_back(out_lod0.back() + num_out); out_last_lod.push_back(out_last_lod.back() + num_out);
} }
auto out_len = in_len - num_erased[in_len]; auto out_len = in_len - num_erased[in_len];
...@@ -62,7 +61,10 @@ class SequenceEraseKernel : public framework::OpKernel<T> { ...@@ -62,7 +61,10 @@ class SequenceEraseKernel : public framework::OpKernel<T> {
} }
} }
framework::LoD out_lod; framework::LoD out_lod;
out_lod.push_back(out_lod0); for (size_t i = 0; i < lod.size() - 1; ++i) {
out_lod.push_back(lod[i]);
}
out_lod.push_back(out_last_lod);
out->set_lod(out_lod); out->set_lod(out_lod);
} }
}; };
......
...@@ -30,6 +30,7 @@ limitations under the License. */ ...@@ -30,6 +30,7 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h" #include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
...@@ -221,24 +222,19 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -221,24 +222,19 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
} }
case CUPTI_ACTIVITY_KIND_DRIVER: { case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record); auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0) { if (api->start != 0 && api->end != 0)
// -1 device id represents ActiveKind api call // -1 device id represents CUDA api call
tracer->AddActiveKindRecords( tracer->AddCPURecords(
DriverKind(api->cbid), api->start, api->end, -1, DriverKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId), GetThreadIdFromSystemThreadId(api->threadId));
api->correlationId);
}
break; break;
} }
case CUPTI_ACTIVITY_KIND_RUNTIME: { case CUPTI_ACTIVITY_KIND_RUNTIME: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record); auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0) { if (api->start != 0 && api->end != 0)
// -1 device id represents ActiveKind api call tracer->AddCPURecords(
tracer->AddActiveKindRecords(
RuntimeKind(api->cbid), api->start, api->end, -1, RuntimeKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId), GetThreadIdFromSystemThreadId(api->threadId));
api->correlationId);
}
break; break;
} }
default: { break; } default: { break; }
...@@ -317,25 +313,6 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -317,25 +313,6 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
return;
}
thread_local std::forward_list<ActiveKindRecord>
*local_active_kind_records = nullptr;
if (local_active_kind_records == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
active_kind_records_.emplace_front();
local_active_kind_records = &active_kind_records_.front();
}
// lock is not needed, only one thread call this function.
local_active_kind_records->push_front(ActiveKindRecord{
anno, start_ns, end_ns, device_id, thread_id, correlation_id});
}
void AddKernelRecords(std::string name, uint64_t start, uint64_t end, void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t device_id, int64_t stream_id, int64_t device_id, int64_t stream_id,
uint32_t correlation_id) { uint32_t correlation_id) {
...@@ -378,7 +355,6 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -378,7 +355,6 @@ class DeviceTracerImpl : public DeviceTracer {
} }
const std::vector<int> cbids { const std::vector<int> cbids {
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
...@@ -409,7 +385,6 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -409,7 +385,6 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_.clear(); correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear(); for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear(); for (auto &tmp : cpu_records_) tmp.clear();
for (auto &tmp : active_kind_records_) tmp.clear();
} }
void GenEventKernelCudaElapsedTime() { void GenEventKernelCudaElapsedTime() {
...@@ -462,7 +437,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -462,7 +437,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (auto &tmp : cpu_records_) { for (auto &tmp : cpu_records_)
for (const CPURecord &r : tmp) { for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU); event->set_type(proto::Event::CPU);
...@@ -472,24 +447,6 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -472,24 +447,6 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.thread_id); event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
}
for (auto &tmp : active_kind_records_) {
for (const ActiveKindRecord &r : tmp) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
} else {
event->set_name(r.name);
}
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
}
miss = find = 0; miss = find = 0;
for (const MemRecord &r : mem_records_) { for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
...@@ -553,7 +510,6 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -553,7 +510,6 @@ class DeviceTracerImpl : public DeviceTracer {
std::forward_list<KernelRecord> kernel_records_; std::forward_list<KernelRecord> kernel_records_;
std::forward_list<MemRecord> mem_records_; std::forward_list<MemRecord> mem_records_;
std::forward_list<std::forward_list<CPURecord>> cpu_records_; std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>> std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs; correlations_pairs;
std::unordered_map<uint32_t, Event *> correlations_; std::unordered_map<uint32_t, Event *> correlations_;
...@@ -657,7 +613,6 @@ void initCuptiCbidStr() { ...@@ -657,7 +613,6 @@ void initCuptiCbidStr() {
REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
......
...@@ -63,14 +63,7 @@ class DeviceTracer { ...@@ -63,14 +63,7 @@ class DeviceTracer {
uint32_t correlation_id; uint32_t correlation_id;
uint64_t bytes; uint64_t bytes;
}; };
struct ActiveKindRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
int64_t device_id;
int64_t thread_id;
uint32_t correlation_id;
};
virtual ~DeviceTracer() {} virtual ~DeviceTracer() {}
// Needs to be called once before use. // Needs to be called once before use.
virtual void Enable() = 0; virtual void Enable() = 0;
...@@ -92,10 +85,6 @@ class DeviceTracer { ...@@ -92,10 +85,6 @@ class DeviceTracer {
virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t thread_id) = 0; int64_t thread_id) = 0;
virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id,
uint32_t correlation_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
......
...@@ -132,7 +132,7 @@ def __bootstrap__(): ...@@ -132,7 +132,7 @@ def __bootstrap__():
'allocator_strategy', 'reader_queue_speed_test_mode', 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph', 'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size' 'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
......
...@@ -206,12 +206,12 @@ class CompiledProgram(object): ...@@ -206,12 +206,12 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize # FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass. # if turn on python memory optimize, turn off the inplace_pass.
if self._build_strategy.memory_optimize is None: # memory_optimize and enable_inplace default are True, but we can disable them on purpose
self._build_strategy.memory_optimize = False \ if self._program and self._program._is_mem_optimized:
if self._program and self._program._is_mem_optimized else True self._build_strategy.memory_optimize = False
if self._build_strategy.enable_inplace is None:
self._build_strategy.enable_inplace = False \ if self._program and self._program._is_mem_optimized:
if self._program and self._program._is_mem_optimized else True self._build_strategy.enable_inplace = False
# TODO(wuyi): trainer endpoings should be passed in through # TODO(wuyi): trainer endpoings should be passed in through
# build_strategy, not program.xxx. # build_strategy, not program.xxx.
......
...@@ -261,45 +261,42 @@ def _as_lodtensor(data, place): ...@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):
class Executor(object): class Executor(object):
""" """
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
ParallelExecutor. Python executor takes a program, adds feed operators and fetch operators to this program according
Python executor takes a program, add feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all the variables(or names) that user wants to get after program runs. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list. operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished. variables. The contents in local scope may be discarded after every minibatch forward/backward
But the global scope variables will be persistent through different runs. finished. But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
Example: Example:
.. code-block:: python
# First create the Executor. .. code-block:: python
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) # First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
# Run the startup program once and only once. exe = fluid.Executor(place)
# Not need to optimize/compile the startup program.
exe.run(fluid.default_startup_program()) # Run the startup program once and only once.
# Not need to optimize/compile the startup program.
# Run the main program directly without compile. exe.run(fluid.default_startup_program())
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict, # Run the main program directly without compile.
fetch_list=[loss.name]) loss, = exe.run(fluid.default_main_program(),
# Or, compiled the program and run. See `CompiledProgram` for more detail. feed=feed_dict,
compiled_prog = compiler.CompiledProgram( fetch_list=[loss.name])
fluid.default_main_program()).with_data_parallel( # Or, compiled the program and run. See `CompiledProgram` for more detail.
loss_name=loss.name) compiled_prog = compiler.CompiledProgram(
loss, = exe.run(compiled_prog, fluid.default_main_program()).with_data_parallel(
feed=feed_dict, loss_name=loss.name)
fetch_list=[loss.name]) loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
Args: Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
""" """
def __init__(self, place): def __init__(self, place):
...@@ -382,6 +379,12 @@ class Executor(object): ...@@ -382,6 +379,12 @@ class Executor(object):
] ]
return outs return outs
'''
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
'''
def close(self): def close(self):
""" """
Close this executor. Close this executor.
...@@ -389,9 +392,6 @@ class Executor(object): ...@@ -389,9 +392,6 @@ class Executor(object):
You can no longer use this executor after calling this method. You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to For the distributed training, this method would free the resource on PServers related to
the current Trainer. the current Trainer.
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example: Example:
>>> cpu = core.CPUPlace() >>> cpu = core.CPUPlace()
......
...@@ -15,36 +15,22 @@ ...@@ -15,36 +15,22 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride from paddle.fluid.tests.unittests.test_conv2d_transpose_op import conv2dtranspose_forward_naive, TestConv2dTransposeOp
class TestMKLDNN(TestConv2dTransposeOp): def conv2d_bias_naive(out, bias):
def init_op_type(self): _, out_c, _, _ = out.shape
self.is_test = True
self.use_mkldnn = True
self.data_format = "NCHW"
self.op_type = "conv2d_transpose"
self._cpu_only = True
def test_check_grad(self):
return
def test_check_grad_no_input(self): for l in range(out_c):
return out[:, l, :, :] = out[:, l, :, :] + bias[l]
return out
def test_check_grad_no_filter(self):
return
class TestMKLDNNWithPad(TestWithPad): class TestConv2dTransposeMKLDNNOp(TestConv2dTransposeOp):
def init_op_type(self):
self.is_test = True
self.use_mkldnn = True
self.data_format = "NCHW"
self.op_type = "conv2d_transpose"
self._cpu_only = True
def test_check_grad(self): def test_check_grad(self):
return return
...@@ -54,24 +40,64 @@ class TestMKLDNNWithPad(TestWithPad): ...@@ -54,24 +40,64 @@ class TestMKLDNNWithPad(TestWithPad):
def test_check_grad_no_filter(self): def test_check_grad_no_filter(self):
return return
class TestMKLDNNWithStride(TestWithStride):
def init_op_type(self): def init_op_type(self):
self.is_test = True
self.use_mkldnn = True
self.data_format = "NCHW" self.data_format = "NCHW"
self.op_type = "conv2d_transpose" self.op_type = "conv2d_transpose"
self._cpu_only = True self._cpu_only = True
def test_check_grad(self): def init_test_case(self):
return self.use_mkldnn = True
self.is_test = True
def test_check_grad_no_input(self): self.pad = [0, 0]
return self.fuse_bias = False
self.bias_size = None
def test_check_grad_no_filter(self): self.fuse_relu = False
return self.stride = [1, 1]
self.dilations = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
if __name__ == '__main__': f_c = self.input_size[1]
unittest.main() self.filter_size = [f_c, 6, 3, 3]
self.groups = 1
def setUp(self):
TestConv2dTransposeOp.setUp(self)
output = self.outputs['Output']
if self.fuse_bias and self.bias_size is not None:
bias = np.random.random(self.bias_size).astype(self.dtype)
output = conv2d_bias_naive(output, bias)
output = output.astype(self.dtype)
self.attrs['fuse_bias'] = self.fuse_bias
self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
if self.fuse_relu:
output = np.maximum(output, 0).astype(self.dtype)
self.attrs['fuse_bias'] = self.fuse_bias
self.attrs['fuse_relu'] = self.fuse_relu
self.outputs['Output'] = output
class TestMKLDNNFuseBias(TestConv2dTransposeMKLDNNOp):
def init_test_case(self):
TestConv2dTransposeMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.fuse_bias = True
self.bias_size = [6]
class TestMKLDNNWithPad(TestConv2dTransposeMKLDNNOp):
def init_test_case(self):
TestConv2dTransposeMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.input_size = [2, 3, 10, 10]
class TestMKLDNNWithStride(TestConv2dTransposeMKLDNNOp):
def init_test_case(self):
TestConv2dTransposeMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6] # NCHW
...@@ -115,6 +115,9 @@ class TestDistRunnerBase(object): ...@@ -115,6 +115,9 @@ class TestDistRunnerBase(object):
strategy.allow_op_delay = False strategy.allow_op_delay = False
build_stra = fluid.BuildStrategy() build_stra = fluid.BuildStrategy()
# FIXME force disable enable_inplace and memory_optimize
build_stra.enable_inplace = False
build_stra.memory_optimize = False
if args.use_reduce: if args.use_reduce:
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
......
...@@ -123,6 +123,9 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -123,6 +123,9 @@ class TestMNIST(TestParallelExecutorBase):
# NOTE(dzh): # NOTE(dzh):
# need to make it compatible with elewise fuse act # need to make it compatible with elewise fuse act
# FIXME (liuwei12)
# the new memory optimize strategy will crash this unittest
# add enable_inplace=False here to force pass the unittest
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={"image": img,
...@@ -131,6 +134,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -131,6 +134,7 @@ class TestMNIST(TestParallelExecutorBase):
fuse_elewise_add_act_ops=False, fuse_elewise_add_act_ops=False,
memory_opt=False, memory_opt=False,
use_ir_memory_optimize=False, use_ir_memory_optimize=False,
enable_inplace=False,
optimizer=_optimizer) optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
model, model,
...@@ -140,6 +144,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -140,6 +144,7 @@ class TestMNIST(TestParallelExecutorBase):
fuse_elewise_add_act_ops=True, fuse_elewise_add_act_ops=True,
memory_opt=False, memory_opt=False,
use_ir_memory_optimize=False, use_ir_memory_optimize=False,
enable_inplace=False,
optimizer=_optimizer) optimizer=_optimizer)
for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# nlp model stack of op operate on lod. It's a classical test case in optimize pass.
from __future__ import print_function
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import unittest
import paddle.fluid.core as core
from paddle.fluid import compiler, Program, program_guard
from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward
from paddle.fluid.optimizer import MomentumOptimizer
from ir_memory_optimize_net_base import TestIrMemOptBase
class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
def check_network_convergence(self, use_cuda=True, py_opt=False,
iter_num=5):
prog = Program()
startup_prog = Program()
prog.random_seed = 100
startup_prog.random_seed = 100
with program_guard(prog, startup_prog):
image = layers.data(name='x', shape=[784], dtype='float32')
label = layers.data(name='y', shape=[1], dtype='int64')
limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
cond = layers.less_than(x=label, y=limit)
ie = layers.IfElse(cond)
with ie.true_block():
true_image = ie.input(image)
hidden = layers.fc(input=true_image, size=100, act='tanh')
prob = layers.fc(input=hidden, size=10, act='softmax')
ie.output(prob)
with ie.false_block():
false_image = ie.input(image)
hidden = layers.fc(input=false_image, size=200, act='tanh')
prob = layers.fc(input=hidden, size=10, act='softmax')
ie.output(prob)
prob = ie()
loss = layers.cross_entropy(input=prob[0], label=label)
avg_loss = layers.mean(loss)
optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
optimizer.minimize(avg_loss, startup_prog)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=200)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = Executor(place)
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_cuda = use_cuda
if py_opt:
fluid.memory_optimize(fluid.default_main_program())
train_cp = compiler.CompiledProgram(fluid.default_main_program())
train_cp = train_cp.with_data_parallel(
loss_name=avg_loss.name, exec_strategy=exec_strategy)
fetch_list = [avg_loss.name]
exe.run(startup_prog)
PASS_NUM = 100
loop = 0
ret = []
for pass_id in range(PASS_NUM):
for data in train_reader():
x_data = np.array([x[0] for x in data]).astype("float32")
y_data = np.array([x[1] for x in data]).astype("int64")
y_data = y_data.reshape((y_data.shape[0], 1))
outs = exe.run(train_cp,
feed={'x': x_data,
'y': y_data},
fetch_list=[avg_loss])
loop += 1
ret.append(outs[0])
if iter_num == loop:
return ret
return ret
def test_ifelse(self):
ret1 = self.check_network_convergence(False, True)
print(ret1)
ret2 = self.check_network_convergence(False, False)
print(ret2)
self.assertTrue(np.allclose(ret1, ret2))
if fluid.core.is_compiled_with_cuda():
ret1 = self.check_network_convergence(True, True)
print(ret1)
ret2 = self.check_network_convergence(True, False)
print(ret2)
self.assertTrue(np.allclose(ret1, ret2))
#self.assertEqual(ret1, ret2)
if __name__ == "__main__":
unittest.main()
...@@ -59,8 +59,12 @@ class TestFetchAndFeed(unittest.TestCase): ...@@ -59,8 +59,12 @@ class TestFetchAndFeed(unittest.TestCase):
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup) exe.run(startup)
#FIXME force disable enable_inplace and memory_optimize to pass the unittest
build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
train_cp = compiler.CompiledProgram(main_program).with_data_parallel( train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
loss_name=loss.name) loss_name=loss.name, build_strategy=build_strategy)
run_parallel_exe(train_cp, exe, use_cuda, data, label, loss) run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)
......
...@@ -96,6 +96,9 @@ class TestPassBuilder(unittest.TestCase): ...@@ -96,6 +96,9 @@ class TestPassBuilder(unittest.TestCase):
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
self.assertFalse(build_strategy.fuse_elewise_add_act_ops) self.assertFalse(build_strategy.fuse_elewise_add_act_ops)
build_strategy.fuse_elewise_add_act_ops = True build_strategy.fuse_elewise_add_act_ops = True
#FIXME: currently fuse_elewise_add_act_ops not compatible with below options
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
pass_builder = build_strategy._finalize_strategy_and_create_passes() pass_builder = build_strategy._finalize_strategy_and_create_passes()
self.assertTrue("fuse_elewise_add_act_pass" in self.assertTrue("fuse_elewise_add_act_pass" in
[p.type() for p in pass_builder.all_passes()]) [p.type() for p in pass_builder.all_passes()])
......
...@@ -142,6 +142,10 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor): ...@@ -142,6 +142,10 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor):
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
#FIXME force use old memory optimzie strategy here to pass the unittest
#since open the new strategy will crash the unittest
fluid.memory_optimize(fluid.default_main_program())
train_cp = compiler.CompiledProgram(fluid.default_main_program()) train_cp = compiler.CompiledProgram(fluid.default_main_program())
if use_parallel_executor: if use_parallel_executor:
train_cp = train_cp.with_data_parallel(loss_name=loss.name) train_cp = train_cp.with_data_parallel(loss_name=loss.name)
......
...@@ -49,6 +49,21 @@ class TestSequenceEraseOpInt32(OpTest): ...@@ -49,6 +49,21 @@ class TestSequenceEraseOpInt32(OpTest):
self.check_output() self.check_output()
class TestSequenceEraseOpInt32LoD2(OpTest):
def setUp(self):
self.op_type = "sequence_erase"
in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
lod = [[1, 3], [9, 4, 11, 6]]
tokens = [2, 3, 5]
out_seq, new_lod0 = sequence_erase(in_seq, lod[-1], tokens)
self.attrs = {'tokens': tokens}
self.inputs = {'X': (in_seq, lod)}
self.outputs = {'Out': (out_seq, lod[:-1] + [new_lod0])}
def test_check_output(self):
self.check_output()
class TestSequenceEraseOpInt64(OpTest): class TestSequenceEraseOpInt64(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sequence_erase" self.op_type = "sequence_erase"
......
...@@ -131,7 +131,7 @@ class Timeline(object): ...@@ -131,7 +131,7 @@ class Timeline(object):
if (k, event.device_id, "CPU") not in self._devices: if (k, event.device_id, "CPU") not in self._devices:
pid = self._allocate_pid() pid = self._allocate_pid()
self._devices[(k, event.device_id, "CPU")] = pid self._devices[(k, event.device_id, "CPU")] = pid
# -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy) # -1 device id represents CUDA api call
if event.device_id == -1: if event.device_id == -1:
self._chrome_trace.emit_pid("%s:cuda_api" % k, pid) self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
else: else:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册