提交 a135fec1 编写于 作者: T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into gen_nccl_id_op

...@@ -80,6 +80,8 @@ parser.add_argument( ...@@ -80,6 +80,8 @@ parser.add_argument(
type=str, type=str,
default="", default="",
help="Comma-separated list of hostname:port pairs") help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--profile", action='store_true', help="If set, profile a few steps.")
# Flags for defining the tf.train.Server # Flags for defining the tf.train.Server
parser.add_argument( parser.add_argument(
...@@ -183,8 +185,8 @@ def main(): ...@@ -183,8 +185,8 @@ def main():
start_time = time.time() start_time = time.time()
num_samples = 0 num_samples = 0
train_pass_acc.reset() train_pass_acc.reset()
for batch_id, data in enumerate(train_reader()):
ts = time.time() def run_step(batch_id, data):
img_data = np.array( img_data = np.array(
map(lambda x: x[0].reshape(data_shape), data)).astype( map(lambda x: x[0].reshape(data_shape), data)).astype(
"float32") "float32")
...@@ -196,14 +198,28 @@ def main(): ...@@ -196,14 +198,28 @@ def main():
feed={"pixel": img_data, feed={"pixel": img_data,
"label": y_data}, "label": y_data},
fetch_list=[avg_cost, batch_acc, batch_size]) fetch_list=[avg_cost, batch_acc, batch_size])
return loss, acc, b_size
if args.profile and args.task_index == 0:
# warmup.
for batch_id, data in enumerate(train_reader()):
if batch_id > 5: break
run_step(batch_id, data)
with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
for batch_id, data in enumerate(train_reader()):
if batch_id > 5: break
run_step(batch_id, data)
for batch_id, data in enumerate(train_reader()):
ts = time.time()
loss, acc, b_size = run_step(batch_id, data)
iters += 1 iters += 1
num_samples += len(data) num_samples += len(data)
train_pass_acc.add(value=acc, weight=b_size) train_pass_acc.add(value=acc, weight=b_size)
print( print(
"Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, " "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
"Speed = %.2f img/s " % (args.task_index, pass_id, iters, "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
loss, acc, len(data) / (time.time() - ts))
len(data) / (time.time() - ts))
) # The accuracy is the accumulation of batches, but not the current batch. ) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time pass_elapsed = time.time() - start_time
......
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda) nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc) set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc)
add_subdirectory(convert) add_subdirectory(convert)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/io_converter.h"
#include <cuda.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
using platform::is_gpu_place;
using platform::is_cpu_place;
class DefaultInputConverter : public EngineInputConverter {
public:
DefaultInputConverter() {}
// NOTE out is GPU memory.
virtual void operator()(const LoDTensor& in, void* out,
size_t max_size) override {
PADDLE_ENFORCE(out != nullptr);
PADDLE_ENFORCE_LE(in.memory_size(), max_size);
const auto& place = in.place();
if (is_cpu_place(place)) {
PADDLE_ENFORCE(stream_ != nullptr);
PADDLE_ENFORCE_EQ(0,
cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
cudaMemcpyHostToDevice, *stream_));
} else if (is_gpu_place(place)) {
PADDLE_ENFORCE_EQ(0,
cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
cudaMemcpyHostToHost, *stream_));
} else {
PADDLE_THROW("Unknown device for converter");
}
cudaStreamSynchronize(*stream_);
}
};
REGISTER_TENSORRT_INPUT_CONVERTER(mul, DefaultInputConverter);
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <unordered_map>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
namespace inference {
namespace tensorrt {
using framework::LoDTensor;
/*
* Convert Input from Fluid to an Engine.
* TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
* most cases just need to copy the data.
*/
class EngineInputConverter {
public:
EngineInputConverter() {}
virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
void SetStream(cudaStream_t* stream) { stream_ = stream; }
static void Run(const std::string& in_op_type, const LoDTensor& in, void* out,
size_t max_size, cudaStream_t* stream) {
PADDLE_ENFORCE(stream != nullptr);
auto* converter = Registry<EngineInputConverter>::Lookup(in_op_type);
PADDLE_ENFORCE_NOT_NULL(converter);
converter->SetStream(stream);
(*converter)(in, out, max_size);
}
virtual ~EngineInputConverter() {}
protected:
cudaStream_t* stream_{nullptr};
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__) \
struct trt_input_##in_op_type__##_converter { \
trt_input_##in_op_type__##_converter() { \
::paddle::inference::Registry<EngineInputConverter>::Register< \
Converter__>(#in_op_type__); \
} \
}; \
trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/tensorrt/io_converter.h"
#include <gtest/gtest.h>
namespace paddle {
namespace inference {
namespace tensorrt {
class EngineInputConverterTester : public ::testing::Test {
public:
void SetUp() override { tensor.Resize({10, 10}); }
framework::LoDTensor tensor;
};
TEST_F(EngineInputConverterTester, DefaultCPU) {
void* buffer;
tensor.mutable_data<float>(platform::CPUPlace());
ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
cudaStream_t stream;
EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
&stream);
}
TEST_F(EngineInputConverterTester, DefaultGPU) {
void* buffer;
tensor.mutable_data<float>(platform::CUDAPlace());
ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
cudaStream_t stream;
EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
&stream);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -24,6 +24,11 @@ function(inference_test TARGET_NAME) ...@@ -24,6 +24,11 @@ function(inference_test TARGET_NAME)
endforeach() endforeach()
endfunction(inference_test) endfunction(inference_test)
####################
# Inference tests here depend on fluid/tests/book. If users want to run
# individual test with ctest, they need to run tests in fluid/tests/book
# first to generate saved model.
####################
# This unittest is buggy! # This unittest is buggy!
#inference_test(fit_a_line) #inference_test(fit_a_line)
inference_test(image_classification ARGS vgg resnet) inference_test(image_classification ARGS vgg resnet)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <unordered_map>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
// NOTE not thread-safe.
template <typename T>
struct Singleton {
static T& Global() {
static T* x = new T;
return *x;
}
Singleton() = delete;
Singleton& operator=(const Singleton&) = delete;
};
/*
* An registor for any type.
* NOTE not thread-safe.
*/
template <typename ItemParent>
struct Registry {
static Registry& Global() {
static auto* x = new Registry<ItemParent>;
return *x;
}
template <typename ItemChild>
static void Register(const std::string& name) {
PADDLE_ENFORCE_EQ(items_.count(name), 0);
items_[name] = new ItemChild;
}
static ItemParent* Lookup(const std::string& name) {
auto it = items_.find(name);
if (it == items_.end()) return nullptr;
return it->second;
}
~Registry() {
for (auto& item : items_) {
delete item.second;
}
}
private:
Registry() = default;
static std::unordered_map<std::string, ItemParent*> items_;
};
template <typename ItemParent>
std::unordered_map<std::string, ItemParent*> Registry<ItemParent>::items_;
} // namespace inference
} // namespace paddle
...@@ -70,6 +70,10 @@ message VariableMessage { ...@@ -70,6 +70,10 @@ message VariableMessage {
bytes rows = 9; bytes rows = 9;
// Look up table block execution output variable name. // Look up table block execution output variable name.
string out_varname = 10; string out_varname = 10;
// If true, the ps server will start profiling, the ps
// server stops profiling and generates a profile to /tmp/profile_ps_*
// when profile switches from true to false.
bool profile = 11;
} }
message VoidMessage {} message VoidMessage {}
...@@ -26,6 +26,7 @@ limitations under the License. */ ...@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h" #include "paddle/fluid/operators/detail/proto_encoder_helper.h"
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/detail/variable_response.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -48,6 +49,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -48,6 +49,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
void* payload = nullptr; void* payload = nullptr;
size_t payload_size = 0; size_t payload_size = 0;
ProtoEncodeHelper e(static_cast<char*>(buf), 1024); ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
// Note: normally the profiler is enabled in 1 trainer, hence only
// 1 trainer returns true for ShouldSendProfileState(). It tells PS
// servers the trainer's profiling state so that PS can follow the
// trainer.
if (platform::ShouldSendProfileState()) {
e.WriteBool(VarMsg::kProfileFieldNumber, platform::IsProfileEnabled());
}
e.WriteString(VarMsg::kVarnameFieldNumber, name); e.WriteString(VarMsg::kVarnameFieldNumber, name);
if (var->IsType<framework::LoDTensor>()) { if (var->IsType<framework::LoDTensor>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 0); e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <nccl.h> #include <nccl.h>
#endif #endif
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
...@@ -446,7 +447,26 @@ int VariableResponse::Parse(Source* source) { ...@@ -446,7 +447,26 @@ int VariableResponse::Parse(Source* source) {
meta_.set_out_varname(temp); meta_.set_out_varname(temp);
break; break;
} }
case sendrecv::VariableMessage::kProfileFieldNumber: {
bool profiling;
if (!input.ReadRaw(reinterpret_cast<void*>(&profiling), 1)) {
return tag;
}
meta_.set_profile(profiling);
int64_t listener_id = platform::ListenerId();
if (listener_id <= 0) {
break;
}
if (profiling && !platform::IsProfileEnabled()) {
platform::EnableProfiler(platform::ProfilerState::kCPU);
} else if (!profiling && platform::IsProfileEnabled()) {
// TODO(panyx0718): Should we allow to customize file dir.
platform::DisableProfiler(
platform::EventSortingKey::kDefault,
string::Sprintf("/tmp/profile_ps_%lld", listener_id));
}
break;
}
default: { default: {
// Unknown tag, return unknown error. // Unknown tag, return unknown error.
return -1; return -1;
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -294,6 +295,8 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -294,6 +295,8 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
void ListenAndServOp::RunImpl(const framework::Scope &scope, void ListenAndServOp::RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const { const platform::Place &dev_place) const {
// Mark this as PS that it should decide profiling by listening from trainer.
platform::SetProfileListener();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place); auto &dev_ctx = *pool.Get(dev_place);
framework::Scope &recv_scope = scope.NewScope(); framework::Scope &recv_scope = scope.NewScope();
......
...@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and ...@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits>
#include <map> #include <map>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <random>
#include <string> #include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
...@@ -33,6 +36,9 @@ namespace platform { ...@@ -33,6 +36,9 @@ namespace platform {
struct EventList; struct EventList;
static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false;
// The profiler state, the initial value is ProfilerState::kDisabled // The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled; static ProfilerState g_state = ProfilerState::kDisabled;
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
...@@ -219,13 +225,12 @@ void EnableProfiler(ProfilerState state) { ...@@ -219,13 +225,12 @@ void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ", "Can't enbale profling, since the input state is ",
"ProfilerState::kDisabled"); "ProfilerState::kDisabled");
PADDLE_ENFORCE(g_state == ProfilerState::kDisabled, if (state == g_state) {
"The profiling state should be disabled when calling ", return;
"EnableProfiler.");
g_state = state;
if (g_state == ProfilerState::kAll) {
GetDeviceTracer()->Enable();
} }
g_state = state;
should_send_profile_state = true;
GetDeviceTracer()->Enable();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kCUDA) {
// Generate some dummy events first to reduce the startup overhead. // Generate some dummy events first to reduce the startup overhead.
...@@ -435,8 +440,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -435,8 +440,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) { const std::string& profile_path) {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, if (g_state == ProfilerState::kDisabled) return;
"Can't disable profiling, since it's not starting.");
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_", nullptr); Mark("_stop_profiler_", nullptr);
...@@ -444,12 +448,25 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -444,12 +448,25 @@ void DisableProfiler(EventSortingKey sorted_key,
ParseEvents(all_events, sorted_key); ParseEvents(all_events, sorted_key);
ResetProfiler(); ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) { if (tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
} }
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
should_send_profile_state = true;
}
bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
bool ShouldSendProfileState() { return should_send_profile_state; }
void SetProfileListener() {
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<int64_t>::max());
profiler_lister_id = dist6(rng);
} }
int64_t ListenerId() { return profiler_lister_id; }
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -114,5 +114,13 @@ void ResetProfiler(); ...@@ -114,5 +114,13 @@ void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path); const std::string& profile_path);
// Test if the profiler is currently enabled.
bool IsProfileEnabled();
// Whether the trainer should send profiling state to PS.
bool ShouldSendProfileState();
// Mark current process as PS by assigning a lister id.
void SetProfileListener();
int64_t ListenerId();
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -60,6 +60,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\ ...@@ -60,6 +60,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
'io', 'io',
'initializer', 'initializer',
'layers', 'layers',
'transpiler'
'nets', 'nets',
'optimizer', 'optimizer',
'learning_rate_decay', 'learning_rate_decay',
......
...@@ -1042,13 +1042,14 @@ class Program(object): ...@@ -1042,13 +1042,14 @@ class Program(object):
Returns(Program): Returns(Program):
The cloned Program object. The cloned Program object.
""" """
p = Program()
if for_test: if for_test:
p.desc = core.inference_optimize(self.desc) p = self.inference_optimize()
else: else:
p = Program()
p.desc = core.ProgramDesc(self.desc) p.desc = core.ProgramDesc(self.desc)
p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())] p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
p.sync_with_cpp() p.sync_with_cpp()
p.copy_param_info_from(self) p.copy_param_info_from(self)
return p return p
...@@ -1061,7 +1062,7 @@ class Program(object): ...@@ -1061,7 +1062,7 @@ class Program(object):
if isinstance(t, Variable): if isinstance(t, Variable):
# After transpiler processing, the op that output this # After transpiler processing, the op that output this
# variable maybe has been changed, so t.op is not reliable # variable maybe has been changed, so t.op is not reliable
# and we need to find the current op that generate this # and we need to find the current op that generate this
# variable here. # variable here.
t.op = None t.op = None
global_block = self.global_block() global_block = self.global_block()
...@@ -1087,8 +1088,16 @@ class Program(object): ...@@ -1087,8 +1088,16 @@ class Program(object):
return res return res
def inference_optimize(self): def inference_optimize(self):
# this is an alternative implement before
# core.inference_optimize being fixed.
res = Program() res = Program()
res.desc = core.inference_optimize(self.desc) res.desc = core.ProgramDesc(self.desc)
for i in xrange(res.desc.num_blocks()):
block = res.desc.block(i)
for j in xrange(block.op_size()):
op = block.op(j)
if op.has_attr('is_test'):
op.set_attr('is_test', True)
res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())] res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
res.sync_with_cpp() res.sync_with_cpp()
return res return res
......
...@@ -251,7 +251,7 @@ class EditDistance(MetricBase): ...@@ -251,7 +251,7 @@ class EditDistance(MetricBase):
self.instance_error += seq_num - seq_right_count self.instance_error += seq_num - seq_right_count
self.total_distance += total_distance self.total_distance += total_distance
def eval(): def eval(self):
if self.seq_num == 0: if self.seq_num == 0:
raise ValueError( raise ValueError(
"There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance." "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
...@@ -280,6 +280,7 @@ class DetectionMAP(MetricBase): ...@@ -280,6 +280,7 @@ class DetectionMAP(MetricBase):
super(DetectionMAP, self).__init__(name) super(DetectionMAP, self).__init__(name)
# the current map value # the current map value
self.value = .0 self.value = .0
self.weight = .0
def update(self, value, weight): def update(self, value, weight):
if not _is_number_or_matrix_(value): if not _is_number_or_matrix_(value):
...@@ -340,8 +341,8 @@ class Auc(MetricBase): ...@@ -340,8 +341,8 @@ class Auc(MetricBase):
raise ValueError("The 'predictions' must be a numpy ndarray.") raise ValueError("The 'predictions' must be a numpy ndarray.")
kepsilon = 1e-7 # to account for floating point imprecisions kepsilon = 1e-7 # to account for floating point imprecisions
thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1)
for i in range(num_thresholds - 2)] for i in range(self._num_thresholds - 2)]
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
# caculate TP, FN, TN, FP count # caculate TP, FN, TN, FP count
...@@ -358,19 +359,20 @@ class Auc(MetricBase): ...@@ -358,19 +359,20 @@ class Auc(MetricBase):
fp += 1 fp += 1
else: else:
tn += 1 tn += 1
tp_list[idx_thresh] += tp self.tp_list[idx_thresh] += tp
fn_list[idx_thresh] += fn self.fn_list[idx_thresh] += fn
tn_list[idx_thresh] += tn self.tn_list[idx_thresh] += tn
fp_list[idx_thresh] += fp self.fp_list[idx_thresh] += fp
def eval(self): def eval(self):
epsilon = self._epsilon epsilon = self._epsilon
num_thresholds = self._num_thresholds num_thresholds = self._num_thresholds
tpr = (tp_list.astype("float32") + epsilon) / ( tpr = (self.tp_list.astype("float32") + epsilon) / (
tp_list + fn_list + epsilon) self.tp_list + self.fn_list + epsilon)
fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon) fpr = self.fp_list.astype("float32") / (
rec = (tp_list.astype("float32") + epsilon) / ( self.fp_list + self.tn_list + epsilon)
tp_list + fp_list + epsilon) rec = (self.tp_list.astype("float32") + epsilon) / (
self.tp_list + self.fp_list + epsilon)
x = fpr[:num_thresholds - 1] - fpr[1:] x = fpr[:num_thresholds - 1] - fpr[1:]
y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
......
...@@ -19,10 +19,11 @@ import executor ...@@ -19,10 +19,11 @@ import executor
import data_feeder import data_feeder
import contextlib import contextlib
import io import io
import transpiler
# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
import optimizer as opt_module import optimizer as opt_module
import distribute_transpiler from transpiler import distribute_transpiler
__all__ = [ __all__ = [
'Trainer', 'Trainer',
......
...@@ -68,7 +68,8 @@ packages=['paddle', ...@@ -68,7 +68,8 @@ packages=['paddle',
'paddle.fluid', 'paddle.fluid',
'paddle.fluid.proto', 'paddle.fluid.proto',
'paddle.fluid.proto.profiler', 'paddle.fluid.proto.profiler',
'paddle.fluid.layers'] 'paddle.fluid.layers',
'paddle.fluid.transpiler']
if '${WITH_FLUID_ONLY}'== 'OFF': if '${WITH_FLUID_ONLY}'== 'OFF':
packages+=['paddle.proto', packages+=['paddle.proto',
......
...@@ -22,7 +22,11 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2 ...@@ -22,7 +22,11 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( parser.add_argument(
'--profile_path', type=str, default='', help='Input profile file name.') '--profile_path',
type=str,
default='',
help='Input profile file name. If there are multiple file, the format '
'should be trainer1=file1,trainer2=file2,ps=file3')
parser.add_argument( parser.add_argument(
'--timeline_path', type=str, default='', help='Output timeline file name.') '--timeline_path', type=str, default='', help='Output timeline file name.')
args = parser.parse_args() args = parser.parse_args()
...@@ -108,8 +112,8 @@ class _ChromeTraceFormatter(object): ...@@ -108,8 +112,8 @@ class _ChromeTraceFormatter(object):
class Timeline(object): class Timeline(object):
def __init__(self, profile_pb): def __init__(self, profile_dict):
self._profile_pb = profile_pb self._profile_dict = profile_dict
self._pid = 0 self._pid = 0
self._devices = dict() self._devices = dict()
self._chrome_trace = _ChromeTraceFormatter() self._chrome_trace = _ChromeTraceFormatter()
...@@ -120,35 +124,37 @@ class Timeline(object): ...@@ -120,35 +124,37 @@ class Timeline(object):
return cur_pid return cur_pid
def _allocate_pids(self): def _allocate_pids(self):
for event in self._profile_pb.events: for k, profile_pb in self._profile_dict.iteritems():
if event.type == profiler_pb2.Event.CPU: for event in profile_pb.events:
if (event.device_id, "CPU") not in self._devices: if event.type == profiler_pb2.Event.CPU:
pid = self._allocate_pid() if (k, event.device_id, "CPU") not in self._devices:
self._devices[(event.device_id, "CPU")] = pid pid = self._allocate_pid()
self._chrome_trace.emit_pid("cpu:block:%d" % self._devices[(k, event.device_id, "CPU")] = pid
(event.device_id), pid) self._chrome_trace.emit_pid("%s:cpu:block:%d" %
elif event.type == profiler_pb2.Event.GPUKernel: (k, event.device_id), pid)
if (event.device_id, "GPUKernel") not in self._devices: elif event.type == profiler_pb2.Event.GPUKernel:
pid = self._allocate_pid() if (k, event.device_id, "GPUKernel") not in self._devices:
self._devices[(event.device_id, "GPUKernel")] = pid pid = self._allocate_pid()
self._chrome_trace.emit_pid("gpu:%d" % (event.device_id), self._devices[(k, event.device_id, "GPUKernel")] = pid
pid) self._chrome_trace.emit_pid("%s:gpu:%d" %
(k, event.device_id), pid)
def _allocate_events(self): def _allocate_events(self):
for event in self._profile_pb.events: for k, profile_pb in self._profile_dict.iteritems():
if event.type == profiler_pb2.Event.CPU: for event in profile_pb.events:
type = "CPU" if event.type == profiler_pb2.Event.CPU:
elif event.type == profiler_pb2.Event.GPUKernel: type = "CPU"
type = "GPUKernel" elif event.type == profiler_pb2.Event.GPUKernel:
pid = self._devices[(event.device_id, type)] type = "GPUKernel"
args = {'name': event.name} pid = self._devices[(k, event.device_id, type)]
if event.memcopy.bytes > 0: args = {'name': event.name}
args = {'mem_bytes': event.memcopy.bytes} if event.memcopy.bytes > 0:
# TODO(panyx0718): Chrome tracing only handles ms. However, some args = {'mem_bytes': event.memcopy.bytes}
# ops takes micro-seconds. Hence, we keep the ns here. # TODO(panyx0718): Chrome tracing only handles ms. However, some
self._chrome_trace.emit_region( # ops takes micro-seconds. Hence, we keep the ns here.
event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid, self._chrome_trace.emit_region(
event.sub_device_id, 'Op', event.name, args) event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
event.sub_device_id, 'Op', event.name, args)
def generate_chrome_trace(self): def generate_chrome_trace(self):
self._allocate_pids() self._allocate_pids()
...@@ -163,11 +169,23 @@ timeline_path = '/tmp/timeline' ...@@ -163,11 +169,23 @@ timeline_path = '/tmp/timeline'
if args.timeline_path: if args.timeline_path:
timeline_path = args.timeline_path timeline_path = args.timeline_path
with open(profile_path, 'r') as f: profile_paths = profile_path.split(',')
profile_s = f.read() profile_dict = dict()
profile_pb = profiler_pb2.Profile() if len(profile_path) == 1:
profile_pb.ParseFromString(profile_s) with open(profile_path, 'r') as f:
profile_s = f.read()
tl = Timeline(profile_pb) profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s)
profile_dict['trainer'] = profile_pb
else:
for profile_path in profile_paths:
k, v = profile_path.split('=')
with open(v, 'r') as f:
profile_s = f.read()
profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s)
profile_dict[k] = profile_pb
tl = Timeline(profile_dict)
with open(timeline_path, 'w') as f: with open(timeline_path, 'w') as f:
f.write(tl.generate_chrome_trace()) f.write(tl.generate_chrome_trace())
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册