提交 234a1d92 编写于 作者: D dzhwinter

Merge remote-tracking branch 'origin/develop' into windows/debug

test=develop
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <cstddef> // for size_t
namespace paddle {
namespace framework {
......@@ -26,6 +27,7 @@ struct ExecutionStrategy {
bool allow_op_delay_{false};
size_t num_iteration_per_drop_scope_{100};
ExecutorType type_{kDefault};
bool dry_run_{false};
};
} // namespace details
......
......@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
size_t complete = 0;
while (op_to_run != nullptr) {
try {
op_to_run->Run(strategy_.use_cuda_);
if (LIKELY(!strategy_.dry_run_)) {
op_to_run->Run(strategy_.use_cuda_);
}
++complete;
} catch (...) {
exception_.Catch(std::current_exception());
......
......@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
if (VLOG_IS_ON(10)) {
VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
}
op->Run(strategy_.use_cuda_);
if (LIKELY(!strategy_.dry_run_)) {
op->Run(strategy_.use_cuda_);
}
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->Outputs());
......
......@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
// Use topological sort algorithm
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
~ThreadedSSAGraphExecutor() {}
~ThreadedSSAGraphExecutor() final = default;
private:
void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
......
......@@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
: places_(places) {}
~ParallelExecutorPrivate() {
if (own_local_scope_) {
for (size_t i = 1; i < local_scopes_.size(); ++i) {
// Skip the first scope, since it is the global scope.
Scope *local_scope = local_scopes_[i];
if (global_scope_->HasKid(local_scope)) {
global_scope_->DeleteScope(local_scope);
}
}
}
}
std::vector<platform::Place> places_;
std::vector<Scope *> local_scopes_;
Scope *global_scope_;
Scope *global_scope_; // not owned
std::unique_ptr<details::SSAGraphExecutor> executor_;
#ifdef PADDLE_WITH_CUDA
......@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
if (member_->own_local_scope_) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
Scope *local_scope = member_->local_scopes_[i];
if (member_->global_scope_->HasKid(local_scope)) {
member_->global_scope_->DeleteScope(local_scope);
}
}
}
// member_ must be destructed before gcs_ since the destructor of
// ReferenceCountOpHandle use raw pointers of gcs_ inside.
member_.reset();
......
......@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
ThreadPool::~ThreadPool() {
{
// notify all threads to stop running
std::lock_guard<std::mutex> l(mutex_);
std::unique_lock<std::mutex> l(mutex_);
running_ = false;
scheduled_.notify_all();
}
scheduled_.notify_all();
for (auto& t : threads_) {
t->join();
......@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {
void ThreadPool::TaskLoop() {
while (true) {
std::unique_lock<std::mutex> lock(mutex_);
Task task;
scheduled_.wait(
lock, [this] { return !this->tasks_.empty() || !this->running_; });
{
std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(
lock, [this] { return !this->tasks_.empty() || !this->running_; });
if (!running_ || tasks_.empty()) {
return;
}
if (!running_ && tasks_.empty()) {
return;
}
if (tasks_.empty()) {
PADDLE_THROW("This thread has no task to Run");
}
// pop a task from the task queue
auto task = std::move(tasks_.front());
tasks_.pop();
lock.unlock();
// pop a task from the task queue
task = std::move(tasks_.front());
tasks_.pop();
}
// run the task
task();
......
......@@ -58,7 +58,7 @@ class ThreadPool {
~ThreadPool();
// Run pushes a function to the task queue and returns a std::future
// object. To wait for the completion of the task, call
// object. To wait for the completion of the task, call
// std::future::wait().
template <typename Callback>
std::future<void> Run(Callback fn) {
......@@ -69,7 +69,6 @@ class ThreadPool {
template <typename Callback>
std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
Callback fn) {
std::unique_lock<std::mutex> lock(mutex_);
Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
try {
fn();
......@@ -84,7 +83,13 @@ class ThreadPool {
return nullptr;
});
std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
tasks_.push(std::move(task));
{
std::unique_lock<std::mutex> lock(mutex_);
if (!running_) {
PADDLE_THROW("enqueue on stopped ThreadPool");
}
tasks_.push(std::move(task));
}
scheduled_.notify_one();
return f;
}
......
if(WITH_TESTING)
include(test.cmake) # some generic cmake funtion for inference
include(tests/test.cmake) # some generic cmake funtion for inference
endif()
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
......
......@@ -18,6 +18,21 @@ namespace paddle {
namespace inference {
namespace tensorrt {
bool to_skip_merging_optimize(TensorRTEngine* engine_,
const std::vector<int>& filters,
const std::vector<int>& strides,
const std::vector<int>& paddings,
std::string input_name) {
if (engine_->itensor_quote_num[input_name] > 0) {
return true;
}
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
engine_->itensor_quote_num[input_name] += 1;
return false;
}
class Conv2dOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
......@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter {
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("Input").front());
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
......@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter {
std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
if (test_mode ||
to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
paddings, op_desc.Input("Input").front())) {
engine_->DeclareOutput(output_name);
}
}
......
......@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
buffer_sizes_[name] = 0;
}
bool TensorRTEngine::HasDeclared(const std::string &name) {
return buffer_sizes_.count(name) > 0;
}
void TensorRTEngine::DeclareOutput(const std::string &name) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
name);
......
......@@ -91,6 +91,8 @@ class TensorRTEngine : public EngineBase {
const std::string& name);
// Set the itensor_map_[name] as the network's output, and set its name.
void DeclareOutput(const std::string& name);
// Check if the ITensor has been declared
bool HasDeclared(const std::string& name);
// GPU memory address for an ITensor with specific name. One can operate on
// these memory directly for acceleration, for example, output the converted
......@@ -132,6 +134,16 @@ class TensorRTEngine : public EngineBase {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map;
// TODO: (NHZLX)
// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv
// into
// one conv, and then trigger bug. So, We should use strategy to avoid this
// optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
itensor_quote_num;
private:
// the max batch size
int max_batch_;
......
set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
function(download_model install_dir model_name)
if (NOT EXISTS ${install_dir})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
endif()
endfunction()
function(download_model_and_data install_dir model_name data_name)
if (NOT EXISTS ${install_dir})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
......@@ -13,6 +19,13 @@ function(inference_analysis_api_test target install_dir filename)
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction()
function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
download_model(${install_dir} ${model_name})
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${install_dir}/model)
endfunction()
# RNN1
if(NOT APPLE)
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
......@@ -61,17 +74,13 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
# ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR})
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif()
inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
# resnet50
set(RESNET50_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
if (NOT EXISTS ${RESNET50_INSTALL_DIR})
inference_download_and_uncompress(${RESNET50_INSTALL_DIR} ${INFERENCE_URL} "resnet50_model.tar.gz")
endif()
inference_analysis_test(test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_INSTALL_DIR}/model)
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
# anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
......@@ -30,25 +30,7 @@ void SetConfig(AnalysisConfig *cfg) {
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
PaddleTensor input;
// channel=3, height/width=318
std::vector<int> shape({FLAGS_batch_size, 3, 318, 318});
input.shape = shape;
input.dtype = PaddleDType::FLOAT32;
// fill input data, for profile easily, do not use random data here.
size_t size = FLAGS_batch_size * 3 * 318 * 318;
input.data.Resize(size * sizeof(float));
float *input_data = static_cast<float *>(input.data.data());
for (size_t i = 0; i < size; i++) {
*(input_data + i) = static_cast<float>(i) / size;
}
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
SetFakeImageInput(inputs, FLAGS_infer_model);
}
// Easy for profiling independently.
......@@ -61,13 +43,6 @@ void profile(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
size_t size = GetSize(outputs[0]);
// output is a 512-dimension feature
EXPECT_EQ(size, 512 * FLAGS_batch_size);
}
}
TEST(Analyzer_resnet50, profile) { profile(); }
......@@ -83,8 +58,7 @@ TEST(Analyzer_resnet50, fuse_statis) {
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
LOG(INFO) << "num_ops: " << num_ops;
}
// Compare result of NativeConfig and AnalysisConfig
......
......@@ -25,6 +25,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
......@@ -105,6 +106,34 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
return fuse_statis;
}
void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &dirname) {
// Set fake_image_data
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
std::vector<std::vector<int64_t>> feed_target_shapes =
GetFeedTargetShapes(dirname, true, "model", "params");
int dim1 = feed_target_shapes[0][1];
int dim2 = feed_target_shapes[0][2];
int dim3 = feed_target_shapes[0][3];
PaddleTensor input;
std::vector<int> shape({FLAGS_batch_size, dim1, dim2, dim3});
input.shape = shape;
input.dtype = PaddleDType::FLOAT32;
// fill input data, for profile easily, do not use random data here.
size_t size = FLAGS_batch_size * dim1 * dim2 * dim3;
input.data.Resize(size * sizeof(float));
float *input_data = static_cast<float *>(input.data.data());
for (size_t i = 0; i < size; i++) {
*(input_data + i) = static_cast<float>(i) / size;
}
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
}
void TestOneThreadPrediction(
const AnalysisConfig &config,
const std::vector<std::vector<PaddleTensor>> &inputs,
......
......@@ -93,11 +93,16 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
}
}
TEST(trt_models_test, main) {
std::vector<std::string> infer_models = {"mobilenet", "resnet50",
"resnext50"};
for (auto &model_dir : infer_models) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
}
TEST(trt_models_test, mobilenet) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/mobilenet");
}
TEST(trt_models_test, resnet50) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnet50");
}
TEST(trt_models_test, resnext50) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnext50");
}
} // namespace paddle
......@@ -18,7 +18,6 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -94,15 +93,15 @@ void CheckError(const paddle::framework::LoDTensor& output1,
std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
paddle::framework::Executor* executor, paddle::framework::Scope* scope,
const std::string& dirname, const bool is_combined = false) {
const std::string& dirname, const bool is_combined = false,
const std::string& prog_filename = "__model_combined__",
const std::string& param_filename = "__params_combined__") {
std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
if (is_combined) {
// All parameters are saved in a single file.
// Hard-coding the file names of program and parameters in unittest.
// The file names should be consistent with that used in Python API
// `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program =
paddle::inference::Load(executor, scope, dirname + "/" + prog_filename,
dirname + "/" + param_filename);
......@@ -115,12 +114,15 @@ std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
}
std::vector<std::vector<int64_t>> GetFeedTargetShapes(
const std::string& dirname, const bool is_combined = false) {
const std::string& dirname, const bool is_combined = false,
const std::string& prog_filename = "__model_combined__",
const std::string& param_filename = "__params_combined__") {
auto place = paddle::platform::CPUPlace();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();
auto inference_program = InitProgram(&executor, scope, dirname, is_combined);
auto inference_program = InitProgram(&executor, scope, dirname, is_combined,
prog_filename, param_filename);
auto& global_block = inference_program->Block(0);
const std::vector<std::string>& feed_target_names =
......@@ -136,15 +138,6 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
return feed_target_shapes;
}
void Compile(paddle::framework::ProgramDesc* program) {
std::unique_ptr<paddle::framework::ir::Graph> g(
new paddle::framework::ir::Graph(*program));
auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
"graph_to_program_pass");
pass->SetNotOwned<paddle::framework::ProgramDesc>("program", program);
pass->Apply(std::move(g));
}
template <typename Place, bool CreateVars = true, bool PrepareContext = false>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
......@@ -182,7 +175,6 @@ void TestInference(const std::string& dirname,
paddle::platform::DeviceContextPool::Instance().Get(place));
inference_program = InitProgram(&executor, scope, dirname, is_combined);
}
Compile(inference_program.get());
// Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
......@@ -261,5 +253,3 @@ void TestInference(const std::string& dirname,
delete scope;
}
USE_PASS(graph_to_program_pass);
......@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>);
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
......@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
const Out out_conj = Eigen::numext::conj(out);
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
dx.device(d) = static_cast<T>(0.5) * dout / out;
}
};
......@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(factor) *
x.pow(static_cast<T>(factor - static_cast<T>(1)));
x.pow(static_cast<T>(factor) - static_cast<T>(1));
}
};
......
......@@ -119,8 +119,8 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
// 2. m += g_m * g_m
math::scatter::Mul<platform::CPUDeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge);
auto grad_square =
SquareSelectedRows<platform::CPUDeviceContext, T>(context, grad_merge);
math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
functor(context, grad_square, moment);
......
......@@ -84,8 +84,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
framework::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m
math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge);
auto grad_square =
SquareSelectedRows<platform::CUDADeviceContext, T>(context, grad_merge);
math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
functor(context, grad_square, moment);
......
......@@ -28,6 +28,20 @@ struct SparseAdagradFunctor {
framework::Tensor *moment, framework::Tensor *param);
};
template <typename DeviceContext, typename T>
framework::SelectedRows SquareSelectedRows(
const DeviceContext &context, const framework::SelectedRows &input) {
framework::SelectedRows out;
out.set_rows(input.rows());
out.set_height(input.height());
out.mutable_value()->mutable_data<T>(input.value().dims(),
context.GetPlace());
auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in = framework::EigenVector<T>::Flatten(input.value());
e_out.device(*context.eigen_device()) = e_in.square();
return out;
}
template <typename DeviceContext, typename T>
class AdagradOpKernel : public framework::OpKernel<T> {
public:
......
......@@ -219,8 +219,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
if ((N * H * W * D) == 1) {
......@@ -272,8 +272,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const void *saved_mean_data = saved_mean->template data<T>();
const void *saved_var_data = saved_var->template data<T>();
const void *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const void *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
......@@ -281,10 +283,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
data_desc_, d_y->template data<T>(), data_desc_,
d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
scale->template data<T>(),
d_scale->template mutable_data<T>(ctx.GetPlace()),
d_bias->template mutable_data<T>(ctx.GetPlace()), epsilon,
saved_mean_data, saved_var_data));
scale->template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
// clean when exit.
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
......@@ -304,4 +306,5 @@ REGISTER_OP_CUDA_KERNEL(
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
......@@ -143,9 +143,11 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
VLOG(5) << "use cudnn_tensor_op_math";
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
VLOG(5) << "NOT use cudnn_tensor_op_math";
}
#endif
......@@ -361,7 +363,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>);
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
......
......@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
using CUDACtx = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(cross_entropy,
ops::CrossEntropyOpKernel<CUDACtx, float>,
ops::CrossEntropyOpKernel<CUDACtx, double>);
REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
ops::CrossEntropyOpKernel<CUDACtx, double>,
ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
......@@ -286,10 +286,10 @@ int GRPCVariableResponse::Parse(Source* source) {
platform::EnableProfiler(platform::ProfilerState::kCPU);
} else if (profiling == platform::kDisableProfiler &&
platform::IsProfileEnabled()) {
// TODO(panyx0718): Should we allow to customize file dir.
platform::DisableProfiler(
platform::EventSortingKey::kDefault,
string::Sprintf("/tmp/profile_ps_%lld", listener_id));
string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path,
listener_id));
}
break;
}
......
......@@ -51,7 +51,6 @@ bool RequestSendHandler::Handle(const std::string& varname,
// Async
if (!sync_mode_) {
VLOG(3) << "async process var: " << varname;
rpc_server_->Profiler().OneStep();
try {
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
scope);
......
......@@ -20,42 +20,10 @@
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32(rpc_server_profile_period, 0,
"the period of listen_and_serv to do profile");
DEFINE_string(rpc_server_profile_path, "/dev/null",
"the profile log file path");
namespace paddle {
namespace operators {
namespace distributed {
RPCServerProfiler::RPCServerProfiler(int profile_period,
const std::string& profile_log_path)
: profile_period_(profile_period), profile_log_path_(profile_log_path) {
step_ = 0;
}
void RPCServerProfiler::OneStep() {
PADDLE_ENFORCE_LE(step_, profile_period_,
"step_ should not be larger then "
"profile_period_");
if (profile_period_ <= 0) {
return;
}
if (step_ == 0) {
auto pf_state = paddle::platform::ProfilerState::kCPU;
paddle::platform::EnableProfiler(pf_state);
}
if (step_ == profile_period_) {
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
profile_log_path_);
step_ = 0;
} else {
step_++;
}
}
void RPCServer::ShutDown() {
LOG(INFO) << "RPCServer ShutDown ";
ShutDownImpl();
......
......@@ -23,30 +23,14 @@
#include "paddle/fluid/operators/distributed/request_handler.h"
DECLARE_int32(rpc_server_profile_period);
DECLARE_string(rpc_server_profile_path);
namespace paddle {
namespace operators {
namespace distributed {
class RPCServerProfiler {
public:
RPCServerProfiler(int profile_period, const std::string& profile_log_path);
void OneStep();
private:
const int profile_period_;
std::string profile_log_path_;
int step_;
};
class RPCServer {
public:
explicit RPCServer(const std::string& address, int client_num)
: cur_cond_(0),
profiler_(FLAGS_rpc_server_profile_period,
FLAGS_rpc_server_profile_path),
bind_address_(address),
exit_flag_(false),
selected_port_(0),
......@@ -86,7 +70,6 @@ class RPCServer {
void Complete();
void ResetBarrierCounter();
RPCServerProfiler& Profiler() { return profiler_; }
bool NeedResetAllVars();
......@@ -101,7 +84,6 @@ class RPCServer {
std::unordered_map<std::string, int> rpc_cond_map_;
std::atomic<int> cur_cond_;
std::condition_variable rpc_cond_;
RPCServerProfiler profiler_;
protected:
std::string bind_address_;
......
......@@ -16,6 +16,9 @@
#include <vector>
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
DEFINE_string(rpc_server_profile_path, "./profile_ps",
"the profile log file path");
namespace paddle {
namespace operators {
namespace distributed {
......
......@@ -27,6 +27,8 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
DECLARE_string(rpc_server_profile_path);
namespace paddle {
namespace operators {
namespace distributed {
......
......@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
......@@ -364,7 +364,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int j = blockIdx.x;
int i = threadIdx.x;
int tid = threadIdx.x;
T val = 0;
T val(0);
do {
int x_offset = i * w + j;
......@@ -432,7 +432,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int tid = threadIdx.x;
int j = blockIdx.x;
T val = 0;
T val(0);
int ttid = tid;
while (true) {
......
......@@ -134,7 +134,6 @@ void ListenAndServOp::RunSyncLoop(
rpc_service_->ResetBarrierCounter();
while (true) {
rpc_service_->Profiler().OneStep();
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(distributed::kRequestSend);
......
......@@ -51,7 +51,7 @@ struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
T* dy) const {
const int block_size = 512;
dim3 threads(block_size, 1);
dim3 grid(1, (rows + block_size - 1) / block_size);
dim3 grid((rows + block_size - 1) / block_size, 1);
CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
x_norm, y_norm, x, y, z, dz, rows, cols, dy);
}
......
......@@ -21,6 +21,16 @@ namespace operators {
namespace math {
namespace {
__device__ __forceinline__ float real_log(float x) { return logf(x); }
__device__ __forceinline__ double real_log(double x) { return log(x); }
__device__ __forceinline__ platform::float16 real_log(
const platform::float16& val) {
return static_cast<platform::float16>(logf(static_cast<float>(val)));
}
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
const int N, const int D,
......@@ -29,8 +39,8 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
i += blockDim.x * gridDim.x) {
PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index);
Y[i] = ignore_index == label[i]
? 0
: -math::TolerableValue<T>()(log(X[i * D + label[i]]));
? static_cast<T>(0)
: -math::TolerableValue<T>()(real_log(X[i * D + label[i]]));
}
}
......@@ -38,12 +48,12 @@ template <typename T>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
const int class_num) {
int tid = threadIdx.x;
T val = 0;
T val(0);
int idx = blockIdx.x * class_num + tid;
int end = blockIdx.x * class_num + class_num;
for (; idx < end; idx += blockDim.x) {
val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
val += math::TolerableValue<T>()(real_log(X[idx])) * label[idx];
}
val = paddle::platform::reduceSum(val, tid, blockDim.x);
......@@ -53,8 +63,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
}
} // namespace
using Tensor = framework::Tensor;
template <typename T>
class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
public:
......@@ -89,6 +97,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/hostdevice.h"
......@@ -33,6 +34,26 @@ struct TolerableValue {
}
};
// NOTE(dzh): float16 value clip behave different.
// 1. Our ValueClipping has a hardcore threshold 1e20
// for float number. 1e20 will resulting in overflow in float16.
// 2. float16 should expose the the real number overflow to python.
// because mixed-training depends the inf/nan value to determine
// if the scale value will be adjusted.
// Also. In standard implementation of cross entropy, other
// framework not has the ValueClipping.
template <>
struct TolerableValue<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& x) const {
if (platform::isfinite(x))
return x;
else if (x > static_cast<platform::float16>(0))
return std::numeric_limits<platform::float16>::max();
else
return std::numeric_limits<platform::float16>::min();
}
};
template <typename DeviceContext, typename T>
class CrossEntropyFunctor {
public:
......
......@@ -36,7 +36,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
.template Get<jitkernel::VAddReluKernel<T>>(N);
for (int i = 0; i < M; i++) {
T* dst = Y + i * N;
vaddrelu->Compute(B, dst, dst);
vaddrelu->Compute(B, dst, dst, N);
}
} else {
const auto& vadd = jitkernel::KernelPool::Instance()
......@@ -47,7 +47,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
#endif
for (int i = 0; i < M; i++) {
T* dst = Y + i * N;
vadd->Compute(B, dst, dst);
vadd->Compute(B, dst, dst, N);
}
}
}
......
......@@ -24,19 +24,29 @@ namespace gen {
using namespace platform::jit; // NOLINT
bool VMulJitCode::init(int d) {
bool VVVJitCode::init(int d) {
// It's not necessary to use avx512 since it would slow down the frequency
// and this kernel is not compute bound.
return MayIUse(avx);
}
void VMulJitCode::generate() {
void VVVJitCode::generate() {
// do not need push stack, and do not need save avx512reg if do not use avx512
int offset = 0;
if (with_relu_) {
vxorps(ymm_zero, ymm_zero, ymm_zero);
}
for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) {
vmovups(ymm_src1, ptr[param1 + offset]);
vmovups(ymm_src2, ptr[param2 + offset]);
vmulps(ymm_dst, ymm_src1, ymm_src2);
if (type_ == operand_type::mul) {
vmulps(ymm_dst, ymm_src1, ymm_src2);
} else if (type_ == operand_type::add) {
vaddps(ymm_dst, ymm_src1, ymm_src2);
}
if (with_relu_) {
vmaxps(ymm_dst, ymm_zero, ymm_dst);
}
vmovups(ptr[param3 + offset], ymm_dst);
offset += sizeof(float) * AVX_FLOAT_BLOCK;
}
......@@ -44,7 +54,14 @@ void VMulJitCode::generate() {
if (rest >= 4) {
vmovups(xmm_src1, ptr[param1 + offset]);
vmovups(xmm_src2, ptr[param2 + offset]);
vmulps(xmm_dst, xmm_src1, xmm_src2);
if (type_ == operand_type::mul) {
vmulps(xmm_dst, xmm_src1, xmm_src2);
} else if (type_ == operand_type::add) {
vaddps(xmm_dst, xmm_src1, xmm_src2);
}
if (with_relu_) {
vmaxps(xmm_dst, xmm_zero, xmm_dst);
}
vmovups(ptr[param3 + offset], xmm_dst);
offset += sizeof(float) * 4;
rest -= 4;
......@@ -52,7 +69,14 @@ void VMulJitCode::generate() {
if (rest >= 2) {
vmovq(xmm_src1, ptr[param1 + offset]);
vmovq(xmm_src2, ptr[param2 + offset]);
vmulps(xmm_dst, xmm_src1, xmm_src2);
if (type_ == operand_type::mul) {
vmulps(xmm_dst, xmm_src1, xmm_src2);
} else if (type_ == operand_type::add) {
vaddps(xmm_dst, xmm_src1, xmm_src2);
}
if (with_relu_) {
vmaxps(xmm_dst, xmm_zero, xmm_dst);
}
vmovq(ptr[param3 + offset], xmm_dst);
offset += sizeof(float) * 2;
rest -= 2;
......@@ -60,12 +84,18 @@ void VMulJitCode::generate() {
if (rest > 0) {
vmovss(xmm_src1, ptr[param1 + offset]);
vmovss(xmm_src2, ptr[param2 + offset]);
vmulss(xmm_dst, xmm_src1, xmm_src2);
if (type_ == operand_type::mul) {
vmulss(xmm_dst, xmm_src1, xmm_src2);
} else if (type_ == operand_type::add) {
vaddss(xmm_dst, xmm_src1, xmm_src2);
}
if (with_relu_) {
vmaxps(xmm_dst, xmm_zero, xmm_dst);
}
vmovss(ptr[param3 + offset], xmm_dst);
}
ret();
}
} // namespace gen
} // namespace jitkernel
} // namespace math
......
......@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/math/jit_gen.h"
namespace paddle {
namespace operators {
namespace math {
......@@ -29,28 +29,47 @@ using ymm_t = const Xbyak::Ymm;
using zmm_t = const Xbyak::Zmm;
using Label = Xbyak::Label;
class VMulJitCode : public JitCode {
// function: vec = Operand(vec, vec) (maybe with relu)
typedef enum { mul = 0, add } operand_type;
class VVVJitCode : public JitCode {
public:
DECLARE_JIT_CODE(VMulJitCode);
explicit VMulJitCode(int d, size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), num_(d) {}
const char* name() const override {
std::string base = "VVVJitCode";
if (type_ == operand_type::mul) {
base += "_Mul";
} else if (type_ == operand_type::add) {
base += "_Add";
}
base += (with_relu_ ? "_relu" : "");
return base.c_str();
}
explicit VVVJitCode(int d, operand_type type, bool with_relu,
size_t code_size = 256 * 1024, void* code_ptr = nullptr)
: JitCode(code_size, code_ptr),
num_(d),
type_(type),
with_relu_(with_relu) {}
static bool init(int d);
void generate() override;
private:
int num_;
operand_type type_;
bool with_relu_;
reg64_t param1{abi_param1};
reg64_t param2{abi_param2};
reg64_t param3{abi_param3};
xmm_t xmm_src1 = xmm_t(0);
xmm_t xmm_src2 = xmm_t(1);
xmm_t xmm_dst = xmm_t(2);
xmm_t xmm_dst = xmm_t(1);
xmm_t xmm_zero = xmm_t(2);
ymm_t ymm_src1 = ymm_t(0);
ymm_t ymm_src2 = ymm_t(1);
ymm_t ymm_dst = ymm_t(2);
ymm_t ymm_dst = ymm_t(1);
ymm_t ymm_zero = ymm_t(2);
};
} // namespace gen
......
......@@ -71,26 +71,26 @@ class VMulKernel : public Kernel {
template <typename T>
class VAddKernel : public Kernel {
public:
virtual void Compute(const T *x, const T *y, T *z) const = 0;
void (*Compute)(const T *, const T *, T *, int);
};
template <typename T>
class VScalKernel : public Kernel {
class VAddReluKernel : public Kernel {
public:
virtual void Compute(const T a, const T *x, T *y) const = 0;
virtual void Compute(const T a, T *x) const = 0;
void (*Compute)(const T *, const T *, T *, int);
};
template <typename T>
class VAddBiasKernel : public Kernel {
class VScalKernel : public Kernel {
public:
virtual void Compute(const T a, const T *x, T *y) const = 0;
virtual void Compute(const T a, T *x) const = 0;
};
template <typename T>
class VAddReluKernel : public Kernel {
class VAddBiasKernel : public Kernel {
public:
virtual void Compute(const T *x, const T *y, T *z) const = 0;
virtual void Compute(const T a, const T *x, T *y) const = 0;
};
template <typename T>
......
......@@ -38,6 +38,21 @@ void VMulRefer(const T* x, const T* y, T* z, int n) {
}
}
template <typename T>
void VAddRefer(const T* x, const T* y, T* z, int n) {
for (int i = 0; i < n; ++i) {
z[i] = x[i] + y[i];
}
}
template <typename T>
void VAddReluRefer(const T* x, const T* y, T* z, int n) {
for (int i = 0; i < n; ++i) {
z[i] = x[i] + y[i];
z[i] = z[i] > 0 ? z[i] : 0;
}
}
#ifdef PADDLE_WITH_MKLML
template <typename T>
void VMulMKL(const T* x, const T* y, T* z, int n);
......@@ -46,28 +61,45 @@ template <>
void VMulMKL<float>(const float* x, const float* y, float* z, int n) {
platform::dynload::vsMul(n, x, y, z);
}
template <>
void VMulMKL<double>(const double* x, const double* y, double* z, int n) {
platform::dynload::vdMul(n, x, y, z);
}
template <typename T>
void VAddMKL(const T* x, const T* y, T* z, int n);
template <>
void VAddMKL<float>(const float* x, const float* y, float* z, int n) {
platform::dynload::vsAdd(n, x, y, z);
}
template <>
void VAddMKL<double>(const double* x, const double* y, double* z, int n) {
platform::dynload::vdAdd(n, x, y, z);
}
#endif
#define DECLARE_STATIC_FUNC \
static inline std::string name(int d) { \
PADDLE_THROW("DType should be either float or double"); \
} \
static inline bool useJIT(int d) { return false; } \
static inline bool useMKL(int d) { return false; }
/* VMUL JitKernel */
template <typename T>
class VMulKernelImpl : public VMulKernel<T> {
public:
static inline std::string name(int d) {
PADDLE_THROW("DType should be either float or double");
}
static inline bool useJIT(int d) { return false; }
static inline bool useMKL(int d) { return false; }
DECLARE_STATIC_FUNC;
explicit VMulKernelImpl(int d) : VMulKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) {
// roughly estimate the size of code
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VMulJitCode(d, sz > 4096 ? sz : 4096));
jitcode_.reset(new gen::VVVJitCode(d, gen::operand_type::mul, false,
sz > 4096 ? sz : 4096));
this->Compute =
jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
return;
......@@ -85,14 +117,14 @@ class VMulKernelImpl : public VMulKernel<T> {
#ifdef PADDLE_WITH_XBYAK
private:
std::unique_ptr<gen::VMulJitCode> jitcode_{nullptr};
std::unique_ptr<gen::VVVJitCode> jitcode_{nullptr};
#endif
};
#ifdef PADDLE_WITH_XBYAK
template <>
bool VMulKernelImpl<float>::useJIT(int d) {
return gen::VMulJitCode::init(d);
return gen::VVVJitCode::init(d);
}
#endif
......@@ -108,63 +140,93 @@ bool VMulKernelImpl<double>::useMKL(int d) {
}
#endif
REGISTER_JITKERNEL(vmul, VMulKernel);
/* VADD JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
/* VAdd JitKernel */
template <typename T>
class VAddKernelImpl : public VAddKernel<T> {
public:
explicit VAddKernelImpl(int d) : VAddKernel<T>() { this->num_ = d; }
void Compute(const T* x, const T* y, T* z) const override {
for (int i = 0; i < this->num_; ++i) {
z[i] = x[i] + y[i];
DECLARE_STATIC_FUNC;
explicit VAddKernelImpl(int d) : VAddKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) {
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VVVJitCode(d, gen::operand_type::add, false,
sz > 4096 ? sz : 4096));
this->Compute =
jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
return;
}
#endif
#ifdef PADDLE_WITH_MKLML
if (useMKL(d)) {
this->Compute = VAddMKL<T>;
return;
}
#endif
this->Compute = VAddRefer<T>;
}
#ifdef PADDLE_WITH_XBYAK
private:
std::unique_ptr<gen::VVVJitCode> jitcode_{nullptr};
#endif
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VAddKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsAdd(this->num_, x, y, z); \
}
#ifdef PADDLE_WITH_XBYAK
template <>
bool VAddKernelImpl<float>::useJIT(int d) {
return gen::VVVJitCode::init(d);
}
#endif
#define MKL_DOUBLE(isa, block) \
template <> \
void VAddKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdAdd(this->num_, x, y, z); \
}
#ifdef PADDLE_WITH_MKLML
template <>
bool VAddKernelImpl<float>::useMKL(int d) {
return d > 512;
}
FOR_EACH_ISA(MKL_FLOAT, kGT16);
FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
template <>
bool VAddKernelImpl<double>::useMKL(int d) {
return true;
}
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VAddKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_add_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
/* VAddRelu JitKernel */
template <typename T>
class VAddReluKernelImpl : public VAddReluKernel<T> {
public:
DECLARE_STATIC_FUNC;
explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) {
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VVVJitCode(d, gen::operand_type::add, true,
sz > 4096 ? sz : 4096));
this->Compute =
jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
return;
}
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
this->Compute = VAddReluRefer<T>;
}
#ifdef PADDLE_WITH_XBYAK
private:
std::unique_ptr<gen::VVVJitCode> jitcode_{nullptr};
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f);
};
#ifdef PADDLE_WITH_XBYAK
template <>
bool VAddReluKernelImpl<float>::useJIT(int d) {
return gen::VVVJitCode::init(d);
}
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
#undef DECLARE_STATIC_FUNC
REGISTER_JITKERNEL(vmul, VMulKernel);
REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
/* VSCAL JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
......@@ -401,98 +463,9 @@ class VIdentityKernelImpl : public VIdentityKernel<T> {
void Compute(const T* x, T* y) const override {}
};
/* VAddRelu JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VAddReluKernelImpl : public VAddReluKernel<T> {
public:
explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() { this->num_ = d; }
void Compute(const T* x, const T* y, T* z) const override {
for (int i = 0; i < this->num_; ++i) {
z[i] = x[i] + y[i];
z[i] = z[i] > 0 ? z[i] : 0;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx = _mm256_loadu_ps(x); \
__m256 tmpy = _mm256_loadu_ps(y); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, _mm256_setzero_ps()); \
_mm256_storeu_ps(z, tmpy); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ16>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(y); \
tmp0 = _mm256_add_ps(tmp0, tmp1); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_loadu_ps(x + 8); \
__m256 tmp2 = _mm256_loadu_ps(y + 8); \
tmp1 = _mm256_add_ps(tmp1, tmp2); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(z, tmp0); \
_mm256_storeu_ps(z + 8, tmp1); \
}
#define INTRI_COMMON_FLOAT(isa, block) \
template <> \
VAddReluKernelImpl<float, isa, block>::VAddReluKernelImpl(int d) \
: VAddReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
} \
template <> \
void VAddReluKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmpx = _mm256_loadu_ps(x + i); \
__m256 tmpy = _mm256_loadu_ps(y + i); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, zeros); \
_mm256_storeu_ps(z + i, tmpy); \
} \
for (int i = this->end_; i < this->num_; ++i) { \
z[i] = x[i] + y[i]; \
z[i] = z[i] > 0 ? z[i] : 0; \
} \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
INTRI16_FLOAT(jit::avx);
INTRI_COMMON_FLOAT(jit::avx, kGT16);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
INTRI_COMMON_FLOAT(jit::avx2, kGT16);
#endif
#ifdef __AVX512F__
// TODO(TJ): refine avx512
INTRI8_FLOAT(jit::avx512f);
INTRI16_FLOAT(jit::avx512f);
INTRI_COMMON_FLOAT(jit::avx512f, kGT16);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_COMMON_FLOAT
REGISTER_JITKERNEL_DEPRECATED(vadd, VAddKernel);
REGISTER_JITKERNEL_DEPRECATED(vscal, VScalKernel);
REGISTER_JITKERNEL_DEPRECATED(vaddb, VAddBiasKernel);
REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel);
REGISTER_JITKERNEL_DEPRECATED(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel);
} // namespace jitkernel
......
......@@ -177,7 +177,7 @@ class LSTMKernelImpl : public LSTMKernel<T> {
act_cand_d_->Compute(gates, gates);
vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
vadd_d_->Compute(gates + d_, gates + d2_, ct);
vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_->Compute(ct, gates + d2_);
......@@ -287,16 +287,16 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
/* get fgated and igated*/
vmul_d_->Compute(wp_data, ct_1, checked, d_);
vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_);
vadd_d2_->Compute(checked, gates + d_, gates + d_);
vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_);
act_gate_d2_->Compute(gates + d_, gates + d_);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_->Compute(gates, gates);
vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
vadd_d_->Compute(gates + d_, gates + d2_, ct);
vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
/* get ogated*/
vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
act_gate_d_->Compute(gates + d3_, gates + d3_);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_->Compute(ct, gates + d2_);
......@@ -310,7 +310,7 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
vmul_d_->Compute(gates, gates + d_, ct, d_);
/* get outgated, put W_oc * C_t on igated */
vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_->Compute(gates + d3_, gates + d3_);
act_cell_d_->Compute(ct, gates + d2_);
......
......@@ -371,7 +371,7 @@ void lstm_ctht_better(
vtanh_d->Compute(gates, gates);
vmul_d->Compute(gates, gates + d, gates + d, d);
vmul_d->Compute(ct_1, gates + d2, gates + d2, d);
vadd_d->Compute(gates + d, gates + d2, ct);
vadd_d->Compute(gates + d, gates + d2, ct, d);
/* H_t = act_cell(C_t) * ogated */
vtanh_d->Compute(ct, gates + d2);
vmul_d->Compute(gates + d2, gates + d * 3, ht, d);
......@@ -695,7 +695,7 @@ TEST(JitKernel, vadd) {
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, y_data, ztgt_data);
ker->Compute(x_data, y_data, ztgt_data, d);
}
auto ttgte = GetCurrentUS();
......@@ -723,8 +723,8 @@ void vaddrelu_better(
const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd,
const std::shared_ptr<
const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu,
const float* x, const float* y, float* z) {
vadd->Compute(x, y, z);
const float* x, const float* y, float* z, int d) {
vadd->Compute(x, y, z, d);
vrelu->Compute(z, z);
}
......@@ -752,12 +752,12 @@ TEST(JitKernel, vaddrelu) {
auto trefe = GetCurrentUS();
auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data);
vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data, d);
}
auto tmkle = GetCurrentUS();
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, y_data, ztgt_data);
ker->Compute(x_data, y_data, ztgt_data, d);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
......@@ -801,7 +801,11 @@ TEST(JitKernel, pool) {
std::dynamic_pointer_cast<const jit::Kernel>(pvmul_d));
const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulfjit4");
EXPECT_EQ(pvmul_f, pvmul_from_key);
#if defined(__APPLE__) || defined(__OSX__) || defined(_WIN32)
EXPECT_EQ(pvmul_from_key, nullptr);
#else
EXPECT_EQ(pvmul_from_key, pvmul_f);
#endif
const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulfjit");
EXPECT_TRUE(pvmul_from_key2 == nullptr);
}
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
......@@ -81,7 +82,7 @@ template <typename T, int block_size>
__global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
const int64_t* rows, T* tensor_out,
int64_t row_numel) {
const int ty = blockIdx.y;
const int ty = blockIdx.x;
int tid = threadIdx.x;
selected_rows += ty * row_numel;
......@@ -119,11 +120,11 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto* out_data = output->data<T>();
SetConstant<platform::CUDADeviceContext, T> functor;
functor(context, output, 0.0);
functor(context, output, static_cast<T>(0));
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(1, in1_rows.size());
dim3 grid(in1_rows.size(), 1);
SelectedRowsAddTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.CUDAData(context.GetPlace()), out_data,
......@@ -137,6 +138,9 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;
template <typename T>
struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
......@@ -176,6 +180,8 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext,
platform::float16>;
namespace {
template <typename T, int block_size>
......@@ -183,7 +189,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
const int64_t* rows,
T* tensor_out,
int64_t row_numel) {
const int ty = blockIdx.y;
const int ty = blockIdx.x;
int tid = threadIdx.x;
selected_rows += ty * row_numel;
......@@ -216,7 +222,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
auto* in2_data = input2->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(1, in1_rows.size());
dim3 grid(in1_rows.size(), 1);
SelectedRowsAddToTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data,
......@@ -228,6 +234,8 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
namespace scatter {
......@@ -288,7 +296,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
constant_functor(context, out.mutable_value(), static_cast<T>(0));
auto* out_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>();
......@@ -348,7 +356,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
constant_functor(context, out.mutable_value(), static_cast<T>(0));
auto* out_data = out.mutable_value()->data<T>();
......@@ -375,12 +383,13 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows,
const int64_t* rows, const ScatterOps& op,
T* tensor_out, int64_t row_numel) {
const int ty = blockIdx.y;
const int ty = blockIdx.x;
int tid = threadIdx.x;
selected_rows += ty * row_numel;
......@@ -449,7 +458,7 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
auto* in2_data = input2->data<T>();
dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
dim3 grid(1, in1_rows.size());
dim3 grid(in1_rows.size(), 1);
UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
op, in2_data, in1_row_numel);
......
......@@ -88,57 +88,6 @@ struct MergeAdd {
framework::SelectedRows* output);
};
template <typename DeviceContext, typename T>
struct Add {
framework::SelectedRows operator()(const DeviceContext& context,
const framework::SelectedRows& input1,
const framework::SelectedRows& input2) {
framework::SelectedRows out;
out.set_rows(input1.rows());
out.set_height(input1.height());
out.mutable_value()->mutable_data<T>(input1.value().dims(),
context.GetPlace());
auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
e_out.device(*context.eigen_device()) = e_in1 + e_in2;
return out;
}
};
template <typename DeviceContext, typename T>
struct Mul {
// multiply two SelectedRows
framework::SelectedRows operator()(const DeviceContext& context,
const framework::SelectedRows& input1,
const framework::SelectedRows& input2) {
framework::SelectedRows out;
out.set_rows(input1.rows());
out.set_height(input1.height());
out.mutable_value()->mutable_data<T>(input1.value().dims(),
context.GetPlace());
auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
e_out.device(*context.eigen_device()) = e_in1 * e_in2;
return out;
}
// multiply scalar to SelectedRows
framework::SelectedRows operator()(const DeviceContext& context,
const framework::SelectedRows& input1,
const T input2) {
framework::SelectedRows out;
out.set_rows(input1.rows());
out.set_height(input1.height());
out.mutable_value()->mutable_data<T>(input1.value().dims(),
context.GetPlace());
auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
e_out.device(*context.eigen_device()) = input2 * e_in1;
return out;
}
};
enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
// out = seleted_rows_in / tensor
......
......@@ -96,12 +96,15 @@ template class SoftmaxCUDNNFunctor<float>;
template class SoftmaxCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<float>;
template class SoftmaxGradCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<platform::float16>;
template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::float16>;
} // namespace math
} // namespace operators
......
......@@ -15,11 +15,15 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
......@@ -55,8 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
IG->mutable_data<T>(context.GetPlace());
T ig_size = static_cast<T>(IG->numel());
Eigen::DSizes<int, 1> bcast(ig_size);
Eigen::DSizes<int, 1> bcast(static_cast<int>(ig_size));
EigenVector<T>::Flatten(*IG).device(
*context.template device_context<DeviceContext>().eigen_device()) =
(EigenVector<T>::From(*OG) / ig_size).broadcast(bcast);
......
......@@ -20,6 +20,7 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
ops::MulKernel<plat::CUDADeviceContext, double>,
ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(mul_grad,
ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>,
ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
......@@ -178,7 +178,8 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNGradOpKernel<float>,
ops::PoolCUDNNGradOpKernel<double>);
ops::PoolCUDNNGradOpKernel<double>,
ops::PoolCUDNNGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<float>,
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
scale,
......@@ -20,4 +22,6 @@ REGISTER_OP_CUDA_KERNEL(
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
int64_t>);
int64_t>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
......@@ -80,4 +80,5 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
ops::SoftmaxCUDNNKernel<plat::float16>);
REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
ops::SoftmaxGradCUDNNKernel<float>,
ops::SoftmaxGradCUDNNKernel<double>);
ops::SoftmaxGradCUDNNKernel<double>,
ops::SoftmaxGradCUDNNKernel<plat::float16>);
......@@ -23,4 +23,5 @@ REGISTER_OP_CUDA_KERNEL(
ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
......@@ -11,10 +11,13 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
......@@ -61,7 +61,7 @@ class SumKernel : public framework::OpKernel<T> {
if (start != 2) {
math::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(),
out, 0.0);
out, static_cast<T>(0));
}
}
......
......@@ -223,7 +223,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Add outputs
for (auto& output : output_maps) {
engine->DeclareOutput(output);
if (!engine->HasDeclared(output)) {
engine->DeclareOutput(output);
}
}
engine->FreezeNetwork();
......
......@@ -116,6 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
platform::SetNumThreads(FLAGS_paddle_num_threads);
#endif
#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__)
if (platform::jit::MayIUse(platform::jit::avx)) {
#ifndef __AVX__
LOG(WARNING) << "AVX is available, Please re-compile on local machine";
......@@ -157,8 +158,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
AVX_GUIDE(AVX, NonAVX);
}
#endif
#undef AVX_GUIDE
#endif
}
void InitGLOG(const std::string &prog_name) {
......
......@@ -226,7 +226,7 @@ RecordBlock::~RecordBlock() {
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ",
"Can't enable profiling, since the input state is ",
"ProfilerState::kDisabled");
std::lock_guard<std::mutex> l(profiler_mu);
......
......@@ -742,7 +742,12 @@ All parameter, weight, gradient are variables in Paddle.
will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`.
)DOC");
)DOC")
.def_property("_dry_run",
[](const ExecutionStrategy &self) { return self.dry_run_; },
[](ExecutionStrategy &self, bool dry_run) {
self.dry_run_ = dry_run;
});
exec_strategy.def_property(
"use_experimental_executor",
......
......@@ -118,7 +118,6 @@ def __bootstrap__():
]
if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_server_profile_period')
read_env_flags.append('rpc_server_profile_path')
read_env_flags.append('enable_rpc_profiler')
read_env_flags.append('rpc_send_thread_num')
......
......@@ -65,7 +65,7 @@ def is_persistable(var):
Examples:
.. code-block:: python
param = fluid.default_main_program().global_block().var('fc.w')
param = fluid.default_main_program().global_block().var('fc.b')
res = fluid.io.is_persistable(param)
"""
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
......@@ -625,8 +625,13 @@ def save_inference_model(dirname,
main_program._distributed_lookup_table,
main_program._endpoints)
if not os.path.isdir(dirname):
# when a pserver and a trainer running on the same machine, mkdir may conflict
try:
os.makedirs(dirname)
except OSError as e:
if e.errno != errno.EEXIST:
raise
if model_filename is not None:
model_basename = os.path.basename(model_filename)
else:
......
......@@ -60,7 +60,7 @@ def data(name,
For example if shape=[1], the resulting shape is [-1, 1].
2. If shape contains -1, such as shape=[1, -1],
append_batch_size will be enforced to be be False (ineffective).
dtype(int|float): The type of data : float32, float_16, int etc
dtype(basestring): The type of data : float32, float_16, int etc
type(VarType): The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence.
stop_gradient(bool): A boolean that mentions whether gradient should flow.
......
......@@ -41,9 +41,6 @@ def convert_reader_to_recordio_file(
"""
Convert a Python Reader to a recordio file.
Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for
details.
Examples:
>>> import paddle.fluid as fluid
......
......@@ -54,14 +54,6 @@ def get_numeric_gradient(place,
def product(dim):
return six.moves.reduce(lambda a, b: a * b, dim, 1)
def get_output():
sum = []
op.run(scope, place)
for output_name in output_names:
sum.append(
np.array(scope.find_var(output_name).get_tensor()).mean())
return np.array(sum).sum() / len(output_names)
tensor_to_check = scope.find_var(input_to_check).get_tensor()
tensor_size = product(tensor_to_check.shape())
tensor_to_check_dtype = tensor_to_check._dtype()
......@@ -77,6 +69,15 @@ def get_numeric_gradient(place,
raise ValueError("Not supported data type " + str(
tensor_to_check_dtype))
def get_output():
sum = []
op.run(scope, place)
for output_name in output_names:
sum.append(
np.array(scope.find_var(output_name).get_tensor()).astype(
tensor_to_check_dtype).mean())
return tensor_to_check_dtype(np.array(sum).sum() / len(output_names))
gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
def __get_elem__(tensor, i):
......
......@@ -223,106 +223,81 @@ class TestWithInput1x1Filter1x1(TestConv2dOp):
#----------------Conv2dCUDNN----------------
class TestCUDNN(TestConv2dOp):
def init_kernel_type(self):
self.use_cudnn = True
class TestFP16CUDNN(TestConv2dOp):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
def create_test_cudnn_class(parent, cls_name):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNCase(parent):
def init_kernel_type(self):
self.use_cudnn = True
cls_name = "{0}".format(cls_name)
TestCUDNNCase.__name__ = cls_name
globals()[cls_name] = TestCUDNNCase
class TestCUDNNWithPad(TestWithPad):
def init_kernel_type(self):
self.use_cudnn = True
class TestFP16CUDNNWithPad(TestWithPad):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestCUDNNWithStride(TestWithStride):
def init_kernel_type(self):
self.use_cudnn = True
class TestFP16CUDNNWithStride(TestWithStride):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
create_test_cudnn_class(TestConv2dOp, "TestPool2DCUDNNOp")
create_test_cudnn_class(TestWithPad, "TestPool2DCUDNNOpCase1")
create_test_cudnn_class(TestWithStride, "TestPool2DCUDNNOpCase2")
create_test_cudnn_class(TestWithGroup, "TestPool2DCUDNNOpCase3")
create_test_cudnn_class(TestWith1x1, "TestPool2DCUDNNOpCase4")
create_test_cudnn_class(TestWithInput1x1Filter1x1, "TestPool2DCUDNNOpCase4")
class TestCUDNNWithGroup(TestWithGroup):
def init_kernel_type(self):
self.use_cudnn = True
class TestFP16CUDNNWithGroup(TestWithGroup):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
#----------------Conv2dCUDNN----------------
class TestCUDNNWith1x1(TestWith1x1):
def init_kernel_type(self):
self.use_cudnn = True
def create_test_cudnn_fp16_class(parent, cls_name, grad_check=True):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestConv2DCUDNNFp16(parent):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
class TestFP16CUDNNWith1x1(TestWith1x1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
def test_check_output(self):
if core.is_compiled_with_cuda():
def test_check_grad_no_filter(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestCUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_cudnn = True
class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
if core.is_float16_supported(place) and grad_check:
self.check_grad_with_place(
place, ['Input'],
'Output',
max_relative_error=0.02,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
if core.is_float16_supported(place) and grad_check:
self.check_grad_with_place(
place, ['Filter'],
'Output',
max_relative_error=0.02,
no_grad_set=set(['Input']))
cls_name = "{0}".format(cls_name)
TestConv2DCUDNNFp16.__name__ = cls_name
globals()[cls_name] = TestConv2DCUDNNFp16
create_test_cudnn_fp16_class(
TestConv2dOp, "TestPool2DCUDNNFp16Op", grad_check=False)
create_test_cudnn_fp16_class(
TestWithPad, "TestPool2DCUDNNFp16OpCase1", grad_check=False)
create_test_cudnn_fp16_class(
TestWithStride, "TestPool2DCUDNNFp16OpCase2", grad_check=False)
create_test_cudnn_fp16_class(
TestWithGroup, "TestPool2DCUDNNFp16OpCase3", grad_check=False)
create_test_cudnn_fp16_class(
TestWith1x1, "TestPool2DCUDNNFp16OpCase4", grad_check=False)
create_test_cudnn_fp16_class(
TestWithInput1x1Filter1x1, "TestPool2DCUDNNFp16OpCase4", grad_check=False)
# -------TestDepthwiseConv
class TestDepthwiseConv(TestConv2dOp):
......
......@@ -16,28 +16,58 @@ from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest, randomize_probability
class TestCrossEntropyOp1(OpTest):
class TestCrossEntropyOp(OpTest):
"""Test cross-entropy with discrete one-hot labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
batch_size = 30
class_num = 10
self.soft_label = False
self.ignore_index = -100
self.dtype = np.float64
self.batch_size = 30
self.class_num = 10
self.init_dtype_type()
self.init_attr_type()
self.init_bs_class_num()
self.init_x()
self.init_label()
self.get_cross_entropy()
self.inputs = {"X": self.x, "Label": self.label}
self.outputs = {"Y": self.cross_entropy}
self.attrs = {
"soft_label": self.soft_label,
"ignore_index": self.ignore_index
}
def init_x(self):
self.x = randomize_probability(
self.batch_size, self.class_num, dtype=self.dtype)
def init_label(self):
self.label = np.random.randint(
0, self.class_num, (self.batch_size, 1), dtype="int64")
def get_cross_entropy(self):
self.cross_entropy = np.asmatrix(
[[-np.log(self.x[i][self.label[i][0]])]
for i in range(self.x.shape[0])],
dtype="float64")
X = randomize_probability(batch_size, class_num, dtype='float64')
def init_attr_type(self):
pass
label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
cross_entropy = np.asmatrix(
[[-np.log(X[i][label[i][0]])] for i in range(X.shape[0])],
dtype="float64")
def init_dtype_type(self):
pass
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": False}
def init_bs_class_num(self):
pass
def test_check_output(self):
self.check_output()
......@@ -46,197 +76,231 @@ class TestCrossEntropyOp1(OpTest):
self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
class TestCrossEntropyOp2(OpTest):
class TestCrossEntropyOp2(TestCrossEntropyOp):
"""Test cross-entropy with vectorized soft labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
batch_size = 5
class_num = 37
def init_label(self):
self.label = np.random.uniform(
0.1, 1.0, [self.batch_size, self.class_num]).astype(self.dtype)
self.label /= self.label.sum(axis=1, keepdims=True)
X = randomize_probability(batch_size, class_num)
label = np.random.uniform(0.1, 1.0,
[batch_size, class_num]).astype("float32")
label /= label.sum(axis=1, keepdims=True)
cross_entropy = (-label * np.log(X)).sum(
axis=1, keepdims=True).astype("float32")
def get_cross_entropy(self):
self.cross_entropy = (-self.label * np.log(self.x)).sum(
axis=1, keepdims=True).astype(self.dtype)
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": True}
def init_attr_type(self):
self.soft_label = True
def test_check_output(self):
self.check_output()
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.batch_size = 5
self.class_num = 37
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp3(OpTest):
class TestCrossEntropyOp3(TestCrossEntropyOp):
"""Test cross-entropy with vectorized one-hot representation of labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
batch_size = 5
class_num = 17
def init_label(self):
self.label_index = np.random.randint(0, self.class_num,
(self.batch_size))
self.label = np.zeros(self.x.shape).astype(self.dtype)
self.label[np.arange(self.batch_size), self.label_index] = 1
X = randomize_probability(batch_size, class_num)
label_index = np.random.randint(
0, class_num, (batch_size), dtype="int32")
label = np.zeros(X.shape)
label[np.arange(batch_size), label_index] = 1
def get_cross_entropy(self):
self.cross_entropy = np.asmatrix(
[[-np.log(self.x[i][self.label_index[i]])]
for i in range(self.x.shape[0])]).astype(self.dtype)
cross_entropy = np.asmatrix(
[[-np.log(X[i][label_index[i]])] for i in range(X.shape[0])],
dtype="float32")
cross_entropy2 = (-label * np.log(X)).sum(
axis=1, keepdims=True).astype("float32")
def init_attr_type(self):
self.soft_label = True
self.inputs = {"X": X, "Label": label.astype(np.float32)}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": True}
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
self.check_output()
def init_bs_class_num(self):
self.batch_size = 5
self.class_num = 17
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp4(OpTest):
class TestCrossEntropyOp4(TestCrossEntropyOp):
"""Test high rank tensor cross-entropy with discrete one-hot labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
shape = [10, 2, 4]
ins_num = np.prod(np.array(shape))
class_num = 10
def init_x(self):
self.shape = [10, 2, 4]
self.ins_num = np.prod(np.array(self.shape))
self.X_2d = randomize_probability(self.ins_num,
self.class_num).astype(self.dtype)
self.x = self.X_2d.reshape(self.shape + [self.class_num])
X_2d = randomize_probability(ins_num, class_num, dtype='float64')
def init_label(self):
self.label_2d = np.random.randint(
0, self.class_num, (self.ins_num, 1), dtype="int64")
self.label = self.label_2d.reshape(self.shape + [1])
label_2d = np.random.randint(0, class_num, (ins_num, 1), dtype="int64")
def get_cross_entropy(self):
cross_entropy_2d = np.asmatrix(
[[-np.log(X_2d[i][label_2d[i][0]])] for i in range(X_2d.shape[0])],
dtype="float64")
[[-np.log(self.X_2d[i][self.label_2d[i][0]])]
for i in range(self.X_2d.shape[0])]).astype(self.dtype)
self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
[1])
X = X_2d.reshape(shape + [class_num])
label = label_2d.reshape(shape + [1])
cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
def init_attr_type(self):
self.soft_label = False
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": False}
def test_check_output(self):
self.check_output()
def init_dtype_type(self):
self.dtype = np.float64
def test_check_grad(self):
self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
def init_bs_class_num(self):
self.class_num = 10
class TestCrossEntropyOp5(OpTest):
class TestCrossEntropyOp5(TestCrossEntropyOp):
"""Test high rank tensor cross-entropy with vectorized soft labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
shape = [4, 3]
ins_num = np.prod(np.array(shape))
class_num = 37
def init_x(self):
self.shape = [4, 3]
self.ins_num = np.prod(np.array(self.shape))
self.X_2d = randomize_probability(self.ins_num,
self.class_num).astype(self.dtype)
self.x = self.X_2d.reshape(self.shape + [self.class_num])
X_2d = randomize_probability(ins_num, class_num)
label_2d = np.random.uniform(0.1, 1.0,
[ins_num, class_num]).astype("float32")
label_2d /= label_2d.sum(axis=1, keepdims=True)
cross_entropy_2d = (-label_2d * np.log(X_2d)).sum(
axis=1, keepdims=True).astype("float32")
def init_label(self):
self.label_2d = np.random.uniform(
0.1, 1.0, [self.ins_num, self.class_num]).astype(self.dtype)
self.label_2d /= self.label_2d.sum(axis=1, keepdims=True)
self.label = self.label_2d.reshape(self.shape + [self.class_num])
X = X_2d.reshape(shape + [class_num])
label = label_2d.reshape(shape + [class_num])
cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
def get_cross_entropy(self):
cross_entropy_2d = (-self.label_2d * np.log(self.X_2d)).sum(
axis=1, keepdims=True).astype(self.dtype)
self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
[1])
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": True}
def init_attr_type(self):
self.soft_label = True
def test_check_output(self):
self.check_output()
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.class_num = 37
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp6(OpTest):
class TestCrossEntropyOp6(TestCrossEntropyOp):
"""Test high rank tensor cross-entropy with vectorized one-hot representation of labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
shape = [4, 3, 2]
ins_num = np.prod(np.array(shape))
class_num = 17
X_2d = randomize_probability(ins_num, class_num)
label_index_2d = np.random.randint(
0, class_num, (ins_num), dtype="int32")
label_2d = np.zeros(X_2d.shape)
label_2d[np.arange(ins_num), label_index_2d] = 1
def init_x(self):
self.shape = [4, 3, 2]
self.ins_num = np.prod(np.array(self.shape))
self.X_2d = randomize_probability(self.ins_num,
self.class_num).astype(self.dtype)
self.x = self.X_2d.reshape(self.shape + [self.class_num])
def init_label(self):
self.label_index_2d = np.random.randint(
0, self.class_num, (self.ins_num), dtype="int64")
label_2d = np.zeros(self.X_2d.shape)
label_2d[np.arange(self.ins_num), self.label_index_2d] = 1
self.label = label_2d.reshape(self.shape + [self.class_num]).astype(
self.dtype)
def get_cross_entropy(self):
cross_entropy_2d = np.asmatrix(
[[-np.log(X_2d[i][label_index_2d[i]])]
for i in range(X_2d.shape[0])],
dtype="float32")
[[-np.log(self.X_2d[i][self.label_index_2d[i]])]
for i in range(self.X_2d.shape[0])])
self.cross_entropy = np.array(cross_entropy_2d).reshape(
self.shape + [1]).astype(self.dtype)
X = X_2d.reshape(shape + [class_num])
label = label_2d.reshape(shape + [class_num])
cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
def init_attr_type(self):
self.soft_label = True
self.inputs = {"X": X, "Label": label.astype(np.float32)}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": True}
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
self.check_output()
def init_bs_class_num(self):
self.class_num = 17
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp7(OpTest):
class TestCrossEntropyOp7(TestCrossEntropyOp):
"""Test cross-entropy with ignore index.
"""
def setUp(self):
self.op_type = "cross_entropy"
batch_size = 30
class_num = 10
ignore_index = 3
X = randomize_probability(batch_size, class_num, dtype='float64')
label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
cross_entropy = np.asmatrix(
[[-np.log(X[i][label[i][0]])]
if label[i][0] != ignore_index else [0]
for i in range(X.shape[0])],
dtype="float64")
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": False, "ignore_index": ignore_index}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
def init_label(self):
self.label = np.random.randint(
0, self.class_num, (self.batch_size, 1), dtype="int64")
def get_cross_entropy(self):
self.cross_entropy = np.asmatrix(
[[-np.log(self.x[i][self.label[i][0]])]
if self.label[i][0] != self.ignore_index else [0]
for i in range(self.x.shape[0])]).astype(self.dtype)
def init_attr_type(self):
self.soft_label = False
self.ignore_index = 3
def init_dtype_type(self):
self.dtype = np.float64
def init_bs_class_num(self):
self.batch_size = 30
self.class_num = 10
# Add Fp16 test
def create_test_class(parent, cls_name):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCrossEntropyFP16Op(parent):
def init_dtype_type(self):
return np.float16
def test_check_output(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-1)
def test_check_grad(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X'], 'Y', max_relative_error=0.9)
cls_name = "{0}".format(cls_name)
TestCrossEntropyFP16Op.__name__ = cls_name
globals()[cls_name] = TestCrossEntropyFP16Op
create_test_class(TestCrossEntropyOp, "TestCrossEntropyF16Op")
#create_test_class(TestCrossEntropyOp2, "TestCrossEntropyF16Op2")
create_test_class(TestCrossEntropyOp3, "TestCrossEntropyF16Op3")
create_test_class(TestCrossEntropyOp4, "TestCrossEntropyF16Op4")
#create_test_class(TestCrossEntropyOp5, "TestCrossEntropyF16Op5")
create_test_class(TestCrossEntropyOp6, "TestCrossEntropyF16Op6")
create_test_class(TestCrossEntropyOp7, "TestCrossEntropyF16Op7")
if __name__ == "__main__":
unittest.main()
......@@ -17,14 +17,20 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
class TestMeanOp(OpTest):
def setUp(self):
self.op_type = "mean"
self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
self.dtype = np.float32
self.init_dtype_type()
self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)}
self.outputs = {'Out': np.mean(self.inputs["X"])}
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
......@@ -32,5 +38,23 @@ class TestMeanOp(OpTest):
self.check_grad(['X'], 'Out')
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestFP16MeanOp(TestMeanOp):
def init_dtype_type(self):
self.dtype = np.float16
def test_check_output(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-3)
def test_checkout_grad(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.8)
if __name__ == "__main__":
unittest.main()
......@@ -23,12 +23,17 @@ from op_test import OpTest
class TestMulOp(OpTest):
def setUp(self):
self.op_type = "mul"
self.dtype = np.float32
self.init_dtype_type()
self.inputs = {
'X': np.random.random((2, 5)).astype("float32"),
'Y': np.random.random((5, 3)).astype("float32")
'X': np.random.random((2, 5)).astype(self.dtype),
'Y': np.random.random((5, 3)).astype(self.dtype)
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
......@@ -47,9 +52,11 @@ class TestMulOp(OpTest):
class TestMulOp2(OpTest):
def setUp(self):
self.op_type = "mul"
self.dtype = np.float32
self.init_dtype_type()
self.inputs = {
'X': np.random.random((3, 4, 4, 3)).astype("float32"),
'Y': np.random.random((2, 6, 1, 2, 3)).astype("float32")
'X': np.random.random((3, 4, 4, 3)).astype(self.dtype),
'Y': np.random.random((2, 6, 1, 2, 3)).astype(self.dtype)
}
self.attrs = {
'x_num_col_dims': 2,
......@@ -60,6 +67,9 @@ class TestMulOp2(OpTest):
result = result.reshape(3, 4, 1, 2, 3)
self.outputs = {'Out': result}
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
......@@ -75,40 +85,76 @@ class TestMulOp2(OpTest):
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
class TestFP16MulOp1(OpTest):
def setUp(self):
self.op_type = "mul"
x = np.random.random((3, 5)).astype("float16")
y = np.random.random((5, 4)).astype("float16")
self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
self.outputs = {'Out': np.dot(x, y)}
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestFP16MulOp1(TestMulOp):
def init_dtype_type(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-1)
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-1)
def test_check_grad_normal(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.5)
class TestFP16MulOp2(OpTest):
def setUp(self):
self.op_type = "mul"
x = np.random.random((3, 4, 4, 3)).astype("float16")
y = np.random.random((2, 6, 1, 2, 3)).astype("float16")
self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
self.attrs = {
'x_num_col_dims': 2,
'y_num_col_dims': 2,
}
result = np.dot(x.reshape(3 * 4, 4 * 3), y.reshape(2 * 6, 1 * 2 * 3))
result = result.reshape(3, 4, 1, 2, 3)
self.outputs = {'Out': result}
def test_check_grad_ingore_x(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['Y'],
'Out',
max_relative_error=0.5,
no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X'],
'Out',
max_relative_error=0.5,
no_grad_set=set('Y'))
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestFP16MulOp2(TestMulOp2):
def init_dtype_type(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-1)
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-1)
def test_check_grad_normal(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.9)
def test_check_grad_ingore_x(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['Y'],
'Out',
max_relative_error=0.5,
no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X'],
'Out',
max_relative_error=0.9,
no_grad_set=set('Y'))
if __name__ == "__main__":
......
......@@ -16,6 +16,7 @@ from __future__ import print_function
import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid
import paddle.fluid.core as core
import unittest
import paddle
import numpy as np
......@@ -177,32 +178,36 @@ class TestCRFModel(unittest.TestCase):
def test_update_sparse_parameter_all_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=True)
if core.is_compiled_with_cuda():
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=True)
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=False)
def test_update_dense_parameter_all_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=True)
if core.is_compiled_with_cuda():
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=True)
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=False)
def test_update_sparse_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=True)
if core.is_compiled_with_cuda():
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=True)
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=False)
def test_update_dense_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=True)
if core.is_compiled_with_cuda():
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=True)
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=False)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import unittest
import logging
import six
class TestBase(unittest.TestCase):
def main(self,
network_func,
iter=100,
iter_per_pe=100,
use_gpu=True,
use_experimental_executor=False):
if use_gpu and not fluid.core.is_compiled_with_cuda():
logging.warning(
"Paddle is not compiled with CUDA, skip GPU unittests")
return
main_prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.Scope()
with fluid.program_guard(main_prog, startup_prog):
with fluid.scope_guard(scope):
loss = network_func()
fluid.Executor(
fluid.CUDAPlace(0)
if use_gpu else fluid.CPUPlace()).run(startup_prog)
for _ in six.moves.xrange(iter):
exe_strategy = fluid.ExecutionStrategy()
exe_strategy._dry_run = True
exe_strategy.use_experimental_executor = use_experimental_executor
pe = fluid.ParallelExecutor(
use_cuda=True,
loss_name=loss.name,
main_program=main_prog,
exec_strategy=exe_strategy)
for _ in six.moves.xrange(iter_per_pe):
pe.run([])
class TestMNISTDryRun(TestBase):
def test_mnist_dry_run(self):
for use_gpu in (False, True):
for use_experimental_executor in (False, True):
self.main(
network_func=TestMNISTDryRun.network_func,
use_gpu=use_gpu,
use_experimental_executor=use_experimental_executor)
@staticmethod
def network_func():
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in six.moves.xrange(10):
hidden = fluid.layers.fc(input=img, size=200, act='tanh')
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
fluid.optimizer.Adam().minimize(avg_loss)
return avg_loss
if __name__ == '__main__':
unittest.main()
......@@ -14,30 +14,18 @@
from __future__ import print_function
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest
import os
MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
import numpy as np
import paddle.fluid.core as core
import os
import paddle.fluid as fluid
from parallel_executor_test_base import TestParallelExecutorBase
def simple_fc_net(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
......@@ -53,17 +41,8 @@ def simple_fc_net(use_feed):
def fc_with_batchnorm(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(1):
......@@ -88,19 +67,6 @@ class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=4)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file(
MNIST_RECORDIO_FILE, reader, feeder)
def _init_data(self):
np.random.seed(5)
......@@ -111,10 +77,6 @@ class TestMNIST(TestParallelExecutorBase):
def _compare_reduce_and_allreduce(self, model, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(
model, use_cuda=use_cuda, use_reduce=True)
self.check_network_convergence(
model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True)
img, label = self._init_data()
......@@ -140,9 +102,6 @@ class TestMNIST(TestParallelExecutorBase):
def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
img, label = self._init_data()
......@@ -199,8 +158,6 @@ class TestMNIST(TestParallelExecutorBase):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
img, label = self._init_data()
self.check_network_convergence(
......
......@@ -15,10 +15,10 @@
from __future__ import print_function
import unittest
from test_pool2d_op import TestPool2d_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
from test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
class TestMKLDNNCase1(TestPool2d_Op):
class TestMKLDNNCase1(TestPool2D_Op):
def init_kernel_type(self):
self.use_mkldnn = True
......
......@@ -81,7 +81,7 @@ def avg_pool2D_forward_naive(x,
return out
class TestPool2d_Op(OpTest):
class TestPool2D_Op(OpTest):
def setUp(self):
self.op_type = "pool2d"
self.use_cudnn = False
......@@ -160,7 +160,7 @@ class TestPool2d_Op(OpTest):
self.exclusive = True
class TestCase1(TestPool2d_Op):
class TestCase1(TestPool2D_Op):
def init_test_case(self):
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
......@@ -175,7 +175,7 @@ class TestCase1(TestPool2d_Op):
self.global_pool = False
class TestCase2(TestPool2d_Op):
class TestCase2(TestPool2D_Op):
def init_test_case(self):
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
......@@ -190,7 +190,7 @@ class TestCase2(TestPool2d_Op):
self.global_pool = False
class TestCase3(TestPool2d_Op):
class TestCase3(TestPool2D_Op):
def init_pool_type(self):
self.pool_type = "max"
self.pool2D_forward_naive = max_pool2D_forward_naive
......@@ -208,127 +208,98 @@ class TestCase5(TestCase2):
self.pool2D_forward_naive = max_pool2D_forward_naive
#--------------------test pool2d--------------------
class TestCUDNNCase1(TestPool2d_Op):
def init_kernel_type(self):
self.use_cudnn = True
#--------------------test pool2d cudnn--------------------
class TestFP16CUDNNCase1(TestPool2d_Op):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def create_test_cudnn_class(parent):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNCase(parent):
def init_kernel_type(self):
self.use_cudnn = True
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
cls_name = "{0}_{1}".format(parent.__name__, "CUDNNOp")
TestCUDNNCase.__name__ = cls_name
globals()[cls_name] = TestCUDNNCase
class TestCUDNNCase2(TestCase1):
def init_kernel_type(self):
self.use_cudnn = True
create_test_cudnn_class(TestPool2D_Op)
create_test_cudnn_class(TestCase1)
create_test_cudnn_class(TestCase2)
create_test_cudnn_class(TestCase3)
create_test_cudnn_class(TestCase4)
create_test_cudnn_class(TestCase5)
#--------------------test pool2d cudnn_fp16--------------------
class TestFP16CUDNNCase2(TestCase1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
def create_test_cudnn_fp16_class(parent, check_grad=True):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNFp16Case(parent):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase3(TestCase2):
def init_kernel_type(self):
self.use_cudnn = True
class TestFP16CUDNNCase3(TestCase2):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
def test_check_grad(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
if core.is_float16_supported(
place) and self.pool_type != "max" and check_grad:
self.check_grad_with_place(
place, set(['X']), 'Out', max_relative_error=0.07)
cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16Op")
TestCUDNNFp16Case.__name__ = cls_name
globals()[cls_name] = TestCUDNNFp16Case
class TestCUDNNCase4(TestCase3):
def init_kernel_type(self):
self.use_cudnn = True
create_test_cudnn_fp16_class(TestPool2D_Op)
create_test_cudnn_fp16_class(TestCase1, check_grad=False)
create_test_cudnn_fp16_class(TestCase2)
create_test_cudnn_fp16_class(TestCase3)
create_test_cudnn_fp16_class(TestCase4)
create_test_cudnn_fp16_class(TestCase5)
class TestFP16CUDNNCase4(TestCase3):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
#--------------------test pool2d use ceil mode--------------------
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
def create_test_cudnn_use_ceil_class(parent):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestPool2DUseCeilCase(parent):
def init_kernel_type(self):
self.use_cudnn = True
class TestCUDNNCase5(TestCase4):
def init_kernel_type(self):
self.use_cudnn = True
class TestFP16CUDNNCase5(TestCase4):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase6(TestCase5):
def init_kernel_type(self):
self.use_cudnn = True
def init_ceil_mode(self):
self.ceil_mode = True
class TestFP16CUDNNCase6(TestCase5):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
cls_name = "{0}_{1}".format(parent.__name__, "CUDNNOpCeilMode")
TestPool2DUseCeilCase.__name__ = cls_name
globals()[cls_name] = TestPool2DUseCeilCase
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
create_test_cudnn_use_ceil_class(TestPool2D_Op)
create_test_cudnn_use_ceil_class(TestCase1)
class TestCeilModeCase1(TestCUDNNCase1):
def init_ceil_mode(self):
self.ceil_mode = True
def create_test_use_ceil_class(parent):
class TestPool2DUseCeilCase(parent):
def init_ceil_mode(self):
self.ceil_mode = True
class TestCeilModeCase2(TestCUDNNCase2):
def init_ceil_mode(self):
self.ceil_mode = True
cls_name = "{0}_{1}".format(parent.__name__, "CeilModeCast")
TestPool2DUseCeilCase.__name__ = cls_name
globals()[cls_name] = TestPool2DUseCeilCase
class TestCeilModeCase3(TestCase1):
def init_ceil_mode(self):
self.ceil_mode = True
class TestCeilModeCase4(TestCase2):
def init_ceil_mode(self):
self.ceil_mode = True
create_test_use_ceil_class(TestCase1)
create_test_use_ceil_class(TestCase2)
class TestAvgInclude(TestCase2):
......@@ -336,7 +307,10 @@ class TestAvgInclude(TestCase2):
self.exclusive = False
class TestCUDNNAvgInclude(TestCUDNNCase3):
class TestCUDNNAvgInclude(TestCase2):
def init_kernel_type(self):
self.use_cudnn = True
def init_exclusive(self):
self.exclusive = False
......
......@@ -24,9 +24,16 @@ from paddle.fluid.op import Operator
class TestScaleOp(OpTest):
def setUp(self):
self.op_type = "scale"
self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
self.dtype = np.float32
self.init_dtype_type()
self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)}
self.attrs = {'scale': -2.3}
self.outputs = {'Out': self.inputs['X'] * self.attrs['scale']}
self.outputs = {
'Out': self.inputs['X'] * self.dtype(self.attrs['scale'])
}
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
......@@ -36,9 +43,15 @@ class TestScaleOp(OpTest):
class TestScaleOpSelectedRows(unittest.TestCase):
def init_dtype_type(self):
pass
def check_with_place(self, place, in_name, out_name):
scope = core.Scope()
self.dtype = np.float32
self.init_dtype_type()
# create and initialize Grad Variable
in_height = 10
in_rows = [0, 4, 7]
......@@ -49,7 +62,7 @@ class TestScaleOpSelectedRows(unittest.TestCase):
in_selected_rows.set_height(in_height)
in_selected_rows.set_rows(in_rows)
in_array = np.random.random(
(len(in_rows), in_row_numel)).astype("float32")
(len(in_rows), in_row_numel)).astype(self.dtype)
in_tensor = in_selected_rows.get_tensor()
in_tensor.set(in_array, place)
......@@ -87,5 +100,41 @@ class TestScaleOpSelectedRows(unittest.TestCase):
self.check_with_place(place, 'in', 'in')
# Add FP16 test
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestScaleFp16Op(TestScaleOp):
def init_dtype_type(self):
self.dtype = np.float16
def test_check_output(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=0.002)
def test_check_grad(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ["X"], "Out", max_relative_error=0.05)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows):
def init_dtype_type(self):
self.dtype = np.float16
def test_scale_selected_rows(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_with_place(place, 'in', 'out')
def test_scale_selected_rows_inplace(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_with_place(place, 'in', 'in')
if __name__ == "__main__":
unittest.main()
......@@ -62,12 +62,11 @@ class TestSoftmaxOp(OpTest):
self.check_output()
def test_check_grad(self):
if self.dtype == np.float16:
return
if self.use_cudnn:
if self.use_cudnn or self.dtype == np.float16:
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ["X"], "Out", max_relative_error=0.01)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ["X"], "Out", max_relative_error=0.01)
else:
self.check_grad(["X"], "Out", max_relative_error=0.01)
......@@ -103,10 +102,23 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
# FIXME: If the x_shape is [10, 10], gradient failed.
def test_check_grad(self):
pass
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxFP16Op2(TestSoftmaxFP16Op):
class TestSoftmaxFP16Op2(TestSoftmaxOp):
def init_kernel_type(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
def get_x_shape(self):
return [2, 3, 4, 5]
......
......@@ -24,16 +24,20 @@ from paddle.fluid.op import Operator
class TestSumOp(OpTest):
def setUp(self):
self.op_type = "sum"
self.init_kernel_type()
self.use_mkldnn = False
self.init_kernel_type()
x0 = np.random.random((3, 4)).astype('float32')
x1 = np.random.random((3, 4)).astype('float32')
x2 = np.random.random((3, 4)).astype('float32')
x0 = np.random.random((3, 4)).astype(self.dtype)
x1 = np.random.random((3, 4)).astype(self.dtype)
x2 = np.random.random((3, 4)).astype(self.dtype)
self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
y = x0 + x1 + x2
self.outputs = {'Out': y}
self.attrs = {'use_mkldnn': self.use_mkldnn}
def init_kernel_type(self):
self.dtype = np.float32
def test_check_output(self):
self.check_output()
......@@ -59,8 +63,11 @@ class TestSelectedRowsSumOp(OpTest):
self.check_input_and_optput(core.Scope(), place, inplace, False, False,
False)
def init_kernel_type(self):
self.dtype = np.float32
def _get_array(self, row_num, row_numel):
array = np.ones((row_num, row_numel)).astype("float32")
array = np.ones((row_num, row_numel)).astype(self.dtype)
for i in range(row_num):
array[i] *= i
return array
......@@ -129,5 +136,36 @@ class TestSelectedRowsSumOp(OpTest):
self.check_with_place(place, inplace)
class TestFP16SumOp(TestSumOp):
def init_kernel_type(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
# FIXME: Because of the precision fp16, max_relative_error
# should be 0.15 here.
def test_check_grad(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad(['x0'], 'Out', max_relative_error=0.15)
class TestFP16SelectedRowsSumOp(TestSelectedRowsSumOp):
def init_kernel_type(self):
self.dtype = np.float16
def test_w_is_selected_rows(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
for inplace in [True, False]:
self.check_with_place(place, inplace)
if __name__ == "__main__":
unittest.main()
......@@ -14,7 +14,8 @@ RC = 0
def git_commit():
try:
cmd = ['git', 'rev-parse', 'HEAD']
git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE,
cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
except:
git_commit = 'Unknown'
git_commit = git_commit.decode()
......@@ -44,7 +45,7 @@ def get_patch():
def is_taged():
try:
cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE, cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
git_tag = git_tag.decode()
except:
return False
......@@ -55,8 +56,7 @@ def is_taged():
return False
def write_version_py(filename='paddle/version.py'):
cnt = '''
# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
#
full_version = '%(major)d.%(minor)d.%(patch)s'
major = '%(major)d'
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册