提交 5373fe29 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into refine/im2col

......@@ -46,6 +46,7 @@
| tianbingsz | Tian-Bing Xu |
| tpatejko | Tomasz Patejko |
| typhoonzero | Yi Wu |
| velconia | Qi-Yang Min |
| wanghaoshuang | Hao-Shuang Wang |
| wangyang59 | Yang Wang |
| wangzhen-nlp | Zhen Wang |
......
......@@ -85,8 +85,7 @@ def dist_transpile(trainer_id, args):
trainer_id,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=not args.async_mode,
slice_var_up=not args.no_split_var)
sync_mode=not args.async_mode)
if training_role == "PSERVER":
pserver_program = t.get_pserver_program(current_endpoint)
pserver_startup_program = t.get_startup_program(current_endpoint,
......
......@@ -50,7 +50,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
PATCH_COMMAND git apply ${PADDLE_SOURCE_DIR}/patches/grpc/fix_too_early_destory.patch
PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h
# NOTE(yuyang18):
# Disable -Werror, otherwise the compile will fail in MacOS.
# It seems that we cannot configure that by make command.
......
......@@ -148,18 +148,11 @@ if (WITH_ANAKIN AND WITH_GPU)
list(APPEND inference_deps anakin_inference_lib)
endif()
copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${src_dir}/${module}/paddle_inference_api.h
${src_dir}/${module}/demo_ci
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api*
DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference
)
list(APPEND inference_deps inference_api_lib)
set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
......
......@@ -8,9 +8,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type)
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......@@ -110,7 +110,7 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
......
......@@ -715,6 +715,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
node->Op()->Type(), places_[op_dev_id]));
// TODO(panyx0718): This might not be needed anymore.
if (node->Op()->Type() == "send_barrier") {
ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "send");
} else if (node->Op()->Type() == "recv") {
......
......@@ -24,6 +24,68 @@ namespace paddle {
namespace framework {
namespace ir {
std::vector<std::string> FindDistTrainSendVars(
const std::vector<ir::Node *> &nodes) {
std::vector<std::string> send_vars;
// since parameters are all in block 0,
// it's enough to only scan send ops in block 0
for (auto &node : nodes) {
auto op_vars = node->Op()->InputArgumentNames();
send_vars.reserve(send_vars.size() +
std::distance(op_vars.begin(), op_vars.end()));
send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
}
return send_vars;
}
std::vector<std::string> FindDistTrainRecvVars(
const std::vector<ir::Node *> &nodes) {
std::vector<std::string> recv_vars;
for (auto &node : nodes) {
auto op_vars = node->Op()->OutputArgumentNames();
recv_vars.reserve(recv_vars.size() +
std::distance(op_vars.begin(), op_vars.end()));
recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
}
return recv_vars;
}
bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
const std::vector<std::string> &recv_vars) {
if (send_vars.size() == 0 || recv_vars.size() == 0) {
return false;
}
/**
* Check any of opvars contains `.block` and in sendvars
*/
auto checker = [](const std::vector<std::string> &opvars,
const std::vector<std::string> &rpc_vars) -> bool {
for (auto &var : opvars) {
// a variable name with the suffix `.block` means it's a splited
// variable by (DistributeTranspiler)
// [python/paddle/fluid/transpiler/distribute_transpiler.py]
if (var.find(".block") != std::string::npos &&
std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
return true;
}
}
return false;
};
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (ir::Node *input : node->inputs) {
input_var_names.push_back(input->Name());
}
for (ir::Node *output : node->outputs) {
output_var_names.push_back(output->Name());
}
return checker(output_var_names, send_vars) ||
checker(input_var_names, recv_vars);
}
Graph::Graph(const ProgramDesc &program) : program_(program) {
VLOG(3) << "block in program:" << program_.Size();
std::unordered_map<std::string, VarDesc *> all_vars;
......@@ -61,6 +123,64 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
var->inputs.push_back(node);
}
}
std::vector<ir::Node *> send_ops;
ir::Node *send_bar = nullptr;
std::vector<ir::Node *> recv_ops;
ir::Node *fetch_bar = nullptr;
for (ir::Node *node : Nodes()) {
if (node->Name() == "send") {
send_ops.push_back(node);
} else if (node->Name() == "send_barrier") {
PADDLE_ENFORCE(!send_bar, "only has one send barrier");
send_bar = node;
} else if (node->Name() == "recv") {
recv_ops.push_back(node);
} else if (node->Name() == "fetch_barrier") {
PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier");
fetch_bar = node;
}
}
if (send_bar) {
for (ir::Node *send : send_ops) {
ir::Node *dep_var = CreateControlDepVar();
send->outputs.push_back(dep_var);
dep_var->inputs.push_back(send);
send_bar->inputs.push_back(dep_var);
dep_var->outputs.push_back(send_bar);
}
for (ir::Node *recv : recv_ops) {
ir::Node *dep_var = CreateControlDepVar();
recv->inputs.push_back(dep_var);
dep_var->outputs.push_back(recv);
send_bar->outputs.push_back(dep_var);
dep_var->inputs.push_back(send_bar);
}
}
if (fetch_bar) {
for (ir::Node *recv : recv_ops) {
ir::Node *dep_var = CreateControlDepVar();
recv->outputs.push_back(dep_var);
dep_var->inputs.push_back(recv);
fetch_bar->inputs.push_back(dep_var);
dep_var->outputs.push_back(fetch_bar);
}
}
std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
for (ir::Node *node : Nodes()) {
if (IsDistTrainOp(node, send_vars, recv_vars)) {
if (fetch_bar && node->Name() == "concat") {
ir::Node *dep_var = CreateControlDepVar();
fetch_bar->outputs.push_back(dep_var);
dep_var->inputs.push_back(fetch_bar);
node->inputs.push_back(dep_var);
dep_var->outputs.push_back(node);
}
}
}
/**
* We only handle write after read(WAR), since it should not have a write
* after write in program. If there are write after write operators, we need
......
......@@ -14,8 +14,15 @@ cc_library(paddle_fluid_api
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
# paddle_fluid_origin exclude inference api interface
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
if(NOT APPLE)
add_subdirectory(api)
endif()
# Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
......@@ -24,7 +31,7 @@ endif()
# Create shared library
cc_library(paddle_fluid_shared SHARED
SRCS io.cc
SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
DEPS ${fluid_modules} paddle_fluid_api)
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
......@@ -32,12 +39,21 @@ if(NOT APPLE)
# TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
# check symbol hidden
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
"execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
" ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n"
"if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
" message(FATAL_ERROR \"Check symbol failed.\")\n"
"endif()\n")
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
DEPENDS paddle_fluid_shared)
add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
endif()
if(WITH_TESTING)
# both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book)
endif()
if(NOT APPLE)
add_subdirectory(api)
endif()
......@@ -42,35 +42,8 @@ function(inference_api_test TARGET_NAME)
endif(WITH_TESTING)
endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS api.cc api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
if(NOT APPLE)
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/api.sym")
set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
# Here the shared library doesn't depend on other fluid libraries, or double free will occur.
cc_library(paddle_inference_api_shared SHARED
SRCS api.cc api_impl.cc)
add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor)
if(NOT APPLE)
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/api.map")
set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
"execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
" ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
"if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
" message(FATAL_ERROR \"Check symbol failed.\")\n"
"endif()\n")
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
DEPENDS paddle_inference_api_shared)
add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
endif()
cc_test(test_paddle_inference_api
SRCS api_tester.cc
......
......@@ -55,11 +55,9 @@ endif()
# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
if(WITH_STATIC_LIB)
set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.a
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
else()
set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.so
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
endif()
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
......
......@@ -3,8 +3,8 @@
lib=$1
if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l)
num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l)
num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l)
num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep T | wc -l)
if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
......
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc
DEPS tensorrt_engine mul_op)
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
......@@ -13,3 +13,6 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
......@@ -49,5 +49,4 @@ class MulOpConverter : public OpConverter {
} // namespace inference
} // namespace paddle
USE_OP(mul);
REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* Pool2dOp, IPoolingLayer in TRT. This Layer doesn't has weights.
*/
class Pool2dOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4)
<< "convert a fluid pool2d op to tensorrt pool2d layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
std::string pool_type =
boost::get<std::string>(op_desc.GetAttr("pooling_type"));
std::vector<int> ksize =
boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
const nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX;
if (pool_type == "max") {
nv_pool_type = nvinfer1::PoolingType::kMAX;
} else if (pool_type == "avg") {
nv_pool_type = nvinfer1::PoolingType::kAVERAGE;
} else {
PADDLE_THROW("TensorRT unsupported pooling type!");
}
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
*const_cast<nvinfer1::ITensor*>(input1),
nv_pool_type, nv_ksize);
PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created.");
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(pool2d);
REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter);
......@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) {
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
validator.Execute(5);
}
} // namespace tensorrt
......
......@@ -24,9 +24,8 @@ TEST(fc_op, test) {
std::unordered_set<std::string> parameters({"mul-Y"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1));
validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1));
validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2));
// Prepare Op description
......@@ -38,7 +37,7 @@ TEST(fc_op, test) {
validator.SetOp(*desc.Proto());
validator.Execute(1);
validator.Execute(10);
}
} // namespace tensorrt
......
......@@ -23,7 +23,7 @@ namespace tensorrt {
TEST(MulOpConverter, main) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(10, parameters, scope, 1000);
TRTConvertValidation validator(10, parameters, scope, 1000, false);
validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
......@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) {
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
validator.Execute(2);
}
} // namespace tensorrt
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <fstream>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(Pool2dOpConverter, main) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(5, parameters, scope, 1 << 15);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4));
validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("pool2d");
desc.SetInput("X", {"pool2d-X"});
desc.SetOutput("Out", {"pool2d-Out"});
std::vector<int> ksize({2, 2});
std::vector<int> strides({2, 2});
std::vector<int> paddings({0, 0});
std::string pooling_t = "max";
desc.SetAttr("pooling_type", pooling_t);
desc.SetAttr("ksize", ksize);
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(3);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(pool2d);
......@@ -63,13 +63,16 @@ class TRTConvertValidation {
public:
TRTConvertValidation() = delete;
TRTConvertValidation(int batch_size,
TRTConvertValidation(int max_batch_size,
const std::unordered_set<std::string>& parameters,
framework::Scope& scope, // NOLINT
int workspace_size = 1 << 10)
: parameters_(parameters), scope_(scope) {
int workspace_size = 1 << 10, bool if_add_batch = true)
: parameters_(parameters),
scope_(scope),
if_add_batch_(if_add_batch),
max_batch_size_(max_batch_size) {
// create engine.
engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_));
engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_));
engine_->InitNetwork();
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
......@@ -84,7 +87,7 @@ class TRTConvertValidation {
// Declare a parameter varaible in the scope.
void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims);
DeclVar(name, dims, true);
}
void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
......@@ -92,12 +95,18 @@ class TRTConvertValidation {
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// Init Fluid tensor.
std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
// There is no batchsize in ITensor's shape, but We should add it to
// tensor's shape of fluid. If the variable is not parameter and the
// if_add_batch_ flag is true, add the max batchsize to dim_vec.
if (is_param != true && if_add_batch_ == true)
dim_vec.insert(dim_vec.begin(), max_batch_size_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
......@@ -131,6 +140,7 @@ class TRTConvertValidation {
void Execute(int batch_size) {
// Execute Fluid Op
PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
op_->Run(scope_, place);
......@@ -149,9 +159,15 @@ class TRTConvertValidation {
auto* var = scope_.FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &fluid_out);
size_t fluid_out_size = fluid_out.size();
if (if_add_batch_ == true) {
fluid_out_size =
batch_size * (framework::product(tensor->dims()) / max_batch_size_);
}
// Compare two output
ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out.size(); i++) {
for (size_t i = 0; i < fluid_out_size; i++) {
// Loose the threshold for CI in different machine model.
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
}
......@@ -167,6 +183,12 @@ class TRTConvertValidation {
std::unique_ptr<framework::OpDesc> op_desc_;
const std::unordered_set<std::string>& parameters_;
framework::Scope& scope_;
// The ITensor of trt does not cotain the batch size,
// bug, in most cases, we need to set batch size for
// fluid's tensor shape. This variable indicates
// whether to add batch size to tensor shape of fluid.
bool if_add_batch_;
int max_batch_size_;
};
} // namespace tensorrt
......
......@@ -113,7 +113,7 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
ASSERT_EQ(y_cpu[1], 14.5);
}
TEST_F(TensorRTEngineTest, test_conv2d_temp) {
TEST_F(TensorRTEngineTest, test_conv2d) {
// Weight in CPU memory.
float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
float raw_bias[1] = {0};
......@@ -146,6 +146,37 @@ TEST_F(TensorRTEngineTest, test_conv2d_temp) {
ASSERT_EQ(y_cpu[1], 6.0);
}
TEST_F(TensorRTEngineTest, test_pool2d) {
// Weight in CPU memory.
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3{1, 2, 2});
nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
auto* pool_layer =
TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast<nvinfer1::ITensor*>(x),
pool_t, nvinfer1::DimsHW{2, 2});
PADDLE_ENFORCE(pool_layer != nullptr);
pool_layer->setStride(nvinfer1::DimsHW{1, 1});
pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
engine_->DeclareOutput(pool_layer, 0, "y");
engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
8 * sizeof(float));
engine_->Execute(2);
LOG(INFO) << "to get output";
float* y_cpu = new float[2];
engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
ASSERT_EQ(y_cpu[0], 2.0);
ASSERT_EQ(y_cpu[1], 5.0);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
......@@ -17,7 +17,7 @@ function(inference_test TARGET_NAME)
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(test_inference_${TARGET_NAME}${arg}
SRCS test_inference_${TARGET_NAME}.cc
DEPS paddle_fluid
DEPS paddle_fluid_origin
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
set_tests_properties(test_inference_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME})
......@@ -43,6 +43,6 @@ inference_test(word2vec)
# TODO(TJ): clean me up
cc_test(test_inference_nlp
SRCS test_inference_nlp.cc
DEPS paddle_fluid
DEPS paddle_fluid_origin
ARGS
--model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
......@@ -20,9 +20,6 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#ifdef PADDLE_WITH_MKLML
#include <omp.h>
#endif
DEFINE_string(model_path, "", "Directory of the inference model.");
DEFINE_string(data_file, "", "File of input index data.");
......@@ -30,6 +27,7 @@ DEFINE_int32(repeat, 100, "Running the inference program repeat times");
DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
DEFINE_int32(num_threads, 1, "Number of threads should be used");
DECLARE_bool(use_mkldnn);
DECLARE_int32(paddle_num_threads);
inline double GetCurrentMs() {
struct timeval time;
......@@ -160,12 +158,7 @@ TEST(inference, nlp) {
std::unique_ptr<paddle::framework::Scope> scope(
new paddle::framework::Scope());
#ifdef PADDLE_WITH_MKLML
// only use 1 thread number per std::thread
omp_set_dynamic(0);
omp_set_num_threads(1);
paddle::platform::SetNumThreads(1);
#endif
paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
double start_ms = 0, stop_ms = 0;
if (FLAGS_num_threads > 1) {
......
......@@ -270,6 +270,9 @@ op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
op_library(unsqueeze_op DEPS reshape_op)
op_library(squeeze_op DEPS reshape_op)
op_library(extract_rows_op DEPS memory)
op_library(flatten_op DEPS reshape_op)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
......
......@@ -17,9 +17,9 @@ if(WITH_GRPC)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(grpc_serde_test SRCS grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL)
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
return()
endif()
......
......@@ -49,6 +49,7 @@ void GRPCClient::SendComplete() {
}
GRPCClient::~GRPCClient() {
stopped_ = true;
Wait();
cq_.Shutdown();
{
......@@ -275,7 +276,7 @@ void GRPCClient::Proceed() {
void* tag = nullptr;
bool ok = false;
while (cq_.Next(&tag, &ok)) {
while (!stopped_ && cq_.Next(&tag, &ok)) {
BaseProcessor* c = static_cast<BaseProcessor*>(tag);
GPR_ASSERT(ok);
PADDLE_ENFORCE(c);
......
......@@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
class GRPCClient : public RPCClient {
public:
GRPCClient() : ok_(true), completed_(false) {}
GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
virtual ~GRPCClient();
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
......@@ -237,6 +237,8 @@ class GRPCClient : public RPCClient {
// mutex for sending complete message only once
std::mutex completed_mutex_;
bool completed_;
volatile bool stopped_;
};
} // namespace distributed
......
......@@ -30,7 +30,7 @@ namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace distributed = paddle::operators::distributed;
USE_OP(lookup_table);
USE_NO_KERNEL_OP(lookup_sparse_table);
std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<distributed::RequestHandler> g_req_handler;
......@@ -42,13 +42,13 @@ framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
framework::VariableNameMap output({{"Output", {"out"}}});
auto op = block->AppendOp();
op->SetType("lookup_table");
op->SetType("lookup_sparse_table");
op->SetInput("W", {"w"});
op->SetInput("Ids", {"ids"});
op->SetOutput("Out", {"out"});
auto& out = *root_block->Var("out");
out.SetType(framework::proto::VarType::SELECTED_ROWS);
out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetShape({10, 10});
return block;
......@@ -59,20 +59,19 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
w_var->GetMutable<framework::SelectedRows>();
auto out_var = scope->Var("out");
out_var->GetMutable<framework::SelectedRows>();
out_var->GetMutable<framework::LoDTensor>();
auto ids_var = scope->Var("ids");
ids_var->GetMutable<framework::SelectedRows>();
ids_var->GetMutable<framework::LoDTensor>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto ids_var = scope->Var("ids")->GetMutable<framework::SelectedRows>();
auto rows = ids_var->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2);
ids_var->mutable_value()->Resize({rows_numel, 1});
ids_var->mutable_value()->mutable_data<float>(*place);
auto ids_var = scope->Var("ids")->GetMutable<framework::LoDTensor>();
int64_t* ids_ptr =
ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
}
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
......@@ -148,11 +147,11 @@ TEST(PREFETCH, CPU) {
client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name);
client->Wait();
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::SelectedRows>()->value();
auto ptr = value.mutable_data<float>(place);
auto value = var->GetMutable<framework::LoDTensor>();
auto ptr = value->mutable_data<float>(place);
for (int64_t i = 0; i < rows_numel; ++i) {
EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast<float>(i * 2));
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class ExtractRowsOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ExtractRowsOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ExtractRowsOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X")[0],
framework::proto::VarType::SELECTED_ROWS,
"The type of input(X) must be SelectedRows.");
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>{in_dims[0], 1}));
}
};
class ExtractRowsOp : public framework::OperatorBase {
public:
ExtractRowsOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto in_rows = in.rows();
auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
if (paddle::platform::is_gpu_place(in.place())) {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx = pool.Get(in.place());
auto src_ptr = in_rows.Data(in.place());
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(*dev_ctx)
.stream();
memory::Copy(boost::get<platform::CUDAPlace>(out->place()), dst_ptr,
boost::get<platform::CUDAPlace>(in.place()), src_ptr,
in_rows.size() * sizeof(int64_t), stream);
#else
PADDLE_THROW("Not compiled with CUDA.");
#endif
} else {
memory::Copy(platform::CPUPlace(), dst_ptr, platform::CPUPlace(),
in_rows.data(), in_rows.size() * sizeof(int64_t));
}
}
};
class ExtractRowsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(SelectedRows). The input tensor of extract_rows operator,"
" and its type is SelectedRows.");
AddOutput("Out", "(Tensor). The the rows of input(X).");
AddComment(R"DOC(
ExtractRows Operator.
The function of extract_rows_op is extracting the rows from the input(X)
whose type is SelectedRows.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(extract_rows, ops::ExtractRowsOp, ops::ExtractRowsOpMaker,
ops::ExtractRowsOpInferShape);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class FlattenOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input (X) of Flatten op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output (Output) of Flatten op should not be null.");
const auto &axis = ctx->Attrs().Get<int>("axis");
const auto &in_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(axis >= 0, "The axis should be greater than or equal to 0.");
PADDLE_ENFORCE(
axis <= in_dims.size(),
"The axis should be less than or equal to input tensor's rank.");
const auto &out_dims = GetOutputShape(axis, in_dims);
ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
if (in_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
ctx->ShareLoD("X", "Out");
}
}
static std::vector<int32_t> GetOutputShape(const int axis,
const framework::DDim &in_dims) {
int64_t outer = 1, inner = 1;
for (int i = 0; i < in_dims.size(); ++i) {
if (i < axis) {
outer *= in_dims[i];
} else {
inner *= in_dims[i];
}
}
std::vector<int32_t> out_shape(2);
out_shape[0] = outer;
out_shape[1] = inner;
return out_shape;
}
};
class FlattenOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axis = Attr<int>("axis");
auto in_dims =
scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
framework::AttributeMap attrs;
attrs["shape"] = out_dims;
attrs["inplace"] = false;
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class FlattenOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) A tensor of rank >= axis.");
AddOutput("Out",
"A 2D tensor is reshaped input tensor. The input dimensions"
"up to axis are flattened to the outer dimension of the output"
"and the remaining input dimensions are flattened into the inner"
"dimension of the output.");
AddAttr<int>("axis",
"(int)"
"Indicate up to which input dimensions (exclusive) should be"
"flattened to the outer dimension of the output. The value"
"for axis must be in the range [0, R], where R is the rank of"
"the input tensor. When axis = 0, the shape of the output"
"tensor is (1, (d_0 X d_1 ... d_n), where the shape of the"
"input tensor is (d_0, d_1, ... d_n).")
.SetDefault(1);
AddComment(R"DOC(
Flatten Operator
Flattens the input tensor into a 2D matrix.
Examples:
Case 1:
Given
X.shape = (3, 100, 100, 4)
and
axis = 2
We get:
Out.shape = (3 * 100, 4 * 100)
Case 2:
Given
X.shape = (3, 100, 100, 4)
and
axis = 0
We get:
Out.shape = (1, 3 * 100 * 100 * 4)
)DOC");
}
};
class FlattenGradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
context->SetOutputDim(framework::GradVarName("X"),
context->GetInputDim("X"));
context->ShareLoD("X", framework::GradVarName("X"));
}
};
class FlattenGradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto in_dims =
scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(in_dims);
attrs["inplace"] = false;
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
USE_OP(reshape);
namespace ops = paddle::operators;
REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
ops::FlattenOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
......@@ -33,19 +33,15 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
auto ids_var_type = ctx->GetInputsVarType("Ids").front();
// The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
// is LoDTensor, this tensor contains the ids to be looked up in W
// and it must be a column vector with rank = 2 while the 2nd dimension
// size must be 1, when Ids's type is SelectedRows, the rows of Ids
// contains the ids to be looked up in W;
if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
}
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
ctx->ShareLoD("Ids", /*->*/ "Out");
if (ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("Ids", /*->*/ "Out");
}
}
protected:
......@@ -62,17 +58,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("W",
"(Tensor) The input represents embedding tensors, "
"which is a learnable parameter.");
AddInput(
"Ids",
"(Tensor or SelectedRows) Ids's type can be Tensor or "
"SelectedRows, when Ids's type is Tensor, this tensor contains "
"the ids to be looked up in W and it must be a column vector with "
"rank = 2 while the 2nd dimension size must be 1; when Ids's type is "
"SelectedRows, the rows of Ids contains the ids to be looked up "
"in W.");
AddOutput("Out",
"(Tensor or SelectedRows) The lookup results, which have the "
"same type as W.");
AddInput("Ids",
"An input with type int32 or int64 "
"contains the ids to be looked up in W. "
"Ids must be a column vector with rank = 2. "
"The 2nd dimension size must be 1.");
AddOutput("Out", "The lookup results, which have the same type as W.");
AddAttr<bool>("is_sparse",
"(boolean, default false) "
"Sparse update.")
......@@ -90,15 +81,10 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
Lookup Table Operator.
This operator is used to perform lookups on the parameter W,
then concatenated into a dense or sparse tensor.
The type of Ids(Input) is SelectedRows, Tensor or LoDTensor, when Ids's
type is SelectedRows, the rows of Ids contains the ids to be looked up in W;
when Ids's type is Tensor, this tensor contains the ids to be looked up in W
and it must be a column vector with rank = 2 while the 2nd dimension size must be 1,
at this time, Ids can carry the LoD (Level of Details) information, or not, and
the output only shares the LoD information with input Ids.
then concatenated into a dense tensor.
The input Ids can carry the LoD (Level of Details) information,
or not. And the output only shares the LoD information with input Ids.
)DOC");
}
......
......@@ -23,7 +23,7 @@ namespace operators {
template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
bool PaddingFlag>
__global__ void LookupTable(T* output, const T* table, const int64_t* ids,
__global__ void LookupTable(T *output, const T *table, const int64_t *ids,
const int64_t N, const int64_t K, const int64_t D,
const int64_t padding_idx) {
int idx = threadIdx.x;
......@@ -33,8 +33,8 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
int64_t id = ids[idy];
PADDLE_ASSERT(id >= 0);
PADDLE_ASSERT(id < N);
T* out = output + idy * D;
const T* tab = table + id * D;
T *out = output + idy * D;
const T *tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) {
if (PaddingFlag) {
if (id == padding_idx)
......@@ -50,7 +50,7 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
}
template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
__global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
const int64_t N, const int64_t K,
const int64_t D) {
int idx = threadIdx.x;
......@@ -60,8 +60,8 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
int id = ids[idy];
PADDLE_ASSERT(id >= 0);
PADDLE_ASSERT(id < N);
const T* out = output + idy * D;
T* tab = table + id * D;
const T *out = output + idy * D;
T *tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) {
paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
}
......@@ -72,36 +72,19 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
template <typename T>
class LookupTableCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* table_t = context.Input<LoDTensor>("W");
void Compute(const framework::ExecutionContext &context) const override {
auto *table_t = context.Input<LoDTensor>("W");
auto *ids_t = context.Input<LoDTensor>("Ids");
auto *output_t = context.Output<LoDTensor>("Out");
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
auto* ids_var = context.InputVar("Ids");
Tensor* output_t = context.Output<Tensor>("Out");
int64_t* ids;
int64_t K;
// The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
// is LoDTensor, this tensor contains the ids to be looked up in W;
// when Ids's type is SelectedRows, the rows of Ids contains the
// ids to be looked up in W.
if (ids_var->IsType<framework::LoDTensor>()) {
auto* ids_t = context.Input<LoDTensor>("Ids");
ids = const_cast<int64_t*>(ids_t->data<int64_t>());
K = ids_t->numel();
} else if (ids_var->IsType<framework::SelectedRows>()) {
auto* ids_t = context.Input<framework::SelectedRows>("Ids");
ids = const_cast<int64_t*>(ids_t->rows().CUDAData(context.GetPlace()));
K = ids_t->rows().size();
output_t->Resize({K, table_t->dims()[1]});
} else {
PADDLE_THROW("Unsupported Variable Type of Ids");
}
size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1];
auto* table = table_t->data<T>();
auto* output = output_t->mutable_data<T>(context.GetPlace());
size_t K = ids_t->numel();
auto *ids = ids_t->data<int64_t>();
auto *table = table_t->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
dim3 threads(128, 8);
dim3 grids(8, 1);
......@@ -122,19 +105,19 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
template <typename T>
class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto& dev_ctx =
void Compute(const framework::ExecutionContext &context) const override {
auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
bool is_sparse = context.Attr<bool>("is_sparse");
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
if (is_sparse) {
auto* ids = context.Input<LoDTensor>("Ids");
auto* table = context.Input<LoDTensor>("W");
auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids = context.Input<LoDTensor>("Ids");
auto *table = context.Input<LoDTensor>("W");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto* ids_data = ids->data<int64_t>();
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
auto stream = dev_ctx.stream();
......@@ -150,12 +133,12 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
d_table->set_rows(new_rows);
auto* d_table_value = d_table->mutable_value();
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_dim[0], table->dims()[1]});
d_table_value->mutable_data<T>(context.GetPlace());
auto* d_table_data = d_table_value->data<T>();
auto* d_output_data = d_output->data<T>();
auto *d_table_data = d_table_value->data<T>();
auto *d_output_data = d_output->data<T>();
PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
d_output->numel() * sizeof(T), stream);
......@@ -168,9 +151,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
int N = d_table_t->dims()[0];
int D = d_table_t->dims()[1];
int K = ids_t->numel();
const int64_t* ids = ids_t->data<int64_t>();
const T* d_output = d_output_t->data<T>();
T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
const int64_t *ids = ids_t->data<int64_t>();
const T *d_output = d_output_t->data<T>();
T *d_table = d_table_t->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_table_t);
t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
......
......@@ -36,43 +36,13 @@ template <typename T>
class LookupTableKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *ids_t = context.Input<LoDTensor>("Ids"); // int tensor
auto *output_t = context.Output<LoDTensor>("Out"); // float tensor
auto *table_var = context.InputVar("W");
auto *ids_var = context.InputVar("Ids");
Tensor *output_t = context.Output<Tensor>("Out");
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
DDim table_dim;
if (table_var->IsType<LoDTensor>()) {
table_dim = context.Input<LoDTensor>("W")->dims();
} else if (table_var->IsType<SelectedRows>()) {
auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims();
} else {
PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
}
int64_t *ids;
int64_t ids_numel;
// The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
// is LoDTensor, this tensor contains the ids to be looked up in W;
// when Ids's type is SelectedRows, the rows of Ids contains the
// ids to be looked up in W.
if (ids_var->IsType<LoDTensor>()) {
auto *ids_t = context.Input<LoDTensor>("Ids");
ids = const_cast<int64_t *>(ids_t->data<int64_t>());
ids_numel = ids_t->numel();
} else if (ids_var->IsType<SelectedRows>()) {
auto *ids_t = context.Input<SelectedRows>("Ids");
ids = const_cast<int64_t *>(ids_t->rows().data());
ids_numel = ids_t->rows().size();
output_t->Resize({ids_numel, table_dim[1]});
} else {
PADDLE_THROW("Unsupported Variable Type of Ids");
}
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
int64_t ids_numel = ids_t->numel();
if (table_var->IsType<LoDTensor>()) {
auto *table_t = context.Input<LoDTensor>("W");
......
......@@ -163,7 +163,4 @@ REGISTER_OP_CPU_KERNEL(
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
// A trick to compile with the needed TensorRT op converter.
USE_TRT_CONVERTER(mul)
#endif // PADDLE_WITH_CUDA
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MKLML
#include <omp.h>
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
......@@ -33,6 +34,7 @@ void SetNumThreads(int num_threads) {
#elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1;
platform::dynload::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(num_threads);
#else
PADDLE_ENFORCE(false, "To be implemented.");
#endif
......
......@@ -23,6 +23,9 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/piece.h"
DEFINE_int32(paddle_num_threads, 1,
"Number of threads for each paddle instance.");
namespace paddle {
namespace framework {
......@@ -115,7 +118,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
places.emplace_back(platform::CPUPlace());
platform::DeviceContextPool::Init(places);
#ifndef PADDLE_WITH_MKLDNN
platform::SetNumThreads(1);
platform::SetNumThreads(FLAGS_paddle_num_threads);
#endif
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// A completion queue implements a concurrent producer-consumer queue, with
/// two main API-exposed methods: \a Next and \a AsyncNext. These
/// methods are the essential component of the gRPC C++ asynchronous API.
/// There is also a \a Shutdown method to indicate that a given completion queue
/// will no longer have regular events. This must be called before the
/// completion queue is destroyed.
/// All completion queue APIs are thread-safe and may be used concurrently with
/// any other completion queue API invocation; it is acceptable to have
/// multiple threads calling \a Next or \a AsyncNext on the same or different
/// completion queues, or to call these methods concurrently with a \a Shutdown
/// elsewhere.
/// \remark{All other API calls on completion queue should be completed before
/// a completion queue destructor is called.}
#ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
#define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
#include <typeinfo>
#include <grpc/impl/codegen/atm.h>
#include <grpcpp/impl/codegen/completion_queue_tag.h>
#include <grpcpp/impl/codegen/core_codegen_interface.h>
#include <grpcpp/impl/codegen/grpc_library.h>
#include <grpcpp/impl/codegen/status.h>
#include <grpcpp/impl/codegen/time.h>
struct grpc_completion_queue;
namespace grpc {
template <class R>
class ClientReader;
template <class W>
class ClientWriter;
template <class W, class R>
class ClientReaderWriter;
template <class R>
class ServerReader;
template <class W>
class ServerWriter;
namespace internal {
template <class W, class R>
class ServerReaderWriterBody;
} // namespace internal
class Channel;
class ChannelInterface;
class ClientContext;
class CompletionQueue;
class Server;
class ServerBuilder;
class ServerContext;
class ServerInterface;
namespace internal {
class CompletionQueueTag;
class RpcMethod;
template <class ServiceType, class RequestType, class ResponseType>
class RpcMethodHandler;
template <class ServiceType, class RequestType, class ResponseType>
class ClientStreamingHandler;
template <class ServiceType, class RequestType, class ResponseType>
class ServerStreamingHandler;
template <class ServiceType, class RequestType, class ResponseType>
class BidiStreamingHandler;
class UnknownMethodHandler;
template <class Streamer, bool WriteNeeded>
class TemplatedBidiStreamingHandler;
template <class InputMessage, class OutputMessage>
class BlockingUnaryCallImpl;
} // namespace internal
extern CoreCodegenInterface* g_core_codegen_interface;
/// A thin wrapper around \ref grpc_completion_queue (see \ref
/// src/core/lib/surface/completion_queue.h).
/// See \ref doc/cpp/perf_notes.md for notes on best practices for high
/// performance servers.
class CompletionQueue : private GrpcLibraryCodegen {
public:
/// Default constructor. Implicitly creates a \a grpc_completion_queue
/// instance.
CompletionQueue()
: CompletionQueue(grpc_completion_queue_attributes{
GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, GRPC_CQ_DEFAULT_POLLING}) {}
/// Wrap \a take, taking ownership of the instance.
///
/// \param take The completion queue instance to wrap. Ownership is taken.
explicit CompletionQueue(grpc_completion_queue* take);
/// Destructor. Destroys the owned wrapped completion queue / instance.
~CompletionQueue() {
if (typeid(*g_core_codegen_interface).hash_code() !=
typeid(CoreCodegenInterface).hash_code()) {
g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
}
}
/// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT.
enum NextStatus {
SHUTDOWN, ///< The completion queue has been shutdown and fully-drained
GOT_EVENT, ///< Got a new event; \a tag will be filled in with its
///< associated value; \a ok indicating its success.
TIMEOUT ///< deadline was reached.
};
/// Read from the queue, blocking until an event is available or the queue is
/// shutting down.
///
/// \param tag[out] Updated to point to the read event's tag.
/// \param ok[out] true if read a successful event, false otherwise.
///
/// Note that each tag sent to the completion queue (through RPC operations
/// or alarms) will be delivered out of the completion queue by a call to
/// Next (or a related method), regardless of whether the operation succeeded
/// or not. Success here means that this operation completed in the normal
/// valid manner.
///
/// Server-side RPC request: \a ok indicates that the RPC has indeed
/// been started. If it is false, the server has been Shutdown
/// before this particular call got matched to an incoming RPC.
///
/// Client-side StartCall/RPC invocation: \a ok indicates that the RPC is
/// going to go to the wire. If it is false, it not going to the wire. This
/// would happen if the channel is either permanently broken or
/// transiently broken but with the fail-fast option. (Note that async unary
/// RPCs don't post a CQ tag at this point, nor do client-streaming
/// or bidi-streaming RPCs that have the initial metadata corked option set.)
///
/// Client-side Write, Client-side WritesDone, Server-side Write,
/// Server-side Finish, Server-side SendInitialMetadata (which is
/// typically included in Write or Finish when not done explicitly):
/// \a ok means that the data/metadata/status/etc is going to go to the
/// wire. If it is false, it not going to the wire because the call
/// is already dead (i.e., canceled, deadline expired, other side
/// dropped the channel, etc).
///
/// Client-side Read, Server-side Read, Client-side
/// RecvInitialMetadata (which is typically included in Read if not
/// done explicitly): \a ok indicates whether there is a valid message
/// that got read. If not, you know that there are certainly no more
/// messages that can ever be read from this stream. For the client-side
/// operations, this only happens because the call is dead. For the
/// server-sider operation, though, this could happen because the client
/// has done a WritesDone already.
///
/// Client-side Finish: \a ok should always be true
///
/// Server-side AsyncNotifyWhenDone: \a ok should always be true
///
/// Alarm: \a ok is true if it expired, false if it was canceled
///
/// \return true if got an event, false if the queue is fully drained and
/// shut down.
bool Next(void** tag, bool* ok) {
return (AsyncNextInternal(tag,
ok,
g_core_codegen_interface->gpr_inf_future(
GPR_CLOCK_REALTIME)) != SHUTDOWN);
}
/// Read from the queue, blocking up to \a deadline (or the queue's shutdown).
/// Both \a tag and \a ok are updated upon success (if an event is available
/// within the \a deadline). A \a tag points to an arbitrary location usually
/// employed to uniquely identify an event.
///
/// \param tag[out] Upon sucess, updated to point to the event's tag.
/// \param ok[out] Upon sucess, true if a successful event, false otherwise
/// See documentation for CompletionQueue::Next for explanation of ok
/// \param deadline[in] How long to block in wait for an event.
///
/// \return The type of event read.
template <typename T>
NextStatus AsyncNext(void** tag, bool* ok, const T& deadline) {
TimePoint<T> deadline_tp(deadline);
return AsyncNextInternal(tag, ok, deadline_tp.raw_time());
}
/// EXPERIMENTAL
/// First executes \a F, then reads from the queue, blocking up to
/// \a deadline (or the queue's shutdown).
/// Both \a tag and \a ok are updated upon success (if an event is available
/// within the \a deadline). A \a tag points to an arbitrary location usually
/// employed to uniquely identify an event.
///
/// \param F[in] Function to execute before calling AsyncNext on this queue.
/// \param tag[out] Upon sucess, updated to point to the event's tag.
/// \param ok[out] Upon sucess, true if read a regular event, false otherwise.
/// \param deadline[in] How long to block in wait for an event.
///
/// \return The type of event read.
template <typename T, typename F>
NextStatus DoThenAsyncNext(F&& f, void** tag, bool* ok, const T& deadline) {
CompletionQueueTLSCache cache = CompletionQueueTLSCache(this);
f();
if (cache.Flush(tag, ok)) {
return GOT_EVENT;
} else {
return AsyncNext(tag, ok, deadline);
}
}
/// Request the shutdown of the queue.
///
/// \warning This method must be called at some point if this completion queue
/// is accessed with Next or AsyncNext. \a Next will not return false
/// until this method has been called and all pending tags have been drained.
/// (Likewise for \a AsyncNext returning \a NextStatus::SHUTDOWN .)
/// Only once either one of these methods does that (that is, once the queue
/// has been \em drained) can an instance of this class be destroyed.
/// Also note that applications must ensure that no work is enqueued on this
/// completion queue after this method is called.
void Shutdown();
/// Returns a \em raw pointer to the underlying \a grpc_completion_queue
/// instance.
///
/// \warning Remember that the returned instance is owned. No transfer of
/// owership is performed.
grpc_completion_queue* cq() { return cq_; }
protected:
/// Private constructor of CompletionQueue only visible to friend classes
CompletionQueue(const grpc_completion_queue_attributes& attributes) {
cq_ = g_core_codegen_interface->grpc_completion_queue_create(
g_core_codegen_interface->grpc_completion_queue_factory_lookup(
&attributes),
&attributes,
NULL);
InitialAvalanching(); // reserve this for the future shutdown
}
private:
// Friend synchronous wrappers so that they can access Pluck(), which is
// a semi-private API geared towards the synchronous implementation.
template <class R>
friend class ::grpc::ClientReader;
template <class W>
friend class ::grpc::ClientWriter;
template <class W, class R>
friend class ::grpc::ClientReaderWriter;
template <class R>
friend class ::grpc::ServerReader;
template <class W>
friend class ::grpc::ServerWriter;
template <class W, class R>
friend class ::grpc::internal::ServerReaderWriterBody;
template <class ServiceType, class RequestType, class ResponseType>
friend class ::grpc::internal::RpcMethodHandler;
template <class ServiceType, class RequestType, class ResponseType>
friend class ::grpc::internal::ClientStreamingHandler;
template <class ServiceType, class RequestType, class ResponseType>
friend class ::grpc::internal::ServerStreamingHandler;
template <class Streamer, bool WriteNeeded>
friend class ::grpc::internal::TemplatedBidiStreamingHandler;
friend class ::grpc::internal::UnknownMethodHandler;
friend class ::grpc::Server;
friend class ::grpc::ServerContext;
friend class ::grpc::ServerInterface;
template <class InputMessage, class OutputMessage>
friend class ::grpc::internal::BlockingUnaryCallImpl;
/// EXPERIMENTAL
/// Creates a Thread Local cache to store the first event
/// On this completion queue queued from this thread. Once
/// initialized, it must be flushed on the same thread.
class CompletionQueueTLSCache {
public:
CompletionQueueTLSCache(CompletionQueue* cq);
~CompletionQueueTLSCache();
bool Flush(void** tag, bool* ok);
private:
CompletionQueue* cq_;
bool flushed_;
};
NextStatus AsyncNextInternal(void** tag, bool* ok, gpr_timespec deadline);
/// Wraps \a grpc_completion_queue_pluck.
/// \warning Must not be mixed with calls to \a Next.
bool Pluck(internal::CompletionQueueTag* tag) {
auto deadline =
g_core_codegen_interface->gpr_inf_future(GPR_CLOCK_REALTIME);
auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
cq_, tag, deadline, nullptr);
bool ok = ev.success != 0;
void* ignored = tag;
GPR_CODEGEN_ASSERT(tag->FinalizeResult(&ignored, &ok));
GPR_CODEGEN_ASSERT(ignored == tag);
// Ignore mutations by FinalizeResult: Pluck returns the C API status
return ev.success != 0;
}
/// Performs a single polling pluck on \a tag.
/// \warning Must not be mixed with calls to \a Next.
///
/// TODO: sreek - This calls tag->FinalizeResult() even if the cq_ is already
/// shutdown. This is most likely a bug and if it is a bug, then change this
/// implementation to simple call the other TryPluck function with a zero
/// timeout. i.e:
/// TryPluck(tag, gpr_time_0(GPR_CLOCK_REALTIME))
void TryPluck(internal::CompletionQueueTag* tag) {
auto deadline = g_core_codegen_interface->gpr_time_0(GPR_CLOCK_REALTIME);
auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
cq_, tag, deadline, nullptr);
if (ev.type == GRPC_QUEUE_TIMEOUT) return;
bool ok = ev.success != 0;
void* ignored = tag;
// the tag must be swallowed if using TryPluck
GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
}
/// Performs a single polling pluck on \a tag. Calls tag->FinalizeResult if
/// the pluck() was successful and returned the tag.
///
/// This exects tag->FinalizeResult (if called) to return 'false' i.e expects
/// that the tag is internal not something that is returned to the user.
void TryPluck(internal::CompletionQueueTag* tag, gpr_timespec deadline) {
auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
cq_, tag, deadline, nullptr);
if (ev.type == GRPC_QUEUE_TIMEOUT || ev.type == GRPC_QUEUE_SHUTDOWN) {
return;
}
bool ok = ev.success != 0;
void* ignored = tag;
GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
}
/// Manage state of avalanching operations : completion queue tags that
/// trigger other completion queue operations. The underlying core completion
/// queue should not really shutdown until all avalanching operations have
/// been finalized. Note that we maintain the requirement that an avalanche
/// registration must take place before CQ shutdown (which must be maintained
/// elsehwere)
void InitialAvalanching() {
gpr_atm_rel_store(&avalanches_in_flight_, static_cast<gpr_atm>(1));
}
void RegisterAvalanching() {
gpr_atm_no_barrier_fetch_add(&avalanches_in_flight_,
static_cast<gpr_atm>(1));
}
void CompleteAvalanching();
grpc_completion_queue* cq_; // owned
gpr_atm avalanches_in_flight_;
};
/// A specific type of completion queue used by the processing of notifications
/// by servers. Instantiated by \a ServerBuilder.
class ServerCompletionQueue : public CompletionQueue {
public:
bool IsFrequentlyPolled() { return polling_type_ != GRPC_CQ_NON_LISTENING; }
private:
grpc_cq_polling_type polling_type_;
friend class ServerBuilder;
/// \param is_frequently_polled Informs the GRPC library about whether the
/// server completion queue would be actively polled (by calling Next() or
/// AsyncNext()). By default all server completion queues are assumed to be
/// frequently polled.
ServerCompletionQueue(grpc_cq_polling_type polling_type)
: CompletionQueue(grpc_completion_queue_attributes{
GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, polling_type}),
polling_type_(polling_type) {}
};
} // namespace grpc
#endif // GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
diff --git a/include/grpcpp/impl/codegen/completion_queue.h b/include/grpcpp/impl/codegen/completion_queue.h
index 80c7c41982..3f7d8a7714 100644
--- a/include/grpcpp/impl/codegen/completion_queue.h
+++ b/include/grpcpp/impl/codegen/completion_queue.h
@@ -32,6 +32,8 @@
#ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
#define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
+#include <typeinfo>
+
#include <grpc/impl/codegen/atm.h>
#include <grpcpp/impl/codegen/completion_queue_tag.h>
#include <grpcpp/impl/codegen/core_codegen_interface.h>
@@ -106,7 +108,9 @@ class CompletionQueue : private GrpcLibraryCodegen {
/// Destructor. Destroys the owned wrapped completion queue / instance.
~CompletionQueue() {
- g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
+ if (typeid(*g_core_codegen_interface).hash_code() != typeid(CoreCodegenInterface).hash_code()) {
+ g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
+ }
}
/// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT.
diff --git a/include/grpcpp/impl/codegen/grpc_library.h b/include/grpcpp/impl/codegen/grpc_library.h
index 17c904d71a..a092b2204d 100644
--- a/include/grpcpp/impl/codegen/grpc_library.h
+++ b/include/grpcpp/impl/codegen/grpc_library.h
@@ -19,6 +19,8 @@
#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
+#include <typeinfo>
+
#include <grpcpp/impl/codegen/core_codegen_interface.h>
namespace grpc {
@@ -47,7 +49,8 @@ class GrpcLibraryCodegen {
}
}
virtual ~GrpcLibraryCodegen() {
- if (grpc_init_called_) {
+ if (grpc_init_called_ &&
+ typeid(*g_glip).hash_code() != typeid(GrpcLibraryInterface).hash_code()) {
GPR_CODEGEN_ASSERT(g_glip &&
"gRPC library not initialized. See "
"grpc::internal::GrpcLibraryInitializer.");
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
#include <typeinfo>
#include <grpcpp/impl/codegen/core_codegen_interface.h>
namespace grpc {
class GrpcLibraryInterface {
public:
virtual ~GrpcLibraryInterface() = default;
virtual void init() = 0;
virtual void shutdown() = 0;
};
/// Initialized by \a grpc::GrpcLibraryInitializer from
/// <grpcpp/impl/grpc_library.h>
extern GrpcLibraryInterface* g_glip;
/// Classes that require gRPC to be initialized should inherit from this class.
class GrpcLibraryCodegen {
public:
GrpcLibraryCodegen(bool call_grpc_init = true) : grpc_init_called_(false) {
if (call_grpc_init) {
GPR_CODEGEN_ASSERT(g_glip &&
"gRPC library not initialized. See "
"grpc::internal::GrpcLibraryInitializer.");
g_glip->init();
grpc_init_called_ = true;
}
}
virtual ~GrpcLibraryCodegen() {
if (grpc_init_called_ &&
typeid(*g_glip).hash_code() !=
typeid(GrpcLibraryInterface).hash_code()) {
GPR_CODEGEN_ASSERT(g_glip &&
"gRPC library not initialized. See "
"grpc::internal::GrpcLibraryInitializer.");
g_glip->shutdown();
}
}
private:
bool grpc_init_called_;
};
} // namespace grpc
#endif // GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
......@@ -123,7 +123,7 @@ def __bootstrap__():
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory'
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads'
]
if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline')
......
......@@ -142,14 +142,20 @@ class L2DecayRegularizer(WeightDecayRegularizer):
dtype="float32", shape=param.shape, lod_level=param.lod_level)
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
idx = block.create_var(
dtype="int64",
shape=param.shape,
type=core.VarDesc.VarType.LOD_TENSOR)
decay = block.create_var(
dtype="float32",
shape=param.shape,
type=core.VarDesc.VarType.SELECTED_ROWS)
block.append_op(
type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
block.append_op(
type='lookup_table',
inputs={'W': param,
'Ids': grad},
'Ids': idx},
outputs={'Out': decay},
attrs={'is_sparse': True})
param = decay
......@@ -216,14 +222,20 @@ class L1DecayRegularizer(WeightDecayRegularizer):
dtype="float32", shape=param.shape, lod_level=param.lod_level)
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
idx = block.create_var(
dtype="int64",
shape=param.shape,
type=core.VarDesc.VarType.LOD_TENSOR)
decay = block.create_var(
dtype="float32",
shape=param.shape,
type=core.VarDesc.VarType.SELECTED_ROWS)
block.append_op(
type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
block.append_op(
type='lookup_table',
inputs={'W': param,
'Ids': grad},
'Ids': idx},
outputs={'Out': decay},
attrs={'is_sparse': True})
......
......@@ -278,7 +278,7 @@ class DistSeResneXt2x2:
def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
batch_size=20)
batch_size=2)
if is_dist:
t = get_transpiler(trainer_id,
fluid.default_main_program(), endpoints,
......@@ -294,11 +294,7 @@ class DistSeResneXt2x2:
strategy.num_threads = 1
strategy.allow_op_delay = False
exe = fluid.ParallelExecutor(
True,
loss_name=avg_cost.name,
exec_strategy=strategy,
num_trainers=trainers,
trainer_id=trainer_id)
True, loss_name=avg_cost.name, exec_strategy=strategy)
feed_var_list = [
var for var in trainer_prog.global_block().vars.itervalues()
......
......@@ -56,7 +56,7 @@ class TestDistSeResneXt2x2(unittest.TestCase):
except os.error:
retry_times -= 1
def non_test_with_place(self):
def no_test_with_place(self):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs = {
"PATH": os.getenv("PATH"),
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest
class TestExtractRows(OpTest):
def check_with_place(self, place):
scope = core.Scope()
# create and initialize Variable
feature_len = 12
rows = [0, 4, 4, 7]
np_array = np.ones((len(rows), feature_len)).astype("float32")
in_x = scope.var('X').get_selected_rows()
in_x.set_height(len(rows))
in_x.set_rows(rows)
in_x_tensor = in_x.get_tensor()
in_x_tensor.set(np_array, place)
# create Out Variable
out_tensor = scope.var('Out').get_tensor()
# create and run lookup_table operator
extract_rows_op = Operator("extract_rows", X='X', Out='Out')
extract_rows_op.run(scope, place)
# get result from Out
result_array = np.array(out_tensor)
result_array = [ele[0] for ele in result_array]
assert result_array == rows
def test_concat_rows(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestFlattenOp(OpTest):
def setUp(self):
self.op_type = "flatten"
self.init_test_case()
self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
def init_test_case(self):
self.in_shape = (3, 2, 2, 5)
self.axis = 1
self.new_shape = (3, 20)
def init_attrs(self):
self.attrs = {"axis": self.axis}
class TestFlattenOp(TestFlattenOp):
def init_test_case(self):
self.in_shape = (3, 2, 2, 3)
self.axis = 0
self.new_shape = (1, 36)
class TestFlattenOpWithDefaultAxis(TestFlattenOp):
def init_test_case(self):
self.in_shape = (3, 2, 2, 3)
self.new_shape = (3, 12)
def init_attrs(self):
self.attrs = {}
class TestFlattenOpSixDims(TestFlattenOp):
def init_test_case(self):
self.in_shape = (3, 2, 3, 2, 4, 4)
self.axis = 4
self.new_shape = (36, 16)
if __name__ == "__main__":
unittest.main()
......@@ -49,53 +49,6 @@ class TestLookupTableOpWithPadding(TestLookupTableOp):
pass
class TestLookupTableIdsIsSelectedRows(OpTest):
def check_with_place(self, place):
scope = core.Scope()
# create and initialize Variable
height = 10
rows = [0, 4, 4, 7]
row_numel = 12
# create and initialize W Variable
W = scope.var('W').get_tensor()
W_array = np.full((height, row_numel), 1.0).astype("float32")
for i in range(height):
W_array[i] *= i
W.set(W_array, place)
# create and initialize Ids Variable
ids_selected_rows = scope.var('Ids').get_selected_rows()
ids_selected_rows.set_height(len(rows))
ids_selected_rows.set_rows(rows)
np_array = np.ones((len(rows), row_numel)).astype("float32")
ids_tensor = ids_selected_rows.get_tensor()
ids_tensor.set(np_array, place)
# create Out Variable
Out = scope.var('Out').get_selected_rows()
# create and run lookup_table operator
concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
concat_rows_op.run(scope, place)
# get result from Out
Out_tensor = Out.get_tensor()
result_array = np.array(Out_tensor)
# all(): return True if all elements of the iterable are true (or if the iterable is empty)
for idx, row in enumerate(rows):
assert (row == result_array[idx]).all()
def test_concat_rows(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
class TestLookupTableWIsSelectedRows(OpTest):
def check_with_place(self, place):
scope = core.Scope()
......
......@@ -4,7 +4,7 @@ TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then
if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*|patches/grpc/.*) ]]; then
continue;
else
cpplint --filter=-readability/fn_size $file;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册