提交 145c5357 编写于 作者: Q Qiao Longfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into refactor-prefetch

test=develop
...@@ -214,6 +214,7 @@ if (NOT WIN32) ...@@ -214,6 +214,7 @@ if (NOT WIN32)
# there is no official support of warpctc, nccl, cupti in windows # there is no official support of warpctc, nccl, cupti in windows
include(external/warpctc) # download, build, install warpctc include(external/warpctc) # download, build, install warpctc
include(cupti) include(cupti)
include(external/gzstream)
endif (NOT WIN32) endif (NOT WIN32)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: gzstream is needed when linking with ctr reader.
SET(GZSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/gzstream)
SET(GZSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gzstream)
SET(GZSTREAM_INCLUDE_DIR "${GZSTREAM_INSTALL_DIR}/include/" CACHE PATH "gzstream include directory." FORCE)
ExternalProject_Add(
extern_gzstream
GIT_REPOSITORY "https://github.com/jacquesqiao/gzstream.git"
GIT_TAG ""
PREFIX ${GZSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND make -j8
INSTALL_COMMAND mkdir -p ${GZSTREAM_INSTALL_DIR}/lib/ && mkdir -p ${GZSTREAM_INSTALL_DIR}/include/
&& cp ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/libgzstream.a ${GZSTREAM_INSTALL_DIR}/lib
&& cp -r ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/gzstream.h ${GZSTREAM_INSTALL_DIR}/include
)
ADD_LIBRARY(gzstream STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gzstream PROPERTY IMPORTED_LOCATION
"${GZSTREAM_INSTALL_DIR}/lib/libgzstream.a")
include_directories(${GZSTREAM_INCLUDE_DIR})
ADD_DEPENDENCIES(gzstream extern_gzstream zlib)
...@@ -97,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti ...@@ -97,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
......
...@@ -39,11 +39,12 @@ if (WITH_GPU) ...@@ -39,11 +39,12 @@ if (WITH_GPU)
endif() endif()
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass) set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass)
if (WITH_GPU) if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif() endif()
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_graph_view.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace details {
static constexpr char kAllOpDescs[] = "all_op_descs";
VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast<VarHandle*>(p);
if (b) {
return b;
}
}
return nullptr;
}
std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// get vars order
int order = 0;
std::unordered_map<std::string, int> vars;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable
auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
for (auto* op_desc : ops) {
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
for (auto& v : o_it.second) { // values
vars[v] = order;
}
}
order++;
}
std::vector<OpHandleBase*> dist_ops;
// get allreduce ops.
for (auto& op : graph_ops) {
// FIXME(gongwb):add broad cast.
if (op->Name() == "all_reduce" || op->Name() == "reduce") {
dist_ops.push_back(op);
}
}
VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl;
std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
OpHandleBase* op2) {
VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
op1->DebugString(), op2->DebugString());
auto l_it = vars.find(i0->name_);
auto r_it = vars.find(i1->name_);
if (l_it->second < r_it->second) return true;
if (l_it->second == r_it->second) {
return i0->name_ < i1->name_;
}
return false;
});
// add dependency.
auto& sorted_ops = dist_ops;
for (size_t i = 1; i < sorted_ops.size(); ++i) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
auto* pre_op = sorted_ops[i - 1];
auto* op = sorted_ops[i];
pre_op->AddOutput(dep_var);
op->AddInput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
<< " and " << op;
VLOG(10) << "pre_op:" << pre_op->DebugString()
<< ", op:" << op->DebugString();
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(all_reduce_deps_pass,
paddle::framework::details::AllReduceDepsPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
// TODO(gongwb): overlap allreduce with backward computation.
class AllReduceDepsPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
...@@ -24,6 +25,10 @@ namespace paddle { ...@@ -24,6 +25,10 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1);
}
class ParallelExecutorPassBuilder : public ir::PassBuilder { class ParallelExecutorPassBuilder : public ir::PassBuilder {
public: public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy) explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
...@@ -70,6 +75,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -70,6 +75,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Verify that the graph is correct for multi-device executor. // Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass"); AppendPass("multi_devices_check_pass");
if (SeqOnlyAllReduceOps(strategy)) {
AppendPass("all_reduce_deps_pass");
}
if (strategy_.remove_unnecessary_lock_) { if (strategy_.remove_unnecessary_lock_) {
AppendPass("modify_op_lock_and_record_event_pass"); AppendPass("modify_op_lock_and_record_event_pass");
} }
...@@ -124,6 +133,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -124,6 +133,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx); pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif #endif
} else if (pass->Type() == "sequential_execution_pass") { } else if (pass->Type() == "sequential_execution_pass") {
VLOG(1) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "all_reduce_deps_pass") {
VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
pass->Erase(kAllOpDescs); pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>( pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs, kAllOpDescs,
...@@ -144,4 +164,5 @@ USE_PASS(multi_devices_pass); ...@@ -144,4 +164,5 @@ USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass); USE_PASS(multi_devices_print_pass);
USE_PASS(sequential_execution_pass); USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(modify_op_lock_and_record_event_pass);
...@@ -73,6 +73,7 @@ struct BuildStrategy { ...@@ -73,6 +73,7 @@ struct BuildStrategy {
bool fuse_broadcast_op_{false}; bool fuse_broadcast_op_{false};
int num_trainers_{1};
bool remove_unnecessary_lock_{false}; bool remove_unnecessary_lock_{false};
// NOTE: // NOTE:
......
...@@ -71,7 +71,7 @@ class OperatorBase; ...@@ -71,7 +71,7 @@ class OperatorBase;
class ExecutionContext; class ExecutionContext;
/** /**
* OperatorBase has the basic element that Net will call to do computation. * OperatorBase has the basic elements that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User * Only CreateOperator from OpRegistry will new Operator directly. User
* should always construct a proto message OpDesc and call * should always construct a proto message OpDesc and call
* OpRegistry::CreateOp(op_desc) to get an Operator instance. * OpRegistry::CreateOp(op_desc) to get an Operator instance.
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
...@@ -54,7 +54,7 @@ class ParallelExecutorPrivate { ...@@ -54,7 +54,7 @@ class ParallelExecutorPrivate {
Scope *global_scope_; // not owned Scope *global_scope_; // not owned
std::unique_ptr<details::SSAGraphExecutor> executor_; std::unique_ptr<details::SSAGraphExecutor> executor_;
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_; std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif #endif
bool own_local_scope_; bool own_local_scope_;
...@@ -104,7 +104,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -104,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
if (member_->use_cuda_) { if (member_->use_cuda_) {
// Bcast Parameters to all GPUs // Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr; ncclUniqueId *nccl_id = nullptr;
if (nccl_id_var != nullptr) { if (nccl_id_var != nullptr) {
...@@ -124,7 +124,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -124,7 +124,7 @@ ParallelExecutor::ParallelExecutor(
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<ir::Graph> graph = build_strategy.Apply( std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params, main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
...@@ -213,7 +213,7 @@ void ParallelExecutor::BCastParamsToDevices( ...@@ -213,7 +213,7 @@ void ParallelExecutor::BCastParamsToDevices(
} }
auto &dims = main_tensor.dims(); auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) { if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::vector<void *> buffers; std::vector<void *> buffers;
size_t numel = main_tensor.numel(); size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
......
...@@ -120,8 +120,22 @@ class SelectedRows { ...@@ -120,8 +120,22 @@ class SelectedRows {
*/ */
int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);
void SyncIndex(); /*
* @brief Get the index of the key from id_to_index_ map.
*/
inline int64_t GetIndexFromId(int64_t key) {
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
return -1;
} else {
return iter->second;
}
}
void SyncIndex();
/*
* @brief Get complete Dims before
*/
DDim GetCompleteDims() const { DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims()); std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_; dims[0] = height_;
...@@ -133,9 +147,10 @@ class SelectedRows { ...@@ -133,9 +147,10 @@ class SelectedRows {
// SelectedRows are simply concated when adding together. Until a // SelectedRows are simply concated when adding together. Until a
// SelectedRows add a Tensor, will the duplicate rows be handled. // SelectedRows add a Tensor, will the duplicate rows be handled.
Vector<int64_t> rows_; Vector<int64_t> rows_;
std::unordered_map<int64_t, int64_t> id_to_index_; std::unordered_map<int64_t, int64_t>
id_to_index_; // should not be used when rows_ has duplicate member
std::unique_ptr<Tensor> value_{nullptr}; std::unique_ptr<Tensor> value_{nullptr};
int64_t height_; int64_t height_; // height indicates the underline tensor's height
std::unique_ptr<RWLock> rwlock_{nullptr}; std::unique_ptr<RWLock> rwlock_{nullptr};
}; };
......
...@@ -17,28 +17,16 @@ ...@@ -17,28 +17,16 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Holds all the transfer scope across the process.
std::unordered_map<size_t, Scope*>& global_transfer_data_cache() { std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
typedef std::unordered_map<size_t, Scope*> map_t; thread_local auto* x = new std::unordered_map<size_t, Scope*>;
thread_local std::unique_ptr<map_t> x(new map_t);
return *x; return *x;
} }
// Holds all the transfer scope for this thread.
std::unordered_set<Scope*>& global_transfer_scope_cache() { std::unordered_set<Scope*>& global_transfer_scope_cache() {
typedef std::unordered_set<Scope*> set_t; thread_local auto* x = new std::unordered_set<Scope*>;
thread_local std::unique_ptr<set_t> x(new set_t);
return *x; return *x;
} }
// Try to create a transfer scope. If one cached scope has match the
// requirement, just return that one.
// Inputs:
// @type0: the source kernel type.
// @type1: the target kernel type.
// @scope: the execution scope of this op.
// Returns: A scope used to hold the transfer data across the different kernel
// type.
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
const Scope* scope) { const Scope* scope) {
Scope* new_scope{nullptr}; Scope* new_scope{nullptr};
...@@ -58,5 +46,27 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, ...@@ -58,5 +46,27 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
return new_scope; return new_scope;
} }
void RemoveKidsFromTransferScopeCache(Scope* scope) {
auto it = global_transfer_scope_cache().find(scope);
if (it != global_transfer_scope_cache().end()) {
global_transfer_scope_cache().erase(it);
}
for (auto* s : scope->kids()) {
auto it = global_transfer_scope_cache().find(s);
if (it != global_transfer_scope_cache().end()) {
global_transfer_scope_cache().erase(it);
}
}
// remove global transfer data cache
auto& cache = global_transfer_data_cache();
for (auto it = cache.begin(); it != cache.end();) {
if (it->second == scope)
it = cache.erase(it);
else
it++;
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -35,4 +35,5 @@ function(inference_analysis_test TARGET) ...@@ -35,4 +35,5 @@ function(inference_analysis_test TARGET)
endif() endif()
endfunction(inference_analysis_test) endfunction(inference_analysis_test)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_inference_api) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS reset_tensor_array paddle_inference_api)
...@@ -76,7 +76,8 @@ void TestWord2vecPrediction(const std::string& model_path) { ...@@ -76,7 +76,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
0.000932706}; 0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float); const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min((size_t)5UL, num_elements); i++) { for (size_t i = 0; i < std::min(static_cast<size_t>(5UL), num_elements);
i++) {
LOG(INFO) << "data: " LOG(INFO) << "data: "
<< static_cast<float*>(outputs.front().data.data())[i]; << static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
......
...@@ -284,6 +284,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs, ...@@ -284,6 +284,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::GetFetchVariable(*scope, "fetch", idx); framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type(); auto type = fetch.type();
auto output = &(outputs->at(i)); auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0];
if (type == typeid(float)) { if (type == typeid(float)) {
GetFetchOne<float>(fetch, output); GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32; output->dtype = PaddleDType::FLOAT32;
......
...@@ -109,7 +109,7 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -109,7 +109,7 @@ class AnalysisPredictor : public PaddlePredictor {
std::map<std::string, size_t> feed_names_; std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_; std::vector<framework::OpDesc *> fetchs_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, so cache them. // concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_; std::vector<framework::LoDTensor> feed_tensors_;
details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
......
...@@ -185,8 +185,12 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -185,8 +185,12 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
<< inputs.size(); << inputs.size();
return false; return false;
} }
// Cache the inputs memory for better concurrency performance.
feed_tensors_.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) { for (size_t i = 0; i < inputs.size(); ++i) {
framework::LoDTensor input; auto &input = feed_tensors_[i];
framework::DDim ddim = framework::make_ddim(inputs[i].shape); framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr; void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) { if (inputs[i].dtype == PaddleDType::INT64) {
...@@ -261,6 +265,7 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs, ...@@ -261,6 +265,7 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::GetFetchVariable(*scope, "fetch", idx); framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type(); auto type = fetch.type();
auto output = &(outputs->at(i)); auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0];
if (type == typeid(float)) { if (type == typeid(float)) {
GetFetchOne<float>(fetch, output); GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32; output->dtype = PaddleDType::FLOAT32;
......
...@@ -69,6 +69,9 @@ class NativePaddlePredictor : public PaddlePredictor { ...@@ -69,6 +69,9 @@ class NativePaddlePredictor : public PaddlePredictor {
std::vector<framework::OpDesc *> feeds_; std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_; std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_; std::vector<framework::OpDesc *> fetchs_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
// Do not use unique_ptr, use parent scope to delete // Do not use unique_ptr, use parent scope to delete
framework::Scope *sub_scope_{nullptr}; framework::Scope *sub_scope_{nullptr};
details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
......
...@@ -86,7 +86,11 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { ...@@ -86,7 +86,11 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
munlock(p, size); munlock(p, size);
#endif #endif
} }
#ifdef _WIN32
_aligned_free(p);
#else
free(p); free(p);
#endif
} }
bool CPUAllocator::UseGpu() const { return false; } bool CPUAllocator::UseGpu() const { return false; }
......
...@@ -149,6 +149,13 @@ $out = \max(x, 0)$ ...@@ -149,6 +149,13 @@ $out = \max(x, 0)$
)DOC"; )DOC";
UNUSED constexpr char GeluDoc[] = R"DOC(
Gelu Activation Operator.
$out = \\frac{1 + erf(\\frac{x}{\\sqrt{2}})}{2} x$
)DOC";
UNUSED constexpr char TanhDoc[] = R"DOC( UNUSED constexpr char TanhDoc[] = R"DOC(
Tanh Activation Operator. Tanh Activation Operator.
...@@ -472,6 +479,7 @@ REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); ...@@ -472,6 +479,7 @@ REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc);
REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc);
REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc);
REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc);
REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc);
REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc);
REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc);
REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc);
...@@ -489,6 +497,7 @@ REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); ...@@ -489,6 +497,7 @@ REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid); REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu); REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Gelu, gelu);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp); REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh); REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil); REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil);
...@@ -525,6 +534,7 @@ namespace ops = paddle::operators; ...@@ -525,6 +534,7 @@ namespace ops = paddle::operators;
__macro(Round, round); \ __macro(Round, round); \
__macro(Log, log); \ __macro(Log, log); \
__macro(Square, square); \ __macro(Square, square); \
__macro(Gelu, gelu); \
__macro(BRelu, brelu); \ __macro(BRelu, brelu); \
__macro(Pow, pow); \ __macro(Pow, pow); \
__macro(STanh, stanh); \ __macro(STanh, stanh); \
......
...@@ -16,6 +16,11 @@ limitations under the License. */ ...@@ -16,6 +16,11 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <cmath>
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
...@@ -212,6 +217,31 @@ struct ReluGradFunctor : public BaseActivationFunctor<T> { ...@@ -212,6 +217,31 @@ struct ReluGradFunctor : public BaseActivationFunctor<T> {
} }
}; };
// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
template <typename T>
struct GeluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp =
((x * static_cast<T>(M_SQRT1_2)).erf()).template cast<T>().eval();
out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
}
};
template <typename T>
struct GeluGradFunctor : BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("gelu"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp = (static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
((-static_cast<T>(0.5) * x.square()).exp()))
.template cast<T>()
.eval();
dx.device(d) = dout * (out / x + temp);
}
};
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T> template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> { struct TanhFunctor : public BaseActivationFunctor<T> {
...@@ -877,6 +907,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -877,6 +907,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(exp, ExpFunctor, ExpGradFunctor); \ __macro(exp, ExpFunctor, ExpGradFunctor); \
__macro(relu, ReluFunctor, ReluGradFunctor); \ __macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(gelu, GeluFunctor, GeluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
......
...@@ -70,7 +70,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> { ...@@ -70,7 +70,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
if (bias) { if (bias) {
auto bias_vec = EigenMatrix<T>::From(*bias); auto bias_vec = EigenMatrix<T>::From(*bias);
Eigen::DSizes<int, 2> bcast(batch_size, 1); Eigen::DSizes<int, 2> bcast(batch_size, 1);
output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; output_mat.device(place) = bias_vec.broadcast(bcast).eval() + output_mat;
} }
} }
}; };
...@@ -99,13 +99,13 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> { ...@@ -99,13 +99,13 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
auto d_out_mat = EigenMatrix<T>::From(*d_out); auto d_out_mat = EigenMatrix<T>::From(*d_out);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
// Create the intermediate variable to caculate the Output(Y@Grad). // Create the intermediate variable to calculate the Output(Y@Grad).
Tensor x_scale; Tensor x_scale;
x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}), x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
ctx.GetPlace()); ctx.GetPlace());
auto x_scale_mat = EigenMatrix<T>::From(x_scale); auto x_scale_mat = EigenMatrix<T>::From(x_scale);
// Create the intermediate variable to caculate the Output(X@Grad). // Create the intermediate variable to calculate the Output(X@Grad).
Tensor y_scale; Tensor y_scale;
y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}), y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
ctx.GetPlace()); ctx.GetPlace());
...@@ -113,65 +113,64 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> { ...@@ -113,65 +113,64 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
math::SetConstant<DeviceContext, T> set_zero; math::SetConstant<DeviceContext, T> set_zero;
// Set Output(X@Grad) be zero.
if (d_x) { if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace()); d_x->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_x, static_cast<T>(0)); set_zero(dev_ctx, d_x, static_cast<T>(0));
} }
// Set Output(Y@Grad) be zero.
if (d_y) { if (d_y) {
d_y->mutable_data<T>(ctx.GetPlace()); d_y->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_y, static_cast<T>(0)); set_zero(dev_ctx, d_y, static_cast<T>(0));
} }
if (d_weight) {
d_weight->mutable_data<T>(ctx.GetPlace());
}
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
// Caculate the Output(X@Grad) and Output(Y@Grad). // Caculate the Output(X@Grad) and Output(Y@Grad).
if (d_x || d_y) { if (d_x || d_y || d_weight) {
Eigen::DSizes<int, 2> bcast_for_x(1, y_dim); Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
Eigen::DSizes<int, 2> bcast_for_y(1, x_dim); Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
for (int i = 0; i < out_dim; ++i) { for (int i = 0; i < out_dim; ++i) {
Tensor weight_i = weight->Slice(i, i + 1).Resize( Tensor weight_i = weight->Slice(i, i + 1).Resize(
framework::make_ddim({x_dim, y_dim})); framework::make_ddim({x_dim, y_dim}));
auto output_vec = d_out_mat.chip(i, 1); auto output_vec = d_out_mat.chip(i, 1);
if (d_x) { if (d_x) {
y_scale_mat.device(place) = y_scale_mat.device(place) =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1)) output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_x) * .broadcast(bcast_for_x)
.eval() *
y_mat; y_mat;
blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>()); y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
} }
if (d_y) {
x_scale_mat.device(place) = if (d_y || d_weight) {
auto output_vec_y =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1)) output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_y) * .broadcast(bcast_for_y)
x_mat; .eval();
x_scale_mat.device(place) = output_vec_y * x_mat;
if (d_y) {
blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>()); x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
} }
}
}
// Caculate the gradient of Input(Weight).
if (d_weight) { if (d_weight) {
d_weight->mutable_data<T>(ctx.GetPlace());
Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
for (int i = 0; i < out_dim; ++i) {
Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
framework::make_ddim({x_dim, y_dim})); framework::make_ddim({x_dim, y_dim}));
auto output_vec = d_out_mat.chip(i, 1);
x_scale_mat.device(place) =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_weight) *
x_mat;
blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1, blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1,
x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>()); x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>());
} }
} }
}
}
// Caculate the gradient of Input(Bias). // calculate the gradient of Input(Bias).
if (d_bias) { if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace()); d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias); auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias);
......
...@@ -120,6 +120,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel { ...@@ -120,6 +120,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
"Dimensions of Input(X) and Mask must be the same."); "Dimensions of Input(X) and Mask must be the same.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
} }
}; };
......
...@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef _WIN32
#include <unistd.h> #include <unistd.h>
#endif
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
......
...@@ -19,36 +19,21 @@ limitations under the License. */ ...@@ -19,36 +19,21 @@ limitations under the License. */
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/jit_kernel.h"
#include "xbyak.h" #include "xbyak/xbyak.h"
#include "xbyak_util.h" #include "xbyak/xbyak_util.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::DataLayout; using framework::DataLayout;
using mkldnn::memory; using mkldnn::memory;
using platform::StringToMKLDNNFormat;
static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) {
std::transform(format.begin(), format.end(), format.begin(), ::tolower);
if (!format.compare("nchw")) {
return memory::format::nchw;
} else if (!format.compare("nchw16c")) {
return memory::format::nChw16c;
} else if (!format.compare("nchw8c")) {
return memory::format::nChw8c;
} else if (!format.compare("nhwc")) {
return memory::format::nhwc;
} else {
return memory::format::any;
}
}
static void UpdateDataFormat(const framework::ExecutionContext& ctx, static void UpdateDataFormat(const framework::ExecutionContext& ctx,
framework::Tensor* tensor, const char* attribute) { framework::Tensor* tensor, const char* attribute) {
if (ctx.op().HasAttr(attribute)) { if (ctx.op().HasAttr(attribute)) {
auto format_as_string = ctx.Attr<std::string>(attribute); auto format_as_string = ctx.Attr<std::string>(attribute);
auto format = StringToMKLDNNFormat(format_as_string); auto format = StringToMKLDNNFormat(&format_as_string);
if (format != memory::format::any) { if (format != memory::format::any) {
tensor->set_format(format); tensor->set_format(format);
} }
...@@ -93,8 +78,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> { ...@@ -93,8 +78,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
auto y_dims_untrimmed = y->dims(); auto y_dims_untrimmed = y->dims();
auto x_int_dims = paddle::framework::vectorize2int(x_dims); auto x_int_dims = paddle::framework::vectorize2int(x_dims);
UpdateDataFormat(ctx, (Tensor*)x, "x_data_format"); UpdateDataFormat(ctx, const_cast<Tensor*>(x), "x_data_format");
UpdateDataFormat(ctx, (Tensor*)y, "y_data_format"); UpdateDataFormat(ctx, const_cast<Tensor*>(y), "y_data_format");
Xbyak::util::Cpu cpu; Xbyak::util::Cpu cpu;
const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F); const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F);
...@@ -156,10 +141,10 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> { ...@@ -156,10 +141,10 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>(); auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine(); const auto& mkldnn_engine = dev_ctx.GetEngine();
if (!(is_x_nchw || is_x_nc)) if (!(is_x_nchw || is_x_nc))
ReorderInput<T>((Tensor*)x, ctx.GetPlace(), mkldnn_engine, ReorderInput<T>(const_cast<Tensor*>(x), ctx.GetPlace(), mkldnn_engine,
x->dims().size() == 4); x->dims().size() == 4);
if (!(is_y_nchw || is_y_nc)) if (!(is_y_nchw || is_y_nc))
ReorderInput<T>((Tensor*)y, ctx.GetPlace(), mkldnn_engine, ReorderInput<T>(const_cast<Tensor*>(y), ctx.GetPlace(), mkldnn_engine,
y->dims().size() == 4); y->dims().size() == 4);
} }
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/hierarchical_sigmoid_op.h" #include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
#include <string>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -70,13 +70,14 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { ...@@ -70,13 +70,14 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
const int64_t batch_size = ctx->GetInputDim("X")[0]; const int64_t batch_size = ctx->GetInputDim("X")[0];
std::vector<int64_t> output_shape({batch_size, 1}); std::vector<int64_t> output_shape({batch_size, 1});
ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
ctx->ShareLoD("X", /*->*/ "Out");
} }
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.GetPlace()); ctx.GetPlace());
} }
}; };
...@@ -86,27 +87,40 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -86,27 +87,40 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", AddInput("X",
"(Tensor, required) The input tensor with shape [N, D], " "(LoDTensor, required) The input tensor with shape [N, D], "
"where N is the size of mini-batch, and D is the feature size."); "where N is the size of mini-batch, and D is the feature size.");
AddInput("W", AddInput("W",
"(Tensor, required), The parameters of hierarchical " "(LoDTensor, required), The parameters of hierarchical "
"sigmoid operator, each of them is a 2-D tensor, the shape is" "sigmoid operator, each of them is a 2-D tensor, the shape is"
"[num_classes - 1, D]."); "[K, D]. Which K is the num of non-leaf node in Path Tree");
AddInput("Label", AddInput("Label",
"(Tensor, required), The labels of training data. It's a" "(LoDTensor, required), The labels of training data. It's a"
"tensor with shape [N, 1]."); "tensor with shape [N, 1].");
AddInput("PTable",
"(LoDTensor, optional), The Path Table from root to current word"
"it should have shape like [N, L], L is the length of the Path")
.AsDispensable();
AddInput(
"PathCode",
"(LoDTensor, optional), The Code on each Node of the Path from root "
"to current word"
"it should have shape like [N, L], L is the length of the Path")
.AsDispensable();
AddInput("Bias", AddInput("Bias",
"(Tensor, optional), The bias is a tensor with shape" "(LoDTensor, optional), The bias is a tensor with shape or "
"[1, num_classes - 1]."); "[num_classes, 1]"
AddOutput("Out", "[num_classes - 1, 1].")
"(Tensor, required) The output of hierarchical sigmoid operator." .AsDispensable();
AddOutput(
"Out",
"(LoDTensor, required) The output of hierarchical sigmoid operator."
"The shape is [N, 1]."); "The shape is [N, 1].");
AddOutput("PreOut", AddOutput("PreOut",
"(Tensor, required) A intermedia 2-D tensor with shape " "(LoDTensor, required) A intermedia 2-D tensor with shape "
"[batch_size, code_length], where code_length represents the " "[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes.") "maximum path length from root to leaf nodes.")
.AsIntermediate(); .AsIntermediate();
AddAttr<AttrType>("num_classes", "(int, required), The number of classes") AddAttr<AttrType>("num_classes", "(int, optional), The number of classes")
.SetDefault(2); .SetDefault(2);
AddComment(R"DOC( AddComment(R"DOC(
The hierarchical sigmoid operator organize the classes into a binary tree. The hierarchical sigmoid operator organize the classes into a binary tree.
...@@ -115,6 +129,10 @@ belonging to the right branch. This idea is from ...@@ -115,6 +129,10 @@ belonging to the right branch. This idea is from
"F. Morin, Y. Bengio (AISTATS 05): "F. Morin, Y. Bengio (AISTATS 05):
Hierarchical Probabilistic Neural Network Language Model." Hierarchical Probabilistic Neural Network Language Model."
)DOC"); )DOC");
AddAttr<bool>("is_sparse",
"(boolean, default false) "
"Sparse update.")
.SetDefault(false);
} }
}; };
...@@ -124,16 +142,21 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { ...@@ -124,16 +142,21 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@Grad) should not be null");
PADDLE_ENFORCE(ctx->HasInput("PreOut"), PADDLE_ENFORCE(ctx->HasInput("PreOut"),
"Input(Preout) should not be null."); "Input(Preout) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")), PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")),
"Output(W@Grad should not be null.)"); "Output(W@Grad should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X"))); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad should not be null.");
if (!ctx->Attrs().Get<bool>("is_sparse")) {
if (ctx->HasOutput(framework::GradVarName("Bias"))) { if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias")); ctx->GetInputDim("Bias"));
} }
ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
}
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
} }
...@@ -141,11 +164,55 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { ...@@ -141,11 +164,55 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.GetPlace()); ctx.GetPlace());
} }
}; };
class HierarchicalSigmoidGradOpGradVarTypeInference
: public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto w_grad_var_name = op_desc.Output(framework::GradVarName("W")).front();
auto bias_grad_var_name_vec =
op_desc.Output(framework::GradVarName("Bias"));
std::string bias_grad_var_name;
bool hasBias = false;
if (bias_grad_var_name_vec.size()) {
hasBias = true;
bias_grad_var_name =
op_desc.Output(framework::GradVarName("Bias")).front();
}
auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr);
if (is_sparse) {
VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
<< " is set to SelectedRows";
block->Var(w_grad_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
if (hasBias) {
VLOG(30) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to SelectedRows";
block->Var(bias_grad_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
}
} else {
VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor";
block->Var(w_grad_var_name)
->SetType(framework::proto::VarType::LOD_TENSOR);
if (hasBias) {
VLOG(30) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to LoDTensor";
block->Var(bias_grad_var_name)
->SetType(framework::proto::VarType::LOD_TENSOR);
}
}
block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType());
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -153,7 +220,8 @@ namespace ops = paddle::operators; ...@@ -153,7 +220,8 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
ops::HierarchicalSigmoidOpMaker<int>, ops::HierarchicalSigmoidOpMaker<int>,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp); REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp,
ops::HierarchicalSigmoidGradOpGradVarTypeInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
hierarchical_sigmoid, hierarchical_sigmoid,
ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>, ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -14,12 +14,16 @@ limitations under the License. */ ...@@ -14,12 +14,16 @@ limitations under the License. */
#pragma once #pragma once
#include <iostream> #include <iostream>
#include <set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h" #include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -28,20 +32,38 @@ template <typename T, int MajorType = Eigen::RowMajor, ...@@ -28,20 +32,38 @@ template <typename T, int MajorType = Eigen::RowMajor,
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
using platform::Transform; using platform::Transform;
static std::vector<int64_t> PathToRows(const framework::LoDTensor& path) {
std::set<int64_t> rows;
for (int64_t i = 0; i < path.numel(); ++i) {
int64_t row = path.data<int64_t>()[i];
if (row < 0) {
continue;
}
rows.emplace(row);
}
return std::vector<int64_t>(rows.begin(), rows.end());
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> { class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X"); auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
auto* w = ctx.Input<framework::Tensor>("W"); auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
auto* label = ctx.Input<framework::Tensor>("Label"); auto* path = ctx.Input<framework::LoDTensor>("PTable");
auto* bias = ctx.Input<framework::Tensor>("Bias"); auto* code = ctx.Input<framework::LoDTensor>("PathCode");
auto* out = ctx.Output<framework::Tensor>("Out"); auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
auto* pre_out = ctx.Output<framework::Tensor>("PreOut"); auto* bias = ctx.Input<framework::LoDTensor>("Bias");
auto* out = ctx.Output<framework::LoDTensor>("Out");
auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes")); size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
int64_t code_length = math::FindLastSet(num_classes - 1); bool is_custom = false;
int64_t batch_size = in->dims()[0]; if (path) {
framework::Tensor sum; is_custom = true;
}
int64_t code_length =
path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
int64_t batch_size = in.dims()[0];
framework::LoDTensor sum;
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto* pre_out_data = pre_out->mutable_data<T>( auto* pre_out_data = pre_out->mutable_data<T>(
framework::make_ddim({batch_size, code_length}), ctx.GetPlace()); framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
...@@ -52,7 +74,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> { ...@@ -52,7 +74,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
zero(dev_ctx, pre_out, static_cast<T>(0.0)); zero(dev_ctx, pre_out, static_cast<T>(0.0));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
math::RowwiseSum<DeviceContext, T> row_sum; math::RowwiseSum<DeviceContext, T> row_sum;
math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
if (!is_custom) {
bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
label.data<int64_t>()));
} else {
bit_code.reset(new math::MatrixBitCodeFunctor<T>(*path, *code,
label.data<int64_t>()));
}
std::vector<int64_t> sum_dims({batch_size, 1UL}); std::vector<int64_t> sum_dims({batch_size, 1UL});
sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace()); sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
...@@ -60,15 +90,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> { ...@@ -60,15 +90,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto out_mat = framework::EigenVector<T>::Flatten(*out); auto out_mat = framework::EigenVector<T>::Flatten(*out);
if (bias) { if (bias) {
bit_code.Add(pre_out, *bias); bit_code->Add(*bias, pre_out);
} }
bit_code.Mul(pre_out, *w, *in); bit_code->Mul(pre_out, w, in);
// clip to [-40, 40] // clip to [-40, 40]
Transform<DeviceContext> trans; Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), pre_out_data, trans(ctx.template device_context<DeviceContext>(), pre_out_data,
pre_out_data + pre_out->numel(), pre_out_data, pre_out_data + pre_out->numel(), pre_out_data,
ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0))); ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
bit_code.Sum(*pre_out, out, static_cast<T>(-1)); bit_code->Sum(*pre_out, out, static_cast<T>(-1));
// use softrelu to calculate cross entropy // use softrelu to calculate cross entropy
pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log(); pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
row_sum(dev_ctx, *pre_out, &sum); row_sum(dev_ctx, *pre_out, &sum);
...@@ -84,50 +114,103 @@ template <typename DeviceContext, typename T> ...@@ -84,50 +114,103 @@ template <typename DeviceContext, typename T>
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X"); auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
auto* w = ctx.Input<framework::Tensor>("W"); auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X")); auto* path = ctx.Input<framework::LoDTensor>("PTable");
auto* w_grad = ctx.Output<framework::Tensor>(framework::GradVarName("W")); auto* code = ctx.Input<framework::LoDTensor>("PathCode");
auto* bias_grad = auto* bias = ctx.Input<framework::LoDTensor>("Bias");
ctx.Output<framework::Tensor>(framework::GradVarName("Bias")); auto* in_grad =
auto* label = ctx.Input<framework::Tensor>("Label"); ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* pre_out = ctx.Input<framework::Tensor>("PreOut"); bool is_sparse = ctx.Attr<bool>("is_sparse");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
framework::Tensor pre_out_grad;
pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
in_grad->mutable_data<T>(ctx.GetPlace());
w_grad->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> zero; math::SetConstant<DeviceContext, T> zero;
auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
auto& pre_out = detail::Ref(ctx.Input<framework::LoDTensor>("PreOut"));
auto& out_grad = detail::Ref(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out")));
framework::LoDTensor pre_out_grad;
pre_out_grad.mutable_data<T>(pre_out.dims(), ctx.GetPlace());
in_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, in_grad, static_cast<T>(0.0)); zero(dev_ctx, in_grad, static_cast<T>(0.0));
zero(dev_ctx, w_grad, static_cast<T>(0.0));
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes")); size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
bool is_custom = false;
if (path) {
is_custom = true;
}
std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
if (!is_custom) {
bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
label.data<int64_t>()));
} else {
bit_code.reset(new math::MatrixBitCodeFunctor<T>(*path, *code,
label.data<int64_t>()));
}
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto pre_out_mat = EigenMatrix<T>::From(*pre_out); auto pre_out_mat = EigenMatrix<T>::From(pre_out);
auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad); auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
auto out_grad_mat = EigenMatrix<T>::From(*out_grad); auto out_grad_mat = EigenMatrix<T>::From(out_grad);
Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])}; Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};
// softrelu derivative // softrelu derivative
pre_out_grad_mat.device(place) = pre_out_grad_mat.device(place) =
static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp(); static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
bit_code.Sub(&pre_out_grad); // the gradient of clip(w * x + b) bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b)
pre_out_grad_mat.device(place) = pre_out_grad_mat.device(place) =
pre_out_grad_mat * out_grad_mat.broadcast(bcast); pre_out_grad_mat * out_grad_mat.broadcast(bcast);
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward. // be consistent with the clipping in forward.
if (!is_sparse) {
auto* bias_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
if (bias_grad) { if (bias_grad) {
bias_grad->mutable_data<T>(ctx.GetPlace()); bias_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, bias_grad, static_cast<T>(0.0)); zero(dev_ctx, bias_grad, static_cast<T>(0.0));
bit_code.AddGrad(pre_out_grad, bias_grad); bit_code->AddGrad(pre_out_grad, bias_grad);
}
auto* w_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
w_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, w_grad, static_cast<T>(0.0));
bit_code->MulGradWeight(pre_out_grad, w_grad, in);
} else {
framework::Vector<int64_t> real_rows = PathToRows(*path);
auto* w_grad =
ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
w_grad->set_rows(real_rows);
// Build a map of id -> row_index to speed up finding the index of one id
w_grad->SyncIndex();
w_grad->set_height(w.dims()[0]);
auto* w_grad_value = w_grad->mutable_value();
framework::DDim temp_dim(w.dims());
set(temp_dim, 0, real_rows.size());
w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
auto* bias_grad =
ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
if (bias_grad) {
bias_grad->set_rows(real_rows);
// build ids -> rows index map
bias_grad->SyncIndex();
bias_grad->set_height(bias->dims()[0]);
auto* bias_grad_value = bias_grad->mutable_value();
std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
bias->dims()[1]};
bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
ctx.GetPlace());
zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
bit_code->AddGrad(pre_out_grad, bias_grad);
}
bit_code->MulGradWeight(pre_out_grad, w_grad, in);
} }
bit_code.MulGradWeight(pre_out_grad, w_grad, *in); bit_code->MulGradError(pre_out_grad, w, in_grad);
bit_code.MulGradError(pre_out_grad, *w, in_grad);
} }
}; };
......
...@@ -19,16 +19,15 @@ namespace operators { ...@@ -19,16 +19,15 @@ namespace operators {
namespace math { namespace math {
template <typename T> template <typename T>
void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat, void MatrixBitCodeFunctor<T>::Add(const framework::Tensor& vec,
const framework::Tensor& vec) { framework::Tensor* tmat) {
SimpleCodeTable code_table(num_classes_);
size_t batch_size = tmat->dims()[0]; size_t batch_size = tmat->dims()[0];
size_t width = tmat->dims()[1]; size_t width = tmat->dims()[1];
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i])); auto code = code_table_->get_code(i);
int code_length = code.get_length(); int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j); size_t index = code->calc_index(j);
tmat->data<T>()[i * width + j] += vec.data<T>()[index]; tmat->data<T>()[i * width + j] += vec.data<T>()[index];
} }
} }
...@@ -37,31 +36,46 @@ void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat, ...@@ -37,31 +36,46 @@ void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat,
template <typename T> template <typename T>
void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat, void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
framework::Tensor* vec) { framework::Tensor* vec) {
SimpleCodeTable code_table(num_classes_);
size_t batch_size = tmat.dims()[0]; size_t batch_size = tmat.dims()[0];
size_t width = tmat.dims()[1]; size_t width = tmat.dims()[1];
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i])); auto code = code_table_->get_code(i);
int code_length = code.get_length(); int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j); size_t index = code->calc_index(j);
vec->data<T>()[index] += tmat.data<T>()[i * width + j]; vec->data<T>()[index] += tmat.data<T>()[i * width + j];
} }
} }
} }
template <typename T>
void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
framework::SelectedRows* vec) {
size_t batch_size = tmat.dims()[0];
size_t width = tmat.dims()[1];
for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table_->get_code(i);
int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code->calc_index(j);
int64_t row_index = vec->GetIndexFromId(static_cast<int64_t>(index));
vec->mutable_value()->data<T>()[row_index] +=
tmat.data<T>()[i * width + j];
}
}
}
template <typename T> template <typename T>
void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat, void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat,
framework::Tensor* sum, T scale_sum) { framework::Tensor* sum, T scale_sum) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat.dims()[0]; size_t num_samples = tmat.dims()[0];
size_t o_width = tmat.dims()[1]; size_t o_width = tmat.dims()[1];
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
T sm = static_cast<T>(0.0); T sm = static_cast<T>(0.0);
auto code = code_table(static_cast<size_t>(ids_[i])); auto code = code_table_->get_code(i);
int code_length = code.get_length(); int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
if (code.calc_bit(j)) { if (code->calc_bit(j)) {
// calc_bit starts from right most bit, while data in tmat[i] is in the // calc_bit starts from right most bit, while data in tmat[i] is in the
// reverse order. // reverse order.
sm += tmat.data<T>()[i * o_width + j]; sm += tmat.data<T>()[i * o_width + j];
...@@ -75,7 +89,6 @@ template <typename T> ...@@ -75,7 +89,6 @@ template <typename T>
void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat, void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
const framework::Tensor& weight, const framework::Tensor& weight,
const framework::Tensor& input) { const framework::Tensor& input) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat->dims()[0]; size_t num_samples = tmat->dims()[0];
size_t tmat_width = tmat->dims()[1]; size_t tmat_width = tmat->dims()[1];
size_t input_width = input.dims()[1]; size_t input_width = input.dims()[1];
...@@ -84,10 +97,10 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat, ...@@ -84,10 +97,10 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
auto weight_value = weight.data<T>(); auto weight_value = weight.data<T>();
auto input_value = input.data<T>(); auto input_value = input.data<T>();
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i])); auto code = code_table_->get_code(i);
int code_length = code.get_length(); int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j); size_t index = code->calc_index(j);
T sum = static_cast<T>(0.0); T sum = static_cast<T>(0.0);
for (size_t k = 0; k < input_width; ++k) { for (size_t k = 0; k < input_width; ++k) {
sum += weight_value[weight_width * index + k] * sum += weight_value[weight_width * index + k] *
...@@ -102,7 +115,6 @@ template <typename T> ...@@ -102,7 +115,6 @@ template <typename T>
void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
framework::Tensor* weight, framework::Tensor* weight,
const framework::Tensor& input) { const framework::Tensor& input) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat.dims()[0]; size_t num_samples = tmat.dims()[0];
size_t input_width = input.dims()[1]; size_t input_width = input.dims()[1];
size_t tmat_width = tmat.dims()[1]; size_t tmat_width = tmat.dims()[1];
...@@ -111,10 +123,10 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, ...@@ -111,10 +123,10 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
auto weight_value = weight->data<T>(); auto weight_value = weight->data<T>();
auto input_value = input.data<T>(); auto input_value = input.data<T>();
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i])); auto code = code_table_->get_code(i);
int code_length = code.get_length(); int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j); size_t index = code->calc_index(j);
for (size_t k = 0; k < input_width; ++k) { for (size_t k = 0; k < input_width; ++k) {
weight_value[weight_width * index + k] += weight_value[weight_width * index + k] +=
...@@ -124,11 +136,35 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, ...@@ -124,11 +136,35 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
} }
} }
template <typename T>
void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
framework::SelectedRows* weight,
const framework::Tensor& input) {
size_t num_samples = tmat.dims()[0];
size_t input_width = input.dims()[1];
size_t tmat_width = tmat.dims()[1];
size_t weight_width = weight->value().dims()[1];
auto tmat_value = tmat.data<T>();
auto weight_value = weight->mutable_value()->data<T>();
auto input_value = input.data<T>();
for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table_->get_code(i);
int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code->calc_index(j);
for (size_t k = 0; k < input_width; ++k) {
int64_t row_index = weight->GetIndexFromId(static_cast<int64_t>(index));
weight_value[row_index * weight_width + k] +=
tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
}
}
}
}
template <typename T> template <typename T>
void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat, void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
const framework::Tensor& weight, const framework::Tensor& weight,
framework::Tensor* input) { framework::Tensor* input) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat.dims()[0]; size_t num_samples = tmat.dims()[0];
size_t tmat_width = tmat.dims()[1]; size_t tmat_width = tmat.dims()[1];
size_t input_width = input->dims()[1]; size_t input_width = input->dims()[1];
...@@ -138,10 +174,10 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat, ...@@ -138,10 +174,10 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
auto input_value = input->data<T>(); auto input_value = input->data<T>();
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i])); auto code = code_table_->get_code(i);
int code_length = code.get_length(); int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j); size_t index = code->calc_index(j);
for (size_t k = 0; k < input_width; ++k) { for (size_t k = 0; k < input_width; ++k) {
input_value[input_width * i + k] += input_value[input_width * i + k] +=
...@@ -154,14 +190,13 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat, ...@@ -154,14 +190,13 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
template <typename T> template <typename T>
void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) { void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat->dims()[0]; size_t num_samples = tmat->dims()[0];
size_t o_width = tmat->dims()[1]; size_t o_width = tmat->dims()[1];
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i])); auto code = code_table_->get_code(i);
int code_length = code.get_length(); int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
if (code.calc_bit(j)) { if (code->calc_bit(j)) {
tmat->data<T>()[i * o_width + j] -= 1; tmat->data<T>()[i * o_width + j] -= 1;
} }
} }
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -92,9 +94,27 @@ inline int clz(const T& value) { ...@@ -92,9 +94,27 @@ inline int clz(const T& value) {
inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
#endif // !_WIN32 #endif // !_WIN32
// set a code interface to create multiple code
class Code {
public:
virtual ~Code() {}
virtual size_t calc_index(int bit) const = 0;
virtual bool calc_bit(int bit) const = 0;
virtual int get_length() const = 0;
};
// set a CodeTable interface to create multiple code table
class CodeTable {
public:
virtual std::unique_ptr<Code> get_code(int64_t code) const = 0;
virtual size_t size() const = 0;
virtual int get_max_code_length() const = 0;
virtual ~CodeTable() {}
};
struct SimpleCode { class SimpleCode : public Code {
SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {} public:
SimpleCode(size_t code, size_t num_classes, const int64_t* ids)
: c_(static_cast<size_t>(ids[code]) + num_classes) {}
/** /**
* Here the id of root shoud be 1 rather than 0, thus the encoding of class c * Here the id of root shoud be 1 rather than 0, thus the encoding of class c
* is `c + num_classes` and all siblings can get the same weight indice using * is `c + num_classes` and all siblings can get the same weight indice using
...@@ -104,41 +124,121 @@ struct SimpleCode { ...@@ -104,41 +124,121 @@ struct SimpleCode {
* Binary classification path is the suffixes of encoding, thus leave out the * Binary classification path is the suffixes of encoding, thus leave out the
* left most bit in calc_bit. * left most bit in calc_bit.
*/ */
inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; } size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; }
inline bool calc_bit(int bit) const { return c_ & (1 << bit); } bool calc_bit(int bit) const { return c_ & (1 << bit); }
inline int get_length() const { return FindLastSet(c_) - 1; } int get_length() const { return FindLastSet(c_) - 1; }
private: private:
size_t c_; size_t c_;
}; };
struct SimpleCodeTable { template <typename T>
explicit SimpleCodeTable(size_t num_classes) : num_classes_(num_classes) {} class CustomCode : public Code {
SimpleCode operator()(size_t code) const { public:
return SimpleCode(code, num_classes_); CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode,
const int64_t* ids, int index)
: ids_(ids), index_(index) {
ptable_ = ptable.Slice(index, index + 1);
pcode_ = pcode.Slice(index, index + 1);
}
/**
* Here the id of root shoud be 1 rather than 0, thus the encoding of class c
* is `c + num_classes` and all siblings can get the same weight indice using
* prefixes.
* Weight index is the prefixes of encoding, thus leave out the right most
* bit in calc_index.
* Binary classification path is the suffixes of encoding, thus leave out the
* left most bit in calc_bit.
*/
size_t calc_index(int bit) const { return ptable_.data<T>()[bit]; }
bool calc_bit(int bit) const { return pcode_.data<T>()[bit]; }
int get_length() const {
int length = 0;
for (int i = 0; i < static_cast<int>(ptable_.dims()[1]); i++) {
if (ptable_.data<T>()[i] >= 0) {
length++;
} else {
return length;
}
}
return length;
}
private:
framework::Tensor ptable_;
framework::Tensor pcode_;
const int64_t* ids_;
const int index_;
};
class SimpleCodeTable : public CodeTable {
public:
SimpleCodeTable(size_t num_classes, const int64_t* ids)
: num_classes_(num_classes), ids_(ids) {}
std::unique_ptr<Code> get_code(int64_t code) const {
std::unique_ptr<Code> coder(new SimpleCode(code, num_classes_, ids_));
return coder;
} }
size_t size() const { return num_classes_; } size_t size() const { return num_classes_; }
int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }
private: private:
size_t num_classes_; size_t num_classes_;
const int64_t* ids_;
};
template <typename T>
class CustomCodeTable : public CodeTable {
public:
CustomCodeTable(const framework::Tensor& ptable,
const framework::Tensor& pcode, const int64_t* ids)
: ptable_(ptable), pcode_(pcode), ids_(ids) {}
std::unique_ptr<Code> get_code(int64_t code) const {
std::unique_ptr<Code> coder(new CustomCode<T>(ptable_, pcode_, ids_, code));
return coder;
}
size_t size() const { return static_cast<size_t>(ptable_.dims()[1]); }
int get_max_code_length() const {
return static_cast<size_t>(ptable_.dims()[1]);
}
private:
const framework::Tensor& ptable_;
const framework::Tensor& pcode_;
const int64_t* ids_;
}; };
template <typename T> template <typename T>
class MatrixBitCodeFunctor { class MatrixBitCodeFunctor {
public: public:
explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
: num_classes_(num_classes), ids_(ids) {} : num_classes_(num_classes),
ids_(ids),
code_table_(new SimpleCodeTable(num_classes, ids)) {}
MatrixBitCodeFunctor(const framework::Tensor& ptable,
const framework::Tensor& pcode, const int64_t* ids)
: num_classes_(static_cast<size_t>(ptable.dims()[1])),
ids_(ids),
code_table_(new CustomCodeTable<int64_t>(ptable, pcode, ids)) {}
/* For j < code_length /* For j < code_length
tmat(i, j) += vec(0, index(i, j)) tmat(i, j) += vec(0, index(i, j))
*/ */
void Add(framework::Tensor* tmat, const framework::Tensor& vec); void Add(const framework::Tensor& vec, framework::Tensor* tmat);
/* For j < code_length /* For j < code_length
vec(0, index(i, j)) += tmat(i, j) vec(0, index(i, j)) += tmat(i, j)
*/ */
void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec);
/* For selected rows For j < code_length
vec(0, index(i, j)) += tmat(i, j)
*/
void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec);
/* For j < code_length /* For j < code_length
sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
*/ */
...@@ -159,6 +259,12 @@ class MatrixBitCodeFunctor { ...@@ -159,6 +259,12 @@ class MatrixBitCodeFunctor {
*/ */
void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight,
const framework::Tensor& input); const framework::Tensor& input);
/* For SelectedRows Weight, For index(i, j) >= 0:
weight.row(index(i, j)) += tmat(i, j) * input.row(i)
*/
void MulGradWeight(const framework::Tensor& tmat,
framework::SelectedRows* weight,
const framework::Tensor& input);
/* For j < code_length /* For j < code_length
input.row(i) += tmat(i, j) * weight.row(index(i, j)) input.row(i) += tmat(i, j) * weight.row(index(i, j))
*/ */
...@@ -167,6 +273,7 @@ class MatrixBitCodeFunctor { ...@@ -167,6 +273,7 @@ class MatrixBitCodeFunctor {
size_t num_classes_; size_t num_classes_;
const int64_t* ids_; const int64_t* ids_;
std::unique_ptr<CodeTable> code_table_;
}; };
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -60,75 +60,30 @@ float LogUniformSampler::Probability(int64_t value) const { ...@@ -60,75 +60,30 @@ float LogUniformSampler::Probability(int64_t value) const {
return (log((value + 2.0) / (value + 1.0))) / log_range_; return (log((value + 2.0) / (value + 1.0))) / log_range_;
} }
CustomSampler::CustomSampler(int64_t range, const float* probabilities, CustomSampler::CustomSampler(int64_t range, const float *probabilities,
const int *alias, const float *alias_probabilities,
unsigned int seed) unsigned int seed)
: Sampler(range, seed) { : Sampler(range, seed) {
random_engine_ = std::make_shared<std::mt19937_64>(seed_); random_engine_ = std::make_shared<std::mt19937>(seed_);
real_dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1); real_dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
int_dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range); int_dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
alias_probs_ = std::make_shared<std::vector<float>>(range + 1);
alias_ = std::make_shared<std::vector<int64_t>>(range + 1);
probs_ = std::make_shared<std::vector<float>>(range + 1);
std::queue<std::pair<int64_t, float>> bigs;
std::queue<std::pair<int64_t, float>> littles;
for (int64_t i = 0; i <= range; ++i) {
(*probs_)[i] = probabilities[i];
float normal_prob = probabilities[i] * (range + 1);
if (normal_prob - 1.0 > 1e-4) {
bigs.emplace(i, normal_prob);
} else if (1.0 - normal_prob > 1e-4) {
littles.emplace(i, normal_prob);
} else {
(*alias_probs_)[i] = normal_prob;
(*alias_)[i] = -1;
}
}
while ((!littles.empty()) && (!bigs.empty())) {
auto big = bigs.front();
auto little = littles.front();
bigs.pop();
littles.pop();
(*alias_probs_)[little.first] = little.second;
(*alias_)[little.first] = big.first;
auto big_left = big.second - (1 - little.second);
if (big_left - 1.0 > 1e-4) {
bigs.emplace(big.first, big_left);
} else if (1.0 - big_left > 1e-4) {
littles.emplace(big.first, big_left);
} else {
(*alias_probs_)[big.first] = big_left;
(*alias_)[big.first] = -1;
}
}
if (!littles.empty()) { // littles.second is close to 1.0
auto little = littles.front();
(*alias_probs_)[little.first] = 1.0;
(*alias_)[little.first] = -1;
}
if (!bigs.empty()) { // bigs.second is close to 1.0 alias_probs_ = alias_probabilities;
auto big = bigs.front(); probs_ = probabilities;
(*alias_probs_)[big.first] = 1.0; alias_ = alias;
(*alias_)[big.first] = -1;
}
} }
int64_t CustomSampler::Sample() const { int64_t CustomSampler::Sample() const {
auto index = (*int_dist_)(*random_engine_); auto index = (*int_dist_)(*random_engine_);
auto p = (*real_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_);
if (p > (*alias_probs_)[index]) { if (p > alias_probs_[index]) {
return (*alias_)[index]; return alias_[index];
} else { } else {
return index; return index;
} }
} }
float CustomSampler::Probability(int64_t value) const { float CustomSampler::Probability(int64_t value) const { return probs_[value]; }
return (*probs_)[value];
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cstdint> #include <cstdint>
#include <memory> #include <memory>
#include <random> #include <random>
...@@ -38,9 +39,12 @@ class Sampler { ...@@ -38,9 +39,12 @@ class Sampler {
seed_ = seed; seed_ = seed;
} }
} }
virtual ~Sampler(); virtual ~Sampler();
// Sample a single value // Sample a single value
virtual int64_t Sample() const = 0; virtual int64_t Sample() const = 0;
// The probability that a single call to Sample() returns the given value. // The probability that a single call to Sample() returns the given value.
virtual float Probability(int64_t value) const = 0; virtual float Probability(int64_t value) const = 0;
...@@ -99,6 +103,7 @@ class LogUniformSampler : public Sampler { ...@@ -99,6 +103,7 @@ class LogUniformSampler : public Sampler {
class CustomSampler : public Sampler { class CustomSampler : public Sampler {
public: public:
explicit CustomSampler(int64_t range, const float* probabilities, explicit CustomSampler(int64_t range, const float* probabilities,
const int* alias, const float* alias_probabilities,
unsigned int seed = 0UL); unsigned int seed = 0UL);
~CustomSampler() override {} ~CustomSampler() override {}
...@@ -108,10 +113,10 @@ class CustomSampler : public Sampler { ...@@ -108,10 +113,10 @@ class CustomSampler : public Sampler {
float Probability(int64_t value) const override; float Probability(int64_t value) const override;
private: private:
std::shared_ptr<std::vector<float>> alias_probs_; const float* alias_probs_;
std::shared_ptr<std::vector<int64_t>> alias_; const int* alias_;
std::shared_ptr<std::vector<float>> probs_; const float* probs_;
std::shared_ptr<std::mt19937_64> random_engine_; std::shared_ptr<std::mt19937> random_engine_;
std::shared_ptr<std::uniform_real_distribution<>> real_dist_; std::shared_ptr<std::uniform_real_distribution<>> real_dist_;
std::shared_ptr<std::uniform_int_distribution<>> int_dist_; std::shared_ptr<std::uniform_int_distribution<>> int_dist_;
}; };
......
...@@ -16,13 +16,12 @@ limitations under the License. */ ...@@ -16,13 +16,12 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/operators/math/sequence_pooling.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
#define FLT_MAX __FLT_MAX__
template <typename T> template <typename T>
struct MaxPoolFunctor { struct MaxPoolFunctor {
HOSTDEVICE void operator()(const T* input, const size_t start, HOSTDEVICE void operator()(const T* input, const size_t start,
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/nce_op.h" #include "paddle/fluid/operators/nce_op.h"
#include <string>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
...@@ -25,7 +26,7 @@ class NCEOp : public framework::OperatorWithKernel { ...@@ -25,7 +26,7 @@ class NCEOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input")); PADDLE_ENFORCE(ctx->HasInput("Input"));
PADDLE_ENFORCE(ctx->HasInput("Label")); PADDLE_ENFORCE(ctx->HasInput("Label"));
PADDLE_ENFORCE(ctx->HasInput("Weight")); PADDLE_ENFORCE(ctx->HasInput("Weight"));
...@@ -67,7 +68,7 @@ class NCEOp : public framework::OperatorWithKernel { ...@@ -67,7 +68,7 @@ class NCEOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
platform::CPUPlace()); platform::CPUPlace());
...@@ -101,11 +102,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -101,11 +102,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
.AsDispensable(); .AsDispensable();
AddInput( AddInput(
"CustomDistribution", "CustomDistProbs",
"(Tensor) It is used in 'CostumDist' sampler. " "(Tensor) It is used in 'CostumDist' sampler. "
"It is a tensor with shape [num_total_classes]." "It is a tensor with shape [num_total_classes]."
"The i-th element is the probsbility of the i-th class being sampled.") "The i-th element is the probsbility of the i-th class being sampled.")
.AsDispensable(); .AsDispensable();
AddInput(
"CustomDistAlias",
"(Tensor) It is used in 'CostumDist' sampler. "
"It is a tensor with shape [num_total_classes]."
"The i-th element is the probsbility of the i-th class being sampled.")
.AsDispensable();
AddInput(
"CustomDistAliasProbs",
"(Tensor) It is used in 'CostumDist' sampler. "
"It is a tensor with shape [num_total_classes]."
"The i-th element is the probsbility of the i-th class being sampled.")
.AsDispensable();
AddOutput("Cost", AddOutput("Cost",
"(Tensor) A tensor of shape [batch_size, 1]. Cost of samples."); "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples.");
AddOutput("SampleLogits", AddOutput("SampleLogits",
...@@ -124,21 +138,22 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -124,21 +138,22 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
"kernel to compute grads." "kernel to compute grads."
"") "")
.AsIntermediate(); .AsIntermediate();
AddAttr<int>("num_total_classes", AddAttr<int>("num_total_classes",
"Total number of classes in all samples."); "Total number of classes in all samples.");
AddAttr<int>("num_neg_samples", AddAttr<int>("num_neg_samples",
"The number of negative classes. The default value is 10.") "The number of negative classes. The default value is 10.")
.SetDefault(10); .SetDefault(10);
AddAttr<int>("sampler", AddAttr<int>("sampler",
"(int) Which sampler to be used to sample negative class." "(int) Which sampler to be used to sample negative class."
"0: Uniform; 1: LogUniform; 2: CostumDist.") "0: Uniform; 1: LogUniform; 2: CostumDist.")
.SetDefault(0); .SetDefault(0);
AddAttr<int>("seed", AddAttr<int>("seed",
"(int) The seed used in sampler. If it is 0, " "(int) The seed used in sampler. If it is 0, "
"the sampler will generate a seed randomly.") "the sampler will generate a seed randomly.")
.SetDefault(0); .SetDefault(0);
AddAttr<bool>("is_sparse", "(boolean, default false) Sparse update.")
.SetDefault(false);
AddAttr<std::vector<int>>("custom_neg_classes", AddAttr<std::vector<int>>("custom_neg_classes",
"This attribute only be used in unitest. Classes " "This attribute only be used in unitest. Classes "
...@@ -156,11 +171,19 @@ By default this operator uses a uniform distribution for sampling. ...@@ -156,11 +171,19 @@ By default this operator uses a uniform distribution for sampling.
} }
}; };
class NCEOpGradDescMaker : public framework::DefaultGradOpDescMaker<true> {
using ::paddle::framework::DefaultGradOpDescMaker<
true>::DefaultGradOpDescMaker;
protected:
virtual std::string GradOpType() const { return "nce_grad"; }
};
class NCEOpGrad : public framework::OperatorWithKernel { class NCEOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input")); PADDLE_ENFORCE(ctx->HasInput("Input"));
PADDLE_ENFORCE(ctx->HasInput("Weight")); PADDLE_ENFORCE(ctx->HasInput("Weight"));
PADDLE_ENFORCE(ctx->HasInput("Cost")); PADDLE_ENFORCE(ctx->HasInput("Cost"));
...@@ -190,20 +213,45 @@ class NCEOpGrad : public framework::OperatorWithKernel { ...@@ -190,20 +213,45 @@ class NCEOpGrad : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
platform::CPUPlace()); platform::CPUPlace());
} }
}; };
class NCEOpGradVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front();
auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front();
auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr);
if (is_sparse) {
VLOG(30) << "nce_op_grad op " << weight_grad << " and " << bias_grad
<< " is set to SelectedRows";
block->Var(weight_grad)
->SetType(framework::proto::VarType::SELECTED_ROWS);
block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS);
} else {
VLOG(30) << "nce_op_grad op " << weight_grad << " and " << bias_grad
<< " is set to LoDTensor";
block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR);
block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR);
}
block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType());
block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType());
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker, REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpGradDescMaker, ops::NCEOpMaker);
paddle::framework::DefaultGradOpDescMaker<true>); REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad, ops::NCEOpGradVarTypeInference);
REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad);
REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>, REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
ops::NCEKernel<paddle::platform::CPUPlace, double>); ops::NCEKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(nce_grad, REGISTER_OP_CPU_KERNEL(nce_grad,
......
...@@ -16,26 +16,32 @@ limitations under the License. */ ...@@ -16,26 +16,32 @@ limitations under the License. */
#include <math.h> #include <math.h>
#include <random> #include <random>
#include <set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/operators/math/sampler.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using Sampler = math::Sampler; using Sampler = math::Sampler;
using DDim = framework::DDim;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void PrepareSamples(const framework::ExecutionContext& context, void PrepareSamples(const framework::ExecutionContext &context,
Sampler* sampler) { Sampler *sampler) {
auto label = context.Input<Tensor>("Label"); auto label = context.Input<Tensor>("Label");
const int64_t* label_data = label->data<int64_t>(); const int64_t *label_data = label->data<int64_t>();
auto label_dims = label->dims(); auto label_dims = label->dims();
// int num_total_classes = context.Attr<int>("num_total_classes"); // int num_total_classes = context.Attr<int>("num_total_classes");
// for unitest // for unitest
...@@ -44,7 +50,7 @@ void PrepareSamples(const framework::ExecutionContext& context, ...@@ -44,7 +50,7 @@ void PrepareSamples(const framework::ExecutionContext& context,
auto sample_labels = context.Output<Tensor>("SampleLabels"); auto sample_labels = context.Output<Tensor>("SampleLabels");
auto sample_labels_dims = sample_labels->dims(); auto sample_labels_dims = sample_labels->dims();
int64_t* sample_labels_data = int64_t *sample_labels_data =
sample_labels->mutable_data<int64_t>(context.GetPlace()); sample_labels->mutable_data<int64_t>(context.GetPlace());
int num_label = label_dims.size() == 2 ? label_dims[1] : 1; int num_label = label_dims.size() == 2 ? label_dims[1] : 1;
...@@ -70,13 +76,13 @@ void PrepareSamples(const framework::ExecutionContext& context, ...@@ -70,13 +76,13 @@ void PrepareSamples(const framework::ExecutionContext& context,
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class NCEKernel : public framework::OpKernel<T> { class NCEKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext &context) const override {
int sampler_type = context.Attr<int>("sampler"); int sampler_type = context.Attr<int>("sampler");
int seed = context.Attr<int>("seed"); int seed = context.Attr<int>("seed");
int num_total_classes = context.Attr<int>("num_total_classes"); int num_total_classes = context.Attr<int>("num_total_classes");
int num_neg_samples = context.Attr<int>("num_neg_samples"); int num_neg_samples = context.Attr<int>("num_neg_samples");
Sampler* sampler; Sampler *sampler;
switch (sampler_type) { switch (sampler_type) {
case 0: { case 0: {
sampler = new math::UniformSampler(num_total_classes - 1, seed); sampler = new math::UniformSampler(num_total_classes - 1, seed);
...@@ -87,11 +93,19 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -87,11 +93,19 @@ class NCEKernel : public framework::OpKernel<T> {
break; break;
} }
case 2: { case 2: {
auto custom_dist = context.Input<Tensor>("CustomDistribution"); auto dist_probs = context.Input<Tensor>("CustomDistProbs");
const float* custom_dist_data = custom_dist->data<float>(); auto dist_alias = context.Input<Tensor>("CustomDistAlias");
PADDLE_ENFORCE_EQ(custom_dist->numel(), num_total_classes); auto dist_alias_probs = context.Input<Tensor>("CustomDistAliasProbs");
sampler = new math::CustomSampler(num_total_classes - 1,
custom_dist_data, seed); PADDLE_ENFORCE_EQ(dist_probs->numel(), num_total_classes);
PADDLE_ENFORCE_EQ(dist_alias->numel(), num_total_classes);
PADDLE_ENFORCE_EQ(dist_alias_probs->numel(), num_total_classes);
const float *probs_data = dist_probs->data<float>();
const int *alias_data = dist_alias->data<int>();
const float *alias_probs_data = dist_alias_probs->data<float>();
sampler = new math::CustomSampler(num_total_classes - 1, probs_data,
alias_data, alias_probs_data, seed);
break; break;
} }
default: { PADDLE_THROW("Unsupported SamplerType."); } default: { PADDLE_THROW("Unsupported SamplerType."); }
...@@ -99,17 +113,17 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -99,17 +113,17 @@ class NCEKernel : public framework::OpKernel<T> {
PrepareSamples<DeviceContext, T>(context, sampler); PrepareSamples<DeviceContext, T>(context, sampler);
auto sample_labels = context.Output<Tensor>("SampleLabels"); auto sample_labels = context.Output<Tensor>("SampleLabels");
const int64_t* sample_labels_data = sample_labels->data<int64_t>(); const int64_t *sample_labels_data = sample_labels->data<int64_t>();
auto sample_out = context.Output<Tensor>("SampleLogits"); auto sample_out = context.Output<Tensor>("SampleLogits");
T* sample_out_data = sample_out->mutable_data<T>(context.GetPlace()); T *sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
auto label = context.Input<Tensor>("Label"); auto label = context.Input<Tensor>("Label");
auto sample_weight = context.Input<Tensor>("SampleWeight"); auto sample_weight = context.Input<Tensor>("SampleWeight");
const T* sample_weight_data = nullptr; const T *sample_weight_data = nullptr;
if (sample_weight != nullptr) { if (sample_weight != nullptr) {
sample_weight_data = sample_weight->data<T>(); sample_weight_data = sample_weight->data<T>();
} }
auto out = context.Output<Tensor>("Cost"); auto out = context.Output<Tensor>("Cost");
T* out_data = out->mutable_data<T>(context.GetPlace()); T *out_data = out->mutable_data<T>(context.GetPlace());
int64_t num_true_class = 1; int64_t num_true_class = 1;
if (label != nullptr) { if (label != nullptr) {
num_true_class = label->dims()[1]; num_true_class = label->dims()[1];
...@@ -119,7 +133,7 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -119,7 +133,7 @@ class NCEKernel : public framework::OpKernel<T> {
// forward bias // forward bias
auto bias = context.Input<Tensor>("Bias"); auto bias = context.Input<Tensor>("Bias");
if (bias != nullptr) { if (bias != nullptr) {
const T* bias_data = bias->data<T>(); const T *bias_data = bias->data<T>();
for (int64_t i = 0; i < sample_labels->numel(); ++i) { for (int64_t i = 0; i < sample_labels->numel(); ++i) {
sample_out_data[i] = bias_data[sample_labels_data[i]]; sample_out_data[i] = bias_data[sample_labels_data[i]];
} }
...@@ -158,16 +172,16 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -158,16 +172,16 @@ class NCEKernel : public framework::OpKernel<T> {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class NCEGradKernel : public framework::OpKernel<T> { class NCEGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext &context) const override {
auto d_out = context.Input<Tensor>(framework::GradVarName("Cost")); auto d_out = context.Input<Tensor>(framework::GradVarName("Cost"));
const T* d_out_data = d_out->data<T>(); const T *d_out_data = d_out->data<T>();
auto label = context.Input<Tensor>("Label"); auto label = context.Input<Tensor>("Label");
auto sample_out = context.Input<Tensor>("SampleLogits"); auto sample_out = context.Input<Tensor>("SampleLogits");
const T* sample_out_data = sample_out->data<T>(); const T *sample_out_data = sample_out->data<T>();
auto sample_labels = context.Input<Tensor>("SampleLabels"); auto sample_labels = context.Input<Tensor>("SampleLabels");
const int64_t* sample_labels_data = sample_labels->data<int64_t>(); const int64_t *sample_labels_data = sample_labels->data<int64_t>();
auto sample_weight = context.Input<Tensor>("SampleWeight"); auto sample_weight = context.Input<Tensor>("SampleWeight");
const T* sample_weight_data = nullptr; const T *sample_weight_data = nullptr;
if (sample_weight != nullptr) { if (sample_weight != nullptr) {
sample_weight_data = sample_weight->data<T>(); sample_weight_data = sample_weight->data<T>();
} }
...@@ -180,7 +194,7 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -180,7 +194,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
int sampler_type = context.Attr<int>("sampler"); int sampler_type = context.Attr<int>("sampler");
int seed = context.Attr<int>("seed"); int seed = context.Attr<int>("seed");
Sampler* sampler; Sampler *sampler;
switch (sampler_type) { switch (sampler_type) {
case 0: { case 0: {
sampler = new math::UniformSampler(num_total_classes - 1, seed); sampler = new math::UniformSampler(num_total_classes - 1, seed);
...@@ -191,11 +205,19 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -191,11 +205,19 @@ class NCEGradKernel : public framework::OpKernel<T> {
break; break;
} }
case 2: { case 2: {
auto custom_dist = context.Input<Tensor>("CustomDistribution"); auto dist_probs = context.Input<Tensor>("CustomDistProbs");
const float* custom_dist_data = custom_dist->data<float>(); auto dist_alias = context.Input<Tensor>("CustomDistAlias");
PADDLE_ENFORCE_EQ(custom_dist->numel(), num_total_classes); auto dist_alias_probs = context.Input<Tensor>("CustomDistAliasProbs");
sampler = new math::CustomSampler(num_total_classes - 1,
custom_dist_data, seed); PADDLE_ENFORCE_EQ(dist_probs->numel(), num_total_classes);
PADDLE_ENFORCE_EQ(dist_alias->numel(), num_total_classes);
PADDLE_ENFORCE_EQ(dist_alias_probs->numel(), num_total_classes);
const float *probs_data = dist_probs->data<float>();
const int *alias_data = dist_alias->data<int>();
const float *alias_probs_data = dist_alias_probs->data<float>();
sampler = new math::CustomSampler(num_total_classes - 1, probs_data,
alias_data, alias_probs_data, seed);
break; break;
} }
default: { PADDLE_THROW("Unsupported SamplerType."); } default: { PADDLE_THROW("Unsupported SamplerType."); }
...@@ -203,7 +225,7 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -203,7 +225,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
// T b = 1. / num_total_classes * num_neg_samples; // T b = 1. / num_total_classes * num_neg_samples;
Tensor sample_grad; // tmp tensor Tensor sample_grad; // tmp tensor
T* sample_grad_data = T *sample_grad_data =
sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace()); sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace());
// backward cost // backward cost
for (int64_t i = 0; i < sample_labels->numel(); ++i) { for (int64_t i = 0; i < sample_labels->numel(); ++i) {
...@@ -217,10 +239,14 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -217,10 +239,14 @@ class NCEGradKernel : public framework::OpKernel<T> {
: w * (o * (1 - o) / (o + b)); : w * (o * (1 - o) / (o + b));
sample_grad_data[i] *= d_out_data[sample_idx]; sample_grad_data[i] *= d_out_data[sample_idx];
} }
bool is_sparse = context.Attr<bool>("is_sparse");
if (!is_sparse) {
// get d_bias // get d_bias
auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias")); auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
if (d_bias != nullptr) { if (d_bias != nullptr) {
T* d_bias_data = d_bias->mutable_data<T>(context.GetPlace()); T *d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
for (int64_t i = 0; i < sample_labels->numel(); ++i) { for (int64_t i = 0; i < sample_labels->numel(); ++i) {
d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
...@@ -239,10 +265,79 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -239,10 +265,79 @@ class NCEGradKernel : public framework::OpKernel<T> {
sample_grad_data[i]; sample_grad_data[i];
} }
} }
} else {
std::vector<int64_t> labels;
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
labels.push_back(sample_labels_data[i]);
}
std::set<T> st(labels.begin(), labels.end());
labels.assign(st.begin(), st.end());
auto *bias_var = context.InputVar("Bias");
DDim bias_dim;
if (bias_var->IsType<LoDTensor>()) {
bias_dim = context.Input<LoDTensor>("Bias")->dims();
} else if (bias_var->IsType<SelectedRows>()) {
auto *table_t = context.Input<SelectedRows>("Bias");
bias_dim = table_t->value().dims();
} else {
PADDLE_THROW(
"The parameter Bias of a NCE_OP "
"must be either LoDTensor or SelectedRows");
}
auto d_bias =
context.Output<SelectedRows>(framework::GradVarName("Bias"));
d_bias->set_rows(labels);
d_bias->set_height(bias_dim[0]);
d_bias->mutable_value()->Resize(
{static_cast<int64_t>(labels.size()), bias_dim[1]});
T *d_bias_data =
d_bias->mutable_value()->mutable_data<T>(context.GetPlace());
std::fill(d_bias_data, d_bias_data + labels.size(), 0.0);
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
d_bias_data[d_bias->Index(sample_labels_data[i])] +=
sample_grad_data[i];
}
auto *table_var = context.InputVar("Weight");
DDim table_dim;
if (table_var->IsType<LoDTensor>()) {
table_dim = context.Input<LoDTensor>("Weight")->dims();
} else if (table_var->IsType<SelectedRows>()) {
auto *table_t = context.Input<SelectedRows>("Weight");
table_dim = table_t->value().dims();
} else {
PADDLE_THROW(
"The parameter Weight of a NCE_OP "
"must be either LoDTensor or SelectedRows");
}
auto d_w = context.Output<SelectedRows>(framework::GradVarName("Weight"));
d_w->set_rows(labels);
d_w->set_height(table_dim[0]);
auto *d_table_value = d_w->mutable_value();
d_table_value->Resize(
{static_cast<int64_t>(labels.size()), table_dim[1]});
auto d_w_data = d_table_value->mutable_data<T>(context.GetPlace());
std::fill(d_w_data, d_w_data + d_table_value->numel(), 0.0);
auto d_w_matrix = EigenMatrix<T>::From(*d_table_value);
auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
d_w_matrix.chip(d_w->Index(sample_labels_data[i]), 0) +=
x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
sample_grad_data[i];
}
}
// get d_x // get d_x
auto d_x = context.Output<Tensor>(framework::GradVarName("Input")); auto d_x = context.Output<Tensor>(framework::GradVarName("Input"));
if (d_x != nullptr) { if (d_x != nullptr) {
auto* d_x_data = d_x->mutable_data<T>(context.GetPlace()); auto *d_x_data = d_x->mutable_data<T>(context.GetPlace());
std::fill(d_x_data, d_x_data + d_x->numel(), 0.0); std::fill(d_x_data, d_x_data + d_x->numel(), 0.0);
auto d_x_matrix = EigenMatrix<T>::From(*d_x); auto d_x_matrix = EigenMatrix<T>::From(*d_x);
auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight"))); auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
...@@ -251,6 +346,7 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -251,6 +346,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
} }
} }
delete sampler; delete sampler;
} }
}; };
......
...@@ -28,6 +28,12 @@ reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) ...@@ -28,6 +28,12 @@ reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
reader_library(create_py_reader_op SRCS create_py_reader_op.cc) reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
if (NOT WIN32 AND NOT ON_INFER)
cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib)
cc_test(ctr_reader_test SRCS ctr_reader_test.cc DEPS ctr_reader)
reader_library(create_ctr_reader_op SRCS create_ctr_reader_op.cc DEPS ctr_reader)
endif ()
cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
# Export local libraries to parent # Export local libraries to parent
# set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) # set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/ctr_reader.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class CreateCTRReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) return;
const std::string& queue_name = Input("blocking_queue");
auto* queue_holder_var = scope.FindVar(queue_name);
PADDLE_ENFORCE_NOT_NULL(
queue_holder_var,
"No LoDTensorBlockingQueueHolder variable with name %s found",
queue_name);
auto* queue_holder =
queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();
int thread_num = Attr<int>("thread_num");
std::vector<std::string> slots = Attr<std::vector<std::string>>("slots");
int batch_size = Attr<int>("batch_size");
std::vector<std::string> file_list =
Attr<std::vector<std::string>>("file_list");
out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(), batch_size,
thread_num, slots, file_list));
}
};
class CreateCTRReaderOpMaker : public FileReaderMakerBase {
protected:
void Apply() override {
AddInput("blocking_queue",
"Name of the `LoDTensorBlockingQueueHolder` variable");
AddAttr<int>("thread_num", "the thread num to read data");
AddAttr<int>("batch_size", "the batch size of read data");
AddAttr<std::vector<std::string>>("file_list",
"The list of files that need to read");
AddAttr<std::vector<std::string>>(
"slots", "the slots that should be extract from file");
AddComment(R"DOC(
Create CTRReader to support read ctr data with cpp.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace reader = ::paddle::operators::reader;
REGISTER_FILE_READER_OPERATOR(create_ctr_reader, reader::CreateCTRReaderOp,
reader::CreateCTRReaderOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/ctr_reader.h"
#include <gzstream.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <algorithm>
#include <random>
namespace paddle {
namespace operators {
namespace reader {
static inline void string_split(const std::string& s, const char delimiter,
std::vector<std::string>* output) {
size_t start = 0;
size_t end = s.find_first_of(delimiter);
while (end <= std::string::npos) {
output->emplace_back(s.substr(start, end - start));
if (end == std::string::npos) {
break;
}
start = end + 1;
end = s.find_first_of(delimiter, start);
}
}
static inline void parse_line(
const std::string& line,
const std::unordered_map<std::string, size_t>& slot_to_index,
int64_t* label,
std::unordered_map<std::string, std::vector<int64_t>>* slot_to_data) {
std::vector<std::string> ret;
string_split(line, ' ', &ret);
*label = std::stoi(ret[2]) > 0;
for (size_t i = 3; i < ret.size(); ++i) {
const std::string& item = ret[i];
std::vector<std::string> feasign_and_slot;
string_split(item, ':', &feasign_and_slot);
if (feasign_and_slot.size() == 2 &&
slot_to_index.find(feasign_and_slot[1]) != slot_to_index.end()) {
int64_t feasign = std::strtoll(feasign_and_slot[0].c_str(), NULL, 10);
(*slot_to_data)[feasign_and_slot[1]].push_back(feasign);
}
}
// NOTE:: if the slot has no value, then fill [0] as it's data.
for (auto& item : slot_to_index) {
if (slot_to_data->find(item.first) == slot_to_data->end()) {
(*slot_to_data)[item.first].push_back(0);
}
}
}
class Reader {
public:
virtual ~Reader() {}
virtual bool HasNext() = 0;
virtual void NextLine(std::string* line) = 0;
};
class GzipReader : public Reader {
public:
explicit GzipReader(const std::string& file_name)
: gzstream_(file_name.c_str()) {}
~GzipReader() {}
bool HasNext() override { return gzstream_.peek() != EOF; }
void NextLine(std::string* line) override { std::getline(gzstream_, *line); }
private:
igzstream gzstream_;
};
class MultiGzipReader : public Reader {
public:
explicit MultiGzipReader(const std::vector<std::string>& file_list) {
for (auto& file : file_list) {
readers_.emplace_back(std::make_shared<GzipReader>(file));
}
}
bool HasNext() override {
if (current_reader_index_ >= readers_.size()) {
return false;
}
if (!readers_[current_reader_index_]->HasNext()) {
current_reader_index_++;
return HasNext();
}
return true;
}
void NextLine(std::string* line) override {
readers_[current_reader_index_]->NextLine(line);
}
private:
std::vector<std::shared_ptr<GzipReader>> readers_;
size_t current_reader_index_ = 0;
};
void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
std::shared_ptr<LoDTensorBlockingQueue> queue) {
VLOG(30) << "monitor thread in";
bool reader_thread_is_running = true;
while (reader_thread_is_running) {
VLOG(30) << "reader_thread_is_running";
reader_thread_is_running = false;
for (size_t i = 0; i < (*thread_status).size(); ++i) {
if ((*thread_status)[i] == Running) {
VLOG(30) << "reader is running!";
reader_thread_is_running = true;
}
}
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
}
VLOG(30) << "all reader thread is stopped, push empty data into queue";
queue->Push({});
VLOG(30) << "monitor thread exited";
}
void ReadThread(const std::vector<std::string>& file_list,
const std::vector<std::string>& slots, int batch_size,
int thread_id, std::vector<ReaderThreadStatus>* thread_status,
std::shared_ptr<LoDTensorBlockingQueue> queue) {
VLOG(30) << "[" << thread_id << "]"
<< " reader thread start! thread_id = " << thread_id;
for (auto& file : file_list) {
VLOG(30) << "[" << thread_id << "]"
<< " file " << file;
}
(*thread_status)[thread_id] = Running;
VLOG(30) << "set status to running";
std::unordered_map<std::string, size_t> slot_to_index;
for (size_t i = 0; i < slots.size(); ++i) {
slot_to_index[slots[i]] = i;
}
std::string line;
std::vector<std::unordered_map<std::string, std::vector<int64_t>>> batch_data;
std::vector<int64_t> batch_label;
MultiGzipReader reader(file_list);
VLOG(30) << "reader inited";
while (reader.HasNext()) {
batch_data.clear();
batch_data.reserve(batch_size);
batch_label.clear();
batch_label.reserve(batch_size);
// read batch_size data
for (int i = 0; i < batch_size; ++i) {
if (reader.HasNext()) {
reader.NextLine(&line);
std::unordered_map<std::string, std::vector<int64_t>> slot_to_data;
int64_t label;
parse_line(line, slot_to_index, &label, &slot_to_data);
batch_data.push_back(slot_to_data);
batch_label.push_back(label);
} else {
break;
}
}
std::vector<framework::LoDTensor> lod_datas;
// first insert tensor for each slots
for (auto& slot : slots) {
std::vector<size_t> lod_data{0};
std::vector<int64_t> batch_feasign;
for (size_t i = 0; i < batch_data.size(); ++i) {
auto& feasign = batch_data[i][slot];
lod_data.push_back(lod_data.back() + feasign.size());
batch_feasign.insert(batch_feasign.end(), feasign.begin(),
feasign.end());
}
framework::LoDTensor lod_tensor;
framework::LoD lod{lod_data};
lod_tensor.set_lod(lod);
int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
framework::make_ddim({1, static_cast<int64_t>(batch_feasign.size())}),
platform::CPUPlace());
memcpy(tensor_data, batch_feasign.data(),
batch_feasign.size() * sizeof(int64_t));
lod_datas.push_back(lod_tensor);
}
// insert label tensor
framework::LoDTensor label_tensor;
auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
framework::make_ddim({1, static_cast<int64_t>(batch_label.size())}),
platform::CPUPlace());
memcpy(label_tensor_data, batch_label.data(),
batch_label.size() * sizeof(int64_t));
lod_datas.push_back(label_tensor);
queue->Push(lod_datas);
VLOG(40) << "push one data, queue_size=" << queue->Size();
}
(*thread_status)[thread_id] = Stopped;
VLOG(30) << "set status to stopped, thread " << thread_id << " exited";
}
} // namespace reader
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sys/time.h>
#include <chrono> // NOLINT
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
namespace paddle {
namespace operators {
namespace reader {
enum ReaderThreadStatus { Running, Stopped };
void ReadThread(const std::vector<std::string>& file_list,
const std::vector<std::string>& slots, int batch_size,
int thread_id, std::vector<ReaderThreadStatus>* thread_status,
std::shared_ptr<LoDTensorBlockingQueue> queue);
// monitor all running thread, if they are all stopped,
// then push an empty data into LoDTensorBlockingQueue
void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
std::shared_ptr<LoDTensorBlockingQueue> queue);
class CTRReader : public framework::FileReader {
public:
explicit CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue,
int batch_size, int thread_num,
const std::vector<std::string>& slots,
const std::vector<std::string>& file_list)
: batch_size_(batch_size), slots_(slots), file_list_(file_list) {
PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!");
PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty");
thread_num_ =
file_list_.size() > thread_num ? thread_num : file_list_.size();
queue_ = queue;
SplitFiles();
for (size_t i = 0; i < thread_num_; ++i) {
read_thread_status_.push_back(Stopped);
}
}
~CTRReader() {}
void ReadNext(std::vector<framework::LoDTensor>* out) override {
bool success;
*out = queue_->Pop(&success);
if (!success) out->clear();
}
void Shutdown() override {
VLOG(3) << "Shutdown reader";
if (status_ == ReaderStatus::kStopped) {
return;
}
// shutdown should stop all the reader thread
for (auto& read_thread : read_threads_) {
read_thread->join();
}
monitor_thread_->join();
read_threads_.clear();
monitor_thread_.reset(nullptr);
queue_->Close();
status_ = ReaderStatus::kStopped;
}
void Start() override {
VLOG(3) << "Start reader";
PADDLE_ENFORCE_EQ(read_threads_.size(), 0, "read thread should be empty!");
queue_->ReOpen();
VLOG(3) << "reopen success";
VLOG(3) << "thread_num " << thread_num_;
for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
read_threads_.emplace_back(new std::thread(
std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_,
thread_id, &read_thread_status_, queue_)));
}
monitor_thread_.reset(new std::thread(
std::bind(&MonitorThread, &read_thread_status_, queue_)));
status_ = ReaderStatus::kRunning;
}
private:
void SplitFiles() {
file_groups_.resize(thread_num_);
for (size_t i = 0; i < file_list_.size(); ++i) {
auto& file_name = file_list_[i];
std::ifstream f(file_name.c_str());
PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name);
file_groups_[i % thread_num_].push_back(file_name);
}
}
private:
size_t thread_num_;
const int batch_size_;
const std::vector<std::string> slots_;
const std::vector<std::string> file_list_;
std::shared_ptr<LoDTensorBlockingQueue> queue_;
std::vector<std::unique_ptr<std::thread>> read_threads_;
std::unique_ptr<std::thread> monitor_thread_;
std::vector<ReaderThreadStatus> read_thread_status_;
std::vector<std::vector<std::string>> file_groups_;
};
} // namespace reader
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/ctr_reader.h"
#include <gzstream.h>
#include <time.h>
#include <math.h>
#include <stdio.h>
#include <cstring>
#include <fstream>
#include <tuple>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
using paddle::operators::reader::LoDTensorBlockingQueue;
using paddle::operators::reader::LoDTensorBlockingQueueHolder;
using paddle::operators::reader::CTRReader;
using paddle::framework::LoDTensor;
using paddle::framework::LoD;
using paddle::framework::DDim;
using paddle::platform::CPUPlace;
using paddle::framework::make_ddim;
static void generatedata(const std::vector<std::string>& data,
const std::string& file_name) {
std::ifstream in(file_name.c_str());
if (in.good()) {
VLOG(3) << "file " << file_name << " exist, delete it first!";
remove(file_name.c_str());
} else {
in.close();
}
ogzstream out(file_name.c_str());
PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name);
for (auto& c : data) {
out << c;
}
out.close();
PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name);
}
static inline void check_all_data(
const std::vector<std::string>& ctr_data,
const std::vector<std::string>& slots, const std::vector<DDim>& label_dims,
const std::vector<int64_t>& label_value,
const std::vector<std::tuple<LoD, std::vector<int64_t>>>& data_slot_6002,
const std::vector<std::tuple<LoD, std::vector<int64_t>>>& data_slot_6003,
size_t batch_num, size_t batch_size,
std::shared_ptr<LoDTensorBlockingQueue> queue, CTRReader* reader) {
std::vector<LoDTensor> out;
for (size_t i = 0; i < batch_num; ++i) {
reader->ReadNext(&out);
ASSERT_EQ(out.size(), slots.size() + 1);
auto& label_tensor = out.back();
ASSERT_EQ(label_tensor.dims(), label_dims[i]);
for (size_t j = 0; j < batch_size && i * batch_num + j < ctr_data.size();
++j) {
auto& label = label_tensor.data<int64_t>()[j];
ASSERT_TRUE(label == 0 || label == 1);
ASSERT_EQ(label, label_value[i * batch_size + j]);
}
auto& tensor_6002 = out[0];
ASSERT_EQ(std::get<0>(data_slot_6002[i]), tensor_6002.lod());
ASSERT_EQ(std::memcmp(std::get<1>(data_slot_6002[i]).data(),
tensor_6002.data<int64_t>(),
tensor_6002.dims()[1] * sizeof(int64_t)),
0);
}
reader->ReadNext(&out);
ASSERT_EQ(out.size(), 0);
ASSERT_EQ(queue->Size(), 0);
}
TEST(CTR_READER, read_data) {
const std::vector<std::string> ctr_data = {
"aaaa 1 0 0:6002 1:6003 2:6004 3:6005 4:6006 -1\n",
"bbbb 1 0 5:6003 6:6003 7:6003 8:6004 9:6004 -1\n",
"cccc 1 1 10:6002 11:6002 12:6002 13:6002 14:6002 -2\n",
"dddd 1 0 15:6003 16:6003 17:6003 18:6003 19:6004 -3\n",
"1111 1 1 20:6001 21:6001 22:6001 23:6001 24:6001 12\n",
"2222 1 1 25:6004 26:6004 27:6004 28:6005 29:6005 aa\n",
"3333 1 0 30:6002 31:6003 32:6004 33:6004 34:6005 er\n",
"eeee 1 1 35:6003 36:6003 37:6005 38:6005 39:6005 dd\n",
"ffff 1 1 40:6002 41:6003 42:6004 43:6004 44:6005 66\n",
"gggg 1 1 46:6006 45:6006 47:6003 48:6003 49:6003 ba\n",
};
std::string gz_file_name = "test_ctr_reader_data.gz";
generatedata(ctr_data, gz_file_name);
std::vector<int64_t> label_value = {0, 0, 1, 0, 1, 1, 0, 1, 1, 1};
std::tuple<LoD, std::vector<int64_t>> a1({{0, 1, 2, 7}},
{0, 0, 10, 11, 12, 13, 14});
std::tuple<LoD, std::vector<int64_t>> a2({{0, 1, 2, 3}}, {0, 0, 0});
std::tuple<LoD, std::vector<int64_t>> a3({{0, 1, 2, 3}}, {30, 0, 40});
std::tuple<LoD, std::vector<int64_t>> a4({{0, 1}}, {0});
std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6002{a1, a2, a3,
a4};
std::tuple<LoD, std::vector<int64_t>> b1({{0, 1, 4, 5}}, {1, 5, 6, 7, 0});
std::tuple<LoD, std::vector<int64_t>> b2({{0, 4, 5, 6}},
{15, 16, 17, 18, 0, 0});
std::tuple<LoD, std::vector<int64_t>> b3({{0, 1, 3, 4}}, {31, 35, 36, 41});
std::tuple<LoD, std::vector<int64_t>> b4({{0, 3}}, {47, 48, 49});
std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6003{b1, b2, b3,
b4};
std::vector<DDim> label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}};
LoDTensorBlockingQueueHolder queue_holder;
int capacity = 64;
queue_holder.InitOnce(capacity, {}, false);
std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();
int batch_size = 3;
int thread_num = 1;
std::vector<std::string> slots = {"6002", "6003"};
std::vector<std::string> file_list;
for (int i = 0; i < thread_num; ++i) {
file_list.push_back(gz_file_name);
}
CTRReader reader(queue, batch_size, thread_num, slots, file_list);
reader.Start();
size_t batch_num =
std::ceil(static_cast<float>(ctr_data.size()) / batch_size) * thread_num;
check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002,
data_slot_6003, batch_num, batch_size, queue, &reader);
reader.Shutdown();
reader.Start();
check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002,
data_slot_6003, batch_num, batch_size, queue, &reader);
reader.Shutdown();
}
...@@ -1039,6 +1039,11 @@ HOSTDEVICE inline float16 exp(const float16& a) { ...@@ -1039,6 +1039,11 @@ HOSTDEVICE inline float16 exp(const float16& a) {
return float16(::expf(static_cast<float>(a))); return float16(::expf(static_cast<float>(a)));
} }
template <>
HOSTDEVICE inline float16 erf(const float16& a) {
return float16(::erff(static_cast<float>(a)));
}
template <> template <>
HOSTDEVICE inline float16 log(const float16& a) { HOSTDEVICE inline float16 log(const float16& a) {
return float16(::logf(static_cast<float>(a))); return float16(::logf(static_cast<float>(a)));
......
...@@ -20,12 +20,12 @@ limitations under the License. */ ...@@ -20,12 +20,12 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#ifndef _WIN32 #ifndef _WIN32
const float fraction_of_gpu_memory_to_use = 0.92f; constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
#else #else
// fraction_of_gpu_memory_to_use cannot be too high on windows, // fraction_of_gpu_memory_to_use cannot be too high on windows,
// since the win32 graphic sub-system can occupy some GPU memory // since the win32 graphic sub-system can occupy some GPU memory
// which may lead to insufficient memory left for paddle // which may lead to insufficient memory left for paddle
const float fraction_of_gpu_memory_to_use = 0.5f; constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif #endif
DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <mkldnn.h> #include <mkldnn.h>
#include <algorithm>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
...@@ -292,5 +293,21 @@ inline mkldnn::memory::format data_format_to_memory_format( ...@@ -292,5 +293,21 @@ inline mkldnn::memory::format data_format_to_memory_format(
} }
} }
inline mkldnn::memory::format StringToMKLDNNFormat(std::string* format) {
std::transform(format->begin(), format->end(), format->begin(), ::tolower);
if (!format->compare("nchw")) {
return mkldnn::memory::format::nchw;
} else if (!format->compare("nchw16c")) {
return mkldnn::memory::format::nChw16c;
} else if (!format->compare("nchw8c")) {
return mkldnn::memory::format::nChw8c;
} else if (!format->compare("nhwc")) {
return mkldnn::memory::format::nhwc;
} else {
return mkldnn::memory::format::any;
}
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -29,8 +29,16 @@ limitations under the License. */ ...@@ -29,8 +29,16 @@ limitations under the License. */
namespace pybind11 { namespace pybind11 {
namespace detail { namespace detail {
#if !defined(PYBIND11_HIDDEN)
#ifdef _WIN32
#define PYBIND11_HIDDEN __declspec(dllexport)
#else
#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
#endif
#endif
// Can be replaced by a generic lambda in C++14 // Can be replaced by a generic lambda in C++14
struct __attribute__((visibility("hidden"))) paddle_variant_caster_visitor struct PYBIND11_HIDDEN paddle_variant_caster_visitor
: public boost::static_visitor<handle> { : public boost::static_visitor<handle> {
return_value_policy policy; return_value_policy policy;
handle parent; handle parent;
......
...@@ -860,6 +860,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -860,6 +860,12 @@ All parameter, weight, gradient are variables in Paddle.
self.remove_unnecessary_lock_ = b; self.remove_unnecessary_lock_ = b;
}, },
R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC")
.def_property(
"num_trainers",
[](const BuildStrategy &self) { return self.num_trainers_; },
[](BuildStrategy &self, int num_trainers) {
self.num_trainers_ = num_trainers;
})
.def_property( .def_property(
"fuse_elewise_add_act_ops", "fuse_elewise_add_act_ops",
[](const BuildStrategy &self) { [](const BuildStrategy &self) {
......
...@@ -137,10 +137,10 @@ inline pid_t gettid() { ...@@ -137,10 +137,10 @@ inline pid_t gettid() {
#define __NR_gettid 224 #define __NR_gettid 224
#endif #endif
pid_t tid = syscall(__NR_gettid); pid_t tid = syscall(__NR_gettid);
#endif
#else // _WIN32 #else // _WIN32
pid_t tid = _getpid(); pid_t tid = _getpid();
#endif // _WIN32 #endif // _WIN32
#endif
CHECK_NE((int)tid, -1); CHECK_NE((int)tid, -1);
return tid; return tid;
} }
......
...@@ -469,18 +469,21 @@ function assert_api_spec_approvals() { ...@@ -469,18 +469,21 @@ function assert_api_spec_approvals() {
BRANCH="develop" BRANCH="develop"
fi fi
API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/API.spec" || true` API_FILES=("paddle/fluid/API.spec" "paddle/fluid/framework/operator.h")
echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" for API_FILE in ${API_FILES[*]}; do
API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true`
echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
echo "You must have at least 2 approvals for the api change!" echo "You must have at least 2 approvals for the api change! ${API_FILE}"
exit 1 exit 1
fi fi
fi fi
done
} }
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddle.fluid import core
from paddle.fluid.executor import global_scope
from paddle.fluid.framework import default_main_program, \
default_startup_program, Variable
from paddle.fluid.unique_name import generate as unique_name
def monkey_patch_reader_methods(reader):
def __get_reader__():
scope = global_scope()
var = scope.find_var(reader.name)
return var.get_reader()
def reset():
return __get_reader__().reset()
reader.reset = reset
reader.stop_gradient = True
reader.persistable = True
return reader
def _copy_reader_var_(block, var):
new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes())
new_var.persistable = True
return new_var
def ctr_reader(feed_data,
capacity,
thread_num,
batch_size,
file_list,
slots,
name=None):
"""
Create a CTR reader for data feeding in Python
This layer returns a Reader Variable.
The Reader provides :code:`decorate_paddle_reader()` and
:code:`decorate_tensor_provider()` to set a Python generator as the data
source in Python side. When :code:`Executor::Run()` is invoked in C++
side, the data from the generator would be read automatically. Unlike
:code:`DataFeeder.feed()`, the data reading process and
:code:`Executor::Run()` process can run in parallel using
:code:`py_reader`. The :code:`start()` method of the Reader should be
called when each pass begins, while the :code:`reset()` method should be
called when the pass ends and :code:`fluid.core.EOFException` raises.
Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.
Args:
capacity(int): The buffer capacity maintained by :code:`py_reader`.
thread_num(list|tuple): List of tuples which declaring data shapes.
batch_size(list|tuple): List of strs which declaring data type.
file_list(list|tuple): List of ints which declaring data lod_level.
slots(bool): Whether use double buffer or not.
name(basestring): The prefix Python queue name and Reader name. None will
be generated automatically.
Returns:
Variable: A Reader from which we can get feeding data.
Examples:
1. The basic usage of :code:`py_reader` is as follows:
"""
if name is None:
queue_name = unique_name('lod_tensor_blocking_queue')
reader_name = unique_name('create_ctr_reader')
else:
queue_name = "_".join([name, "queue"])
reader_name = "_".join([name, "reader"])
var = global_scope().var(queue_name)
feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
startup_blk = default_startup_program().current_block()
reader_var = startup_blk.create_var(name=reader_name)
startup_blk.append_op(
type='create_ctr_reader',
inputs={'blocking_queue': [queue_name]},
outputs={'Out': [reader_var]},
attrs={
'thread_num': thread_num,
'batch_size': batch_size,
'file_list': file_list,
'slots': slots,
})
reader_var.persistable = True
main_prog_reader_var = _copy_reader_var_(
default_main_program().current_block(), reader_var)
reader = monkey_patch_reader_methods(main_prog_reader_var)
# monkey patch py_reader special methods
reader.queue = feed_queue
reader.exited = False
main_blk = default_main_program().current_block()
main_blk.append_op(
type='read', inputs={'Reader': [reader]}, outputs={'Out': feed_data})
return reader
...@@ -637,8 +637,8 @@ def save_inference_model(dirname, ...@@ -637,8 +637,8 @@ def save_inference_model(dirname,
if isinstance(target_vars, Variable): if isinstance(target_vars, Variable):
target_vars = [target_vars] target_vars = [target_vars]
elif export_for_deployment: elif export_for_deployment:
if not (bool(target_vars) and all( if not (bool(target_vars) and
isinstance(var, Variable) for var in target_vars)): all(isinstance(var, Variable) for var in target_vars)):
raise ValueError("'target_vars' should be a list of Variable.") raise ValueError("'target_vars' should be a list of Variable.")
if main_program is None: if main_program is None:
...@@ -667,10 +667,15 @@ def save_inference_model(dirname, ...@@ -667,10 +667,15 @@ def save_inference_model(dirname,
if export_for_deployment: if export_for_deployment:
main_program = main_program.clone() main_program = main_program.clone()
global_block = main_program.global_block() global_block = main_program.global_block()
need_to_remove_op_index = []
for i, op in enumerate(global_block.ops): for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False) op.desc.set_is_target(False)
if op.type == "feed" or op.type == "fetch": if op.type == "feed" or op.type == "fetch":
global_block._remove_op(i) need_to_remove_op_index.append(i)
for index in need_to_remove_op_index[::-1]:
global_block._remove_op(index)
main_program.desc.flush() main_program.desc.flush()
main_program = main_program._prune(targets=target_vars) main_program = main_program._prune(targets=target_vars)
......
...@@ -4398,7 +4398,8 @@ def nce(input, ...@@ -4398,7 +4398,8 @@ def nce(input,
name=None, name=None,
sampler="uniform", sampler="uniform",
custom_dist=None, custom_dist=None,
seed=0): seed=0,
is_sparse=False):
""" """
${comment} ${comment}
...@@ -4424,11 +4425,12 @@ def nce(input, ...@@ -4424,11 +4425,12 @@ def nce(input,
sampler (str): The sampler used to sample class from negtive classes. sampler (str): The sampler used to sample class from negtive classes.
It can be 'uniform', 'log_uniform' or 'custom_dist'. It can be 'uniform', 'log_uniform' or 'custom_dist'.
default: 'uniform'. default: 'uniform'.
custom_dist (Variable): A tensor with shape [num_total_classes]. custom_dist (float[]): A float[] with size=num_total_classes.
It is used when sampler is set to 'custom_dist'. It is used when sampler is set to 'custom_dist'.
custom_dist[i] is the probsbility of i-th class to be sampled. custom_dist[i] is the probsbility of i-th class to be sampled.
default: None. default: None.
seed (int): The seed used in sampler. default: 0. seed (int): The seed used in sampler. default: 0.
is_sparse(bool): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows.
Returns: Returns:
Variable: The output nce loss. Variable: The output nce loss.
...@@ -4480,12 +4482,7 @@ def nce(input, ...@@ -4480,12 +4482,7 @@ def nce(input,
shape=[num_total_classes, dim], shape=[num_total_classes, dim],
is_bias=False, is_bias=False,
dtype=input.dtype) dtype=input.dtype)
inputs = { inputs = {}
'Input': input,
'Label': label,
'Weight': w,
'SampleWeight': sample_weight if sample_weight is not None else []
}
if helper.bias_attr: if helper.bias_attr:
b = helper.create_parameter( b = helper.create_parameter(
attr=helper.bias_attr, attr=helper.bias_attr,
...@@ -4497,18 +4494,10 @@ def nce(input, ...@@ -4497,18 +4494,10 @@ def nce(input,
sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype) sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype)
sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype) sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype)
if num_neg_samples is None: inputs['Input'] = input
num_neg_samples = 10 inputs['Label'] = label
else: inputs['Weight'] = w
num_neg_samples = int(num_neg_samples) inputs['SampleWeight'] = sample_weight if sample_weight is not None else []
inputs = {
'Input': input,
'Label': label,
'Weight': w,
'Bias': b,
'SampleWeight': sample_weight if sample_weight is not None else []
}
if sampler == "uniform": if sampler == "uniform":
sampler = 0 sampler = 0
...@@ -4516,17 +4505,73 @@ def nce(input, ...@@ -4516,17 +4505,73 @@ def nce(input,
sampler = 1 sampler = 1
elif sampler == "custom_dist": elif sampler == "custom_dist":
assert custom_dist is not None assert custom_dist is not None
assert isinstance(custom_dist, Variable) # assert isinstance(custom_dist, Variable)
inputs['CustomDistribution'] = custom_dist
custom_dist_len = len(custom_dist)
alias_probs_ = [0] * custom_dist_len
alias_ = [0] * custom_dist_len
bigs = []
littles = []
for i in range(custom_dist_len):
normal_prob = custom_dist[i] * custom_dist_len
if normal_prob - 1.0 > 1e-4:
bigs.append((i, normal_prob))
elif 1.0 - normal_prob > 1e-4:
littles.append((i, normal_prob))
else:
alias_probs_[i] = normal_prob
alias_[i] = -1
while len(bigs) and len(littles):
big = bigs.pop(0)
little = littles.pop(0)
big_idx = big[0]
big_prob = big[1]
alias_probs_[little[0]] = little[1]
alias_[little[0]] = big_idx
big_left = big[1] + little[1] - 1
if big_left - 1.0 > 1e-4:
bigs.append((big_idx, big_left))
elif 1.0 - big_left > 1e-4:
littles.append((big_idx, big_left))
else:
alias_probs_[big_idx] = big_left
alias_[big_idx] = -1
if len(bigs):
big = bigs.pop(0)
alias_probs_[big[0]] = 1.0
alias_[big[0]] = -1
if len(littles):
little = littles.pop(0)
alias_probs_[little[0]] = 1.0
alias_[little[0]] = -1
probs = assign(input=np.array(custom_dist).astype('float32'))
custom_alias = assign(input=np.array(alias_).astype('int32'))
custom_alias_probs = assign(
input=np.array(alias_probs_).astype('float32'))
inputs['CustomDistProbs'] = probs
inputs['CustomDistAlias'] = custom_alias
inputs['CustomDistAliasProbs'] = custom_alias_probs
sampler = 2 sampler = 2
else: else:
raise Exception("Unsupported sampler type.") raise Exception("Unsupported sampler type.")
if num_neg_samples is None:
num_neg_samples = 10
else:
num_neg_samples = int(num_neg_samples)
attrs = { attrs = {
'num_total_classes': int(num_total_classes), 'num_total_classes': int(num_total_classes),
'num_neg_samples': num_neg_samples, 'num_neg_samples': num_neg_samples,
'seed': seed, 'seed': seed,
'sampler': sampler 'sampler': sampler,
'is_sparse': is_sparse
} }
helper.append_op( helper.append_op(
...@@ -4546,27 +4591,43 @@ def hsigmoid(input, ...@@ -4546,27 +4591,43 @@ def hsigmoid(input,
num_classes, num_classes,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
name=None): name=None,
path_table=None,
path_code=None,
is_custom=False,
is_sparse=False):
""" """
The hierarchical sigmoid operator is used to accelerate the training The hierarchical sigmoid operator is used to accelerate the training
process of language model. This operator organizes the classes into a process of language model. This operator organizes the classes into a
complete binary tree, each leaf node represents a class(a word) and each complete binary tree, or you can use is_custom to pass your own tree to
implement hierarchical. Each leaf node represents a class(a word) and each
internal node acts as a binary classifier. For each word there's a unique internal node acts as a binary classifier. For each word there's a unique
path from root to it's leaf node, hsigmoid calculate the cost for each path from root to it's leaf node, hsigmoid calculate the cost for each
internal node on the path, and sum them to get a total cost. hsigmoid can internal node on the path, and sum them to get a total cost. hsigmoid can
achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N` achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
represents the size of word dict. represents the size of word dict.
Refer to `Hierarchical Probabilistic Neural Network Language Model Using default tree you can Refer to `Hierarchical Probabilistic Neural Network Language Model
<http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_ <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
And if you want to use the costumed tree by set 'is_custom' as true you may need to do following things first:
1. using your word dict to build a binary tree, each leaf node should be an word of your word dict
2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
means label of each binary classification, using 1 indicate true, 0 indicate false.
4. now, each word should has its path and code along the path, you can pass a batch of path and code
related to the same batch of inputs.
Args: Args:
input (Variable): The input tensor variable with shape input (Variable): The input tensor variable with shape
:math:`[N \\times D]`, where :math:`N` is the size of mini-batch, :math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
and :math:`D` is the feature size. and :math:`D` is the feature size.
label (Variable): The tensor variable contains labels of training data. label (Variable): The tensor variable contains labels of training data.
It's a tensor with shape is :math:`[N \\times 1]`. It's a tensor with shape is :math:`[N \\times 1]`.
num_classes: (int), The number of classes, must not be less than 2. num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
which indicates the num of classes using by binary classify.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
will create ParamAttr as param_attr. If the Initializer of the param_attr will create ParamAttr as param_attr. If the Initializer of the param_attr
...@@ -4578,9 +4639,19 @@ def hsigmoid(input, ...@@ -4578,9 +4639,19 @@ def hsigmoid(input,
is not set, the bias is initialized zero. Default: None. is not set, the bias is initialized zero. Default: None.
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None. will be named automatically. Default: None.
path_table: (Variable|None) this variable can store each batch of samples' path to root,
it should be in leaf -> root order
path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
structure and each element in this array is indexes in parent nodes' Weight Matrix.
path_code: (Variable|None) this variable can store each batch of samples' code,
each code consist with every code of parent nodes. it should be in leaf -> root order
is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
of W and input will be sparse.
Returns: Returns:
Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1] Out: (LodTensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
Examples: Examples:
...@@ -4596,18 +4667,52 @@ def hsigmoid(input, ...@@ -4596,18 +4667,52 @@ def hsigmoid(input,
out = helper.create_variable_for_type_inference(dtype) out = helper.create_variable_for_type_inference(dtype)
pre_out = helper.create_variable_for_type_inference(dtype) pre_out = helper.create_variable_for_type_inference(dtype)
dim = input.shape[1] dim = input.shape[1]
if num_classes < 2: if ((num_classes is None) or (num_classes < 2)) and (not is_custom):
raise ValueError("num_classes must not be less than 2.") raise ValueError(
"num_classes must not be less than 2 with default tree")
if (is_custom) and (path_code is None):
raise ValueError("path_code should not be None with costum tree")
elif (is_custom) and (path_table is None):
raise ValueError("path_table should not be None with costum tree")
elif (is_custom) and (num_classes is None):
raise ValueError("num_classes should not be None with costum tree")
else:
pass
weights = None
if not is_custom:
weights = helper.create_parameter( weights = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr,
shape=[num_classes - 1, dim], shape=[num_classes - 1, dim],
is_bias=False, is_bias=False,
dtype=input.dtype) dtype=input.dtype)
inputs = {"X": input, "W": weights, "Label": label} else:
weights = helper.create_parameter(
attr=helper.param_attr,
shape=[num_classes, dim],
is_bias=False,
dtype=input.dtype)
inputs = {
"X": input,
"W": weights,
"PTable": path_table,
"PathCode": path_code,
"Label": label
}
if helper.bias_attr: if helper.bias_attr:
if not is_custom:
bias = helper.create_parameter(
attr=helper.bias_attr,
shape=[num_classes - 1, 1],
is_bias=True,
dtype=input.dtype)
inputs['Bias'] = bias
else:
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.bias_attr, attr=helper.bias_attr,
shape=[1, num_classes - 1], shape=[num_classes, 1],
is_bias=True, is_bias=True,
dtype=input.dtype) dtype=input.dtype)
inputs['Bias'] = bias inputs['Bias'] = bias
...@@ -4616,7 +4721,8 @@ def hsigmoid(input, ...@@ -4616,7 +4721,8 @@ def hsigmoid(input,
inputs=inputs, inputs=inputs,
outputs={"Out": out, outputs={"Out": out,
"PreOut": pre_out}, "PreOut": pre_out},
attrs={"num_classes": num_classes}) attrs={"num_classes": num_classes,
"is_sparse": is_sparse})
return out return out
......
...@@ -124,16 +124,11 @@ class ParallelExecutor(object): ...@@ -124,16 +124,11 @@ class ParallelExecutor(object):
os.environ.get('CPU_NUM', multiprocessing.cpu_count())) os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exec_strategy.num_threads = cpu_num * 2 exec_strategy.num_threads = cpu_num * 2
# Set 1 thread num under nccl2 distribute
# env to make sure all gpus run ops in same order.
if num_trainers > 1:
assert (use_cuda)
# FIXME(gongwb): avoid this set.
exec_strategy.num_threads = 1
if build_strategy is None: if build_strategy is None:
build_strategy = BuildStrategy() build_strategy = BuildStrategy()
build_strategy.num_trainers = num_trainers
main = main_program main = main_program
main = main if main else framework.default_main_program() main = main if main else framework.default_main_program()
if scope == None: if scope == None:
......
...@@ -63,7 +63,7 @@ function(py_test_modules TARGET_NAME) ...@@ -63,7 +63,7 @@ function(py_test_modules TARGET_NAME)
set(multiValueArgs MODULES DEPS ENVS) set(multiValueArgs MODULES DEPS ENVS)
cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (py_test_modules_SERIAL) if (py_test_modules_SERIAL)
......
...@@ -18,7 +18,7 @@ import unittest ...@@ -18,7 +18,7 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
from scipy.special import expit from scipy.special import expit, erf
class TestActivation(OpTest): class TestActivation(OpTest):
...@@ -295,6 +295,23 @@ class TestRelu(TestActivation): ...@@ -295,6 +295,23 @@ class TestRelu(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestGelu(TestActivation):
def setUp(self):
self.op_type = "gelu"
self.init_dtype()
x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestBRelu(TestActivation): class TestBRelu(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "brelu" self.op_type = "brelu"
...@@ -628,6 +645,7 @@ create_test_act_fp16_class(TestCos, grad_atol=0.85) ...@@ -628,6 +645,7 @@ create_test_act_fp16_class(TestCos, grad_atol=0.85)
create_test_act_fp16_class(TestSin) create_test_act_fp16_class(TestSin)
create_test_act_fp16_class(TestRound, grad_check=False) create_test_act_fp16_class(TestRound, grad_check=False)
create_test_act_fp16_class(TestRelu) create_test_act_fp16_class(TestRelu)
create_test_act_fp16_class(TestGelu)
create_test_act_fp16_class(TestBRelu) create_test_act_fp16_class(TestBRelu)
create_test_act_fp16_class(TestRelu6) create_test_act_fp16_class(TestRelu6)
create_test_act_fp16_class(TestSoftRelu) create_test_act_fp16_class(TestSoftRelu)
......
...@@ -16,6 +16,8 @@ from __future__ import print_function ...@@ -16,6 +16,8 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
import paddle.fluid as fluid
import math import math
from op_test import OpTest from op_test import OpTest
...@@ -40,6 +42,29 @@ class CodeTable(object): ...@@ -40,6 +42,29 @@ class CodeTable(object):
return self.c & (1 << bit) return self.c & (1 << bit)
class CodeTableWithCustomTree(object):
def __init__(self, path_table, path_code, index):
self.ptable_ = path_table
self.pcode_ = path_code
self.index_ = index
def cal_index(self, bit):
return self.ptable_[self.index_][bit]
def get_length(self):
length = 0
for ele in self.ptable_[self.index_]: # find the first -1 to stop trace
if ele >= 0:
length = length + 1
else:
return length
return length
def cal_bit(self, bit):
return self.pcode_[self.index_][bit]
def hsigmoid(x, w, label, bias, num_classes): def hsigmoid(x, w, label, bias, num_classes):
batch_size = x.shape[0] batch_size = x.shape[0]
code_length = find_latest_set(num_classes - 1) code_length = find_latest_set(num_classes - 1)
...@@ -52,7 +77,7 @@ def hsigmoid(x, w, label, bias, num_classes): ...@@ -52,7 +77,7 @@ def hsigmoid(x, w, label, bias, num_classes):
length = code_table.get_length() length = code_table.get_length()
for j in range(length): for j in range(length):
idx = code_table.cal_index(j) idx = code_table.cal_index(j)
pre_output[i][j] += bias[0][idx] pre_output[i][j] += bias[idx][0]
for i in range(batch_size): for i in range(batch_size):
code_table = CodeTable(num_classes, label[i]) code_table = CodeTable(num_classes, label[i])
length = code_table.get_length() length = code_table.get_length()
...@@ -77,17 +102,58 @@ def hsigmoid(x, w, label, bias, num_classes): ...@@ -77,17 +102,58 @@ def hsigmoid(x, w, label, bias, num_classes):
return pre_output, out return pre_output, out
def hsigmoidWithCustomTree(x, w, path_table, path_code, label, bias,
num_classes):
batch_size = x.shape[0]
code_length = len(path_table[0])
code_table = [0 for _ in range(code_length)]
# init pre_out with shape [N, code_length]
pre_output = np.zeros((batch_size, code_length))
pre_sum = np.zeros((batch_size, 1))
out = np.zeros((batch_size, 1)).astype("float32")
if isinstance(bias, np.ndarray):
for i in range(batch_size):
code_table = CodeTableWithCustomTree(path_table, path_code, i)
length = code_table.get_length()
for j in range(length):
idx = code_table.cal_index(j)
pre_output[i][j] += bias[idx][0]
for i in range(batch_size):
code_table = CodeTableWithCustomTree(path_table, path_code, i)
length = code_table.get_length()
for j in range(length):
idx = code_table.cal_index(j)
pre_output[i][j] += np.dot(w[idx], x[i])
# clip[-40.0, 40.0]
pre_output = np.clip(pre_output, -40.0, 40.0)
# out(i, 0) = \sum_j bit(i, j) * preout(i, j)
for i in range(batch_size):
code_table = CodeTableWithCustomTree(path_table, path_code, i)
length = code_table.get_length()
sum = 0.0
for j in range(length):
if code_table.cal_bit(j):
sum += pre_output[i][j]
out[i] = -1.0 * sum
# soft relu
pre_output = np.log(1 + np.exp(pre_output))
pre_sum = pre_output.sum(1).reshape((batch_size, 1))
out += pre_sum
return pre_output, out
class TestHSigmoidOp(OpTest): class TestHSigmoidOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "hierarchical_sigmoid" self.op_type = "hierarchical_sigmoid"
num_classes = 6 num_classes = 6
feature_size = 8 feature_size = 8
batch_size = 4 batch_size = 4
x = np.random.random((batch_size, feature_size)).astype("float32") x = np.random.random((batch_size, feature_size)).astype("float32") * 2
w = np.random.random((num_classes - 1, feature_size)).astype("float32") w = np.random.random(
(num_classes - 1, feature_size)).astype("float32") * 2
label = np.random.randint(0, num_classes, (batch_size, 1)) label = np.random.randint(0, num_classes, (batch_size, 1))
bias = np.random.random((1, num_classes - 1)).astype("float32") bias = np.random.random((num_classes - 1, 1)).astype("float32")
self.attrs = {'num_classes': num_classes} self.attrs = {'num_classes': num_classes, 'is_sparse': False}
self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
pre_output, out = hsigmoid(x, w, label, bias, num_classes) pre_output, out = hsigmoid(x, w, label, bias, num_classes)
self.outputs = {'PreOut': pre_output, 'Out': out} self.outputs = {'PreOut': pre_output, 'Out': out}
...@@ -99,5 +165,185 @@ class TestHSigmoidOp(OpTest): ...@@ -99,5 +165,185 @@ class TestHSigmoidOp(OpTest):
self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
class TestHSigmoidOpSparse(OpTest):
def setUp(self):
self.op_type = "hierarchical_sigmoid"
num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
feature_size = 8
batch_size = 4
x = np.random.random((batch_size, feature_size)).astype("float32")
w = np.random.random((num_classes - 1, feature_size)).astype("float32")
label = np.array([0, 1, 4, 5])
path_table = np.array(
[(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
(0, 2, -1, -1,
-1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
bias = np.random.random((num_classes - 1, 1)).astype("float32")
self.attrs = {'num_classes': num_classes, 'is_sparse': True}
self.inputs = {
'X': x,
'W': w,
'PTable': path_table,
'PathCode': path_code,
'Label': label,
'Bias': bias
}
pre_output, out = hsigmoidWithCustomTree(x, w, path_table, path_code,
label, bias, num_classes)
self.outputs = {'PreOut': pre_output, 'Out': out}
def test_check_output(self):
self.check_output()
class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
def hs_net_conf(self, is_sparse):
input_word = fluid.layers.data(name="x", shape=[1], dtype='int64')
path_table = fluid.layers.data(
name='path_table', shape=[3], dtype='int64')
path_code = fluid.layers.data(
name='path_code', shape=[3], dtype='int64')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
data_list = [input_word, path_table, path_code, label]
emb = fluid.layers.embedding(
input=input_word,
is_sparse=is_sparse,
size=[3, 3],
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(3))))
cost = fluid.layers.hsigmoid(
input=emb,
label=label,
bias_attr=True,
num_classes=3,
path_table=path_table,
path_code=path_code,
is_custom=True,
is_sparse=is_sparse)
avg_cost = fluid.layers.reduce_mean(cost)
return avg_cost, data_list
def training_test(self, is_sparse):
with fluid.program_guard(fluid.Program(), fluid.Program()):
start_up = fluid.default_startup_program()
start_up.random_seed = 1 # Fix random seed
x = np.arange(6).reshape(6)
path_table = np.array([(1, 2, -1), (1, 2, -1)])
path_code = np.array([(1, 0, -1), (0, 0, -1)])
label = np.array([1, 4])
loss, data_list = self.hs_net_conf(is_sparse)
optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
optimizer.minimize(loss)
main_program = fluid.default_main_program()
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=data_list, place=place)
exe = fluid.Executor(place)
exe.run(start_up)
result = list()
for i in range(10):
data = [([[x[i % 2]]], [list(path_table[i % 2])],
[list(path_code[i % 2])], [label[i % 2]])]
loss_val = exe.run(main_program,
feed=feeder.feed(data),
fetch_list=[loss])
result.append(loss_val)
return result
def test_hs_grad_with_sparse(self):
dense_result = self.training_test(is_sparse=False)
sparse_result = self.training_test(is_sparse=True)
assert (dense_result == sparse_result)
class TestHSigmoidOpWithCostumTree(OpTest):
def setUp(self):
self.op_type = "hierarchical_sigmoid"
num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
feature_size = 8
batch_size = 4
x = np.random.random((batch_size, feature_size)).astype("float32") * 2
w = np.random.random(
(num_classes - 1, feature_size)).astype("float32") * 2
label = np.array([0, 1, 4, 5])
path_table = np.array(
[(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
(0, 2, -1, -1,
-1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
bias = np.random.random((num_classes - 1, 1)).astype("float32")
self.attrs = {'num_classes': num_classes, 'is_sparse': False}
self.inputs = {
'X': x,
'W': w,
'PTable': path_table,
'PathCode': path_code,
'Label': label,
'Bias': bias
}
pre_output, out = hsigmoidWithCustomTree(x, w, path_table, path_code,
label, bias, num_classes)
self.outputs = {'PreOut': pre_output, 'Out': out}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest):
def setUp(self):
self.op_type = "hierarchical_sigmoid"
num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
feature_size = 8
batch_size = 4
x = np.random.random((batch_size, feature_size)).astype("float32") * 2
w = np.random.random(
(num_classes - 1, feature_size)).astype("float32") * 2
label = np.array([0, 1, 4, 5])
path_table = np.array(
[(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
(0, 2, -1, -1,
-1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
# bias = np.random.random((num_classes - 1, 1)).astype("float32")
self.attrs = {'num_classes': num_classes, 'is_sparse': False}
self.inputs = {
'X': x,
'W': w,
'PTable': path_table,
'PathCode': path_code,
'Label': label,
}
pre_output, out = hsigmoidWithCustomTree(
x=x,
w=w,
path_table=path_table,
path_code=path_code,
label=label,
bias=None,
num_classes=num_classes)
self.outputs = {'PreOut': pre_output, 'Out': out}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X', 'W'], ['Out'], no_grad_set=set('Label'))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -185,6 +185,25 @@ class TestBook(unittest.TestCase): ...@@ -185,6 +185,25 @@ class TestBook(unittest.TestCase):
input=x, label=y, num_classes=2)) input=x, label=y, num_classes=2))
print(str(program)) print(str(program))
# test hsigmod with custom tree structure
program2 = Program()
with program_guard(program2):
x2 = layers.data(name='x2', shape=[4, 8], dtype='float32')
y2 = layers.data(name='y2', shape=[4], dtype='int64')
path_table = layers.data(
name='path_table', shape=[4, 6], dtype='int64')
path_code = layers.data(
name='path_code', shape=[4, 6], dtype='int64')
self.assertIsNotNone(
layers.hsigmoid(
input=x2,
label=y2,
num_classes=6,
path_table=path_table,
path_code=path_code,
is_custom=True))
print(str(program2))
def test_sequence_expand(self): def test_sequence_expand(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -14,8 +14,12 @@ ...@@ -14,8 +14,12 @@
from __future__ import print_function from __future__ import print_function
import unittest
import numpy as np import numpy as np
import unittest
import paddle.fluid as fluid
import paddle.fluid.initializer as initializer
from op_test import OpTest from op_test import OpTest
...@@ -59,7 +63,7 @@ def nce(input, weight, bias, sample_weight, labels, num_classes, ...@@ -59,7 +63,7 @@ def nce(input, weight, bias, sample_weight, labels, num_classes,
class TestNCE(OpTest): class TestNCE(OpTest):
def generate_data(self, dim, batch_size, num_classes, num_true_class, def generate_data(self, dim, batch_size, num_classes, num_true_class,
num_neg_samples): num_neg_samples, is_sparse):
input = np.random.randn(batch_size, dim).astype(np.float32) input = np.random.randn(batch_size, dim).astype(np.float32)
weight = np.random.randn(num_classes, dim).astype(np.float32) weight = np.random.randn(num_classes, dim).astype(np.float32)
bias = np.random.randn(num_classes).astype(np.float32) bias = np.random.randn(num_classes).astype(np.float32)
...@@ -70,7 +74,8 @@ class TestNCE(OpTest): ...@@ -70,7 +74,8 @@ class TestNCE(OpTest):
'num_neg_samples': num_neg_samples, 'num_neg_samples': num_neg_samples,
'custom_neg_classes': list(range(num_neg_samples)), 'custom_neg_classes': list(range(num_neg_samples)),
'seed': 0, 'seed': 0,
'sampler': 0 'sampler': 0,
'is_sparse': is_sparse
} }
self.inputs = { self.inputs = {
'Input': input, 'Input': input,
...@@ -81,7 +86,7 @@ class TestNCE(OpTest): ...@@ -81,7 +86,7 @@ class TestNCE(OpTest):
} }
def set_data(self): def set_data(self):
self.generate_data(5, 5, 4, 1, 2) self.generate_data(5, 5, 4, 1, 2, False)
def compute(self): def compute(self):
out = nce(self.inputs['Input'], self.inputs['Weight'], out = nce(self.inputs['Input'], self.inputs['Weight'],
...@@ -107,9 +112,110 @@ class TestNCE(OpTest): ...@@ -107,9 +112,110 @@ class TestNCE(OpTest):
["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02) ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02)
class TestNCECase1(TestNCE): class TestNCECase1Tensor(TestNCE):
def set_data(self): def set_data(self):
self.generate_data(10, 20, 10, 2, 5) self.generate_data(10, 20, 10, 2, 5, False)
class TestNCECase1SelectedRows(unittest.TestCase):
def setUp(self):
self.base_lr = 0.0001
self.batch_size = 8
@staticmethod
def get_place():
place = fluid.core.CPUPlace()
return place
@staticmethod
def get_train_data(batch_size):
batchs = []
for i in range(batch_size):
input = np.random.randn(batch_size, 10).astype(np.float32)
labels = np.random.randint(0, 20, (batch_size, 1))
batchs.append([input, labels])
return batchs
def get_optimizer(self):
# SGD optimizer
optimizer = fluid.optimizer.SGD(learning_rate=self.base_lr)
return optimizer
def train_network(self, num_total_classes, num_neg_samples, sampler,
custom_dist, is_sparse):
input = fluid.layers.data(name="input", shape=[10], dtype="float32")
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
w_param = fluid.default_main_program().global_block().create_parameter(
shape=[num_total_classes, 10],
dtype='float32',
name='nce_w',
initializer=initializer.ConstantInitializer())
b_param = fluid.default_main_program().global_block().create_parameter(
shape=[num_total_classes, 1],
dtype='float32',
name='nce_b',
initializer=initializer.ConstantInitializer())
cost = fluid.layers.nce(input=input,
label=label,
num_total_classes=num_total_classes,
sampler=sampler,
custom_dist=custom_dist,
sample_weight=None,
param_attr='nce_w',
bias_attr='nce_b',
seed=1,
num_neg_samples=num_neg_samples,
is_sparse=is_sparse)
avg_cost = fluid.layers.mean(cost)
# optimizer
optimizer = self.get_optimizer()
optimizer.minimize(avg_cost)
return [avg_cost, [input, label]]
def test_input_is_selected_rows(self):
place = self.get_place()
exe = fluid.Executor(place)
data = self.get_train_data(self.batch_size)
nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
rets = []
# for dense
dense_scope = fluid.core.Scope()
dense_startup_program = fluid.framework.Program()
dense_train_program = fluid.framework.Program()
with fluid.scope_guard(dense_scope):
with fluid.program_guard(dense_train_program,
dense_startup_program):
cost, feeds = self.train_network(20, 5, "custom_dist",
nid_freq_arr.tolist(), False)
feeder = fluid.DataFeeder(feed_list=feeds, place=place)
exe.run(dense_startup_program)
loss_val = exe.run(dense_train_program,
feed=feeder.feed(data),
fetch_list=[cost.name])
rets.append(np.mean(loss_val))
# for sparse
sparse_scope = fluid.core.Scope()
sparse_startup_program = fluid.framework.Program()
sparse_train_program = fluid.framework.Program()
with fluid.scope_guard(sparse_scope):
with fluid.program_guard(sparse_train_program,
sparse_startup_program):
cost, feeds = self.train_network(20, 5, "custom_dist",
nid_freq_arr.tolist(), True)
feeder = fluid.DataFeeder(feed_list=feeds, place=place)
exe.run(sparse_startup_program)
loss_val = exe.run(sparse_train_program,
feed=feeder.feed(data),
fetch_list=[cost.name])
rets.append(np.mean(loss_val))
self.assertEqual(rets[0], rets[1])
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册