From b9203958428eacebc7ffba4aebd6689df783e3b0 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Aug 2019 16:21:03 +0800 Subject: [PATCH] Use sparse matrix to implement fused emb_seq_pool operator (#19064) * Implement the operator with sparse matrix multiply * Update the URL of mklml library. test=develop * Disable MKLML implementation when not using Linux. test=develop * Ignore the deprecated status for windows test=develop --- CMakeLists.txt | 2 + cmake/external/mklml.cmake | 2 +- .../fused/fused_embedding_seq_pool_op.h | 77 +++++++++++++++++++ paddle/fluid/operators/math/blas.h | 11 +++ paddle/fluid/operators/math/blas_impl.h | 23 ++++++ paddle/fluid/platform/dynload/mklml.h | 2 + 6 files changed, 116 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index be6a4d7c09..c2c6f44a72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,8 @@ if(WIN32) set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") +else(WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations") endif(WIN32) find_package(CUDA QUIET) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 066811296e..17556afec8 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -43,7 +43,7 @@ IF(WIN32) ELSE() #TODO(intel-huying): # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. 
- SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_VER "csrmm2_mklml_lnx_2019.0.2" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 4651c2b2ba..7565823fc6 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include @@ -22,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { @@ -31,6 +33,44 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) +template +void prepare_csr_data(const std::vector &offset, + const int64_t *ids_data, const size_t idx_width, + T *csr_vals, int *csr_colmuns, int *csr_row_idx) { + int val_idx = 0; + int row_idx = 0; + csr_row_idx[0] = 0; + + std::map ids_map; + + // for each sequence in batch + for (size_t i = 0; i < offset.size() - 1; ++i) { + for (size_t idx = 0; idx < idx_width; ++idx) { + ids_map.clear(); + + // construct a map for creating csr + for (size_t j = offset[i]; j < offset[i + 1]; ++j) { + unsigned int word_idx = + static_cast(ids_data[idx + j * idx_width]); + ++ids_map[word_idx]; + } + + VLOG(4) << "====sequence %d====" << i; + for (std::map::const_iterator it = ids_map.begin(); + it != ids_map.end(); 
++it) { + VLOG(4) << it->first << " => " << it->second; + csr_vals[val_idx] = it->second; + csr_colmuns[val_idx] = it->first; + ++val_idx; + } + csr_row_idx[row_idx + 1] = csr_row_idx[row_idx] + ids_map.size(); + ++row_idx; + } + } +} +#else template struct EmbeddingVSumFunctor { void operator()(const framework::ExecutionContext &context, @@ -60,6 +100,7 @@ struct EmbeddingVSumFunctor { } } }; +#endif inline int FusedEmbeddingSeqPoolLastDim(const framework::DDim &table_dims, const framework::DDim &ids_dims) { @@ -91,8 +132,44 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { output_t->Resize({batch_size, last_dim}); if (combiner_type == "sum") { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) + auto output = output_t->mutable_data(context.GetPlace()); + int64_t table_height = table_var->dims()[0]; + int64_t table_width = table_var->dims()[1]; + auto weights = table_var->data(); + + const std::vector offset = ids_lod[0]; + auto len = ids_t->numel(); + int idx_width = len / offset.back(); + + Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; + csr_vals_t.Resize({len}); + csr_colmuns_t.Resize({len}); + csr_row_idx_t.Resize({(batch_size + 1) * idx_width}); + auto csr_vals = csr_vals_t.mutable_data(context.GetPlace()); + auto csr_colmuns = csr_colmuns_t.mutable_data(context.GetPlace()); + auto csr_row_idx = csr_row_idx_t.mutable_data(context.GetPlace()); + prepare_csr_data(offset, ids_t->data(), idx_width, csr_vals, + csr_colmuns, csr_row_idx); + + const char transa = 'N'; + const T alpha = 1.0; + const T beta = 0.0; + const char matdescra[] = {'G', 'L', 'N', 'C'}; + + const int m = batch_size * idx_width; + const int n = table_width; + const int k = table_height; + auto blas = math::GetBlas(context); + blas.CSRMM(&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals, + (const int *)csr_colmuns, (const int *)csr_row_idx, + (const int *)csr_row_idx + 1, weights, &n, 
&beta, output, &n); + +#else EmbeddingVSumFunctor functor; functor(context, table_var, ids_t, output_t); +#endif } } }; diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index b42d75d342..0c1b61c144 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -113,6 +113,12 @@ class Blas { template void GEMM_FREE(T* data) const; + template + void CSRMM(const char* transa, const int* m, const int* n, const int* k, + const T* alpha, const char* matdescra, const T* val, + const int* indx, const int* pntrb, const int* pntre, const T* b, + const int* ldb, const T* beta, T* c, const int* ldc) const; + #if !defined(PADDLE_WITH_CUDA) template void MatMulWithHead(const framework::Tensor& mat_a, @@ -239,6 +245,11 @@ class BlasT : private Blas { Base()->template GEMM_FREE(args...); } + template + void CSRMM(ARGS... args) const { + Base()->template CSRMM(args...); + } + #if !defined(PADDLE_WITH_CUDA) template void MatMulWithHead(ARGS... args) const { diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index da313fbce4..1a45752e11 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -128,6 +128,11 @@ struct CBlas { static void VMERF(ARGS... args) { platform::dynload::vmsErf(args...); } + + template + static void CSRMM(ARGS... args) { + platform::dynload::mkl_scsrmm(args...); + } }; template <> @@ -233,6 +238,11 @@ struct CBlas { static void VMERF(ARGS... args) { platform::dynload::vmdErf(args...); } + + template + static void CSRMM(ARGS... 
args) { + platform::dynload::mkl_dcsrmm(args...); + } }; #else @@ -748,6 +758,19 @@ void Blas::VMERF(int n, const T *a, T *y, #endif } +#ifdef PADDLE_WITH_MKLML +template <> +template +void Blas::CSRMM( + const char *transa, const int *m, const int *n, const int *k, + const T *alpha, const char *matdescra, const T *val, const int *indx, + const int *pntrb, const int *pntre, const T *b, const int *ldb, + const T *beta, T *c, const int *ldc) const { + CBlas::CSRMM(transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntre, b, + ldb, beta, c, ldc); +} +#endif + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index a5b846f500..200746ab9f 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -88,6 +88,8 @@ extern void* mklml_dso_handle; __macro(vdInv); \ __macro(vmsErf); \ __macro(vmdErf); \ + __macro(mkl_scsrmm); \ + __macro(mkl_dcsrmm); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); -- GitLab