Unverified commit 89a0ecd1, authored by Cwndmiao, committed by GitHub

[LITE][XPU] Support mmdnn3.0-ras (a.k.a. crmm-0608) (#3950)

* fix typo

* [LITE][XPU] accommodate crmm (variant 20200608)

* refine lite/tests/api/test_mmdnn_lite_xpu.cc

* more comments, test=develop test=xpu

* bugfix in crmm pattern match

* pr comments, test=develop test=xpu

* add XPU_CALL and retval check, test=develop test=xpu
Parent 85a12dab
@@ -19,7 +19,7 @@
 #include <memory>
 #include <string>
 #include <type_traits>
-#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/backends/xpu/target_wrapper.h"
 namespace paddle {
 namespace lite {
@@ -82,8 +82,8 @@ void DumpXPUMem(const T* ptr,
                 size_t item_per_line = 30) {
   size_t after_stride_len = (len + stride - 1) / stride;
   std::unique_ptr<T[]> cpu_mem(new T[len]);
-  xpu_memcpy(
-      cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  XPU_CALL(xpu_memcpy(
+      cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST));
   std::unique_ptr<T[]> after_stride(new T[after_stride_len]);
   for (size_t i = 0; i < after_stride_len; ++i) {
     after_stride[i] = cpu_mem[i * stride];
...
@@ -19,11 +19,11 @@ namespace lite {
 void* TargetWrapperXPU::Malloc(size_t size) {
   void* ptr{nullptr};
-  xpu_malloc(&ptr, size);
+  XPU_CALL(xpu_malloc(&ptr, size));
   return ptr;
 }
-void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
+void TargetWrapperXPU::Free(void* ptr) { XPU_CALL(xpu_free(ptr)); }
 void TargetWrapperXPU::MemcpySync(void* dst,
                                   const void* src,
@@ -31,10 +31,10 @@ void TargetWrapperXPU::MemcpySync(void* dst,
                                   IoDirection dir) {
   switch (dir) {
     case IoDirection::HtoD:
-      xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
+      XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE));
       break;
     case IoDirection::DtoH:
-      xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
+      XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST));
      break;
    default:
      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
@@ -49,7 +49,7 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size,
   } else {
     ptr = TargetWrapperXPU::Malloc(size);
   }
-  CHECK(ptr != nullptr);
+  CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3;
   return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3));
 }
...
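The two hunks above establish the allocation idiom the rest of this PR migrates to. A minimal usage sketch, assuming only the Lite XPU APIs visible in this diff (Example() and host[] are illustrative, not part of the change):

// Hedged sketch: allocate a device staging buffer and fill it from the host.
void Example() {
  using paddle::lite::TargetWrapperXPU;
  // use_l3 now defaults to false, so this lands in XPU global memory.
  auto guard = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
  int host[64] = {0};
  // Every raw runtime call goes through XPU_CALL, which CHECKs the status.
  XPU_CALL(xpu_memcpy(
      guard->addr_, host, 64 * sizeof(int), XPU_HOST_TO_DEVICE));
  // The guard's deleter frees the device memory when it leaves scope,
  // unless the buffer lives in L3.
}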
@@ -16,11 +16,23 @@
 #include <memory>                                  // std::unique_ptr
 #include "lite/backends/xpu/xpu_header_sitter.h"  // xpu_free
-#include "lite/core/target_wrapper.h"
+#include "lite/core/target_wrapper.h"  // TargetWrapper
+#include "lite/utils/cp_logging.h"     // CHECK_EQ
+
+#define XPU_CALL(func)                                        \
+  {                                                           \
+    auto e = (func);                                          \
+    CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \
+  }
 namespace paddle {
 namespace lite {
+// MAX(lod.size()) = 64
+const int XPU_MAX_LOD_SIZE = 64;
+// MAX(lod[i + 1] - lod[i]) = 512
+const int XPU_MAX_LOD_SEQ_LEN = 512;
 using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
 struct XPUScratchPad {
@@ -33,7 +45,7 @@ struct XPUScratchPad {
 struct XPUScratchPadDeleter {
   void operator()(XPUScratchPad* sp) const {
     if (!sp->is_l3_) {
-      xpu_free(sp->addr_);
+      XPU_CALL(xpu_free(sp->addr_));
     }
     delete sp;
   }
@@ -55,7 +67,7 @@ class TargetWrapper<TARGET(kXPU)> {
                          size_t size,
                          IoDirection dir);
-  static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = true);
+  static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = false);
   static xdnn::Context* GetRawContext() {
     if (tls_raw_ctx_ == nullptr) {
@@ -77,11 +89,10 @@ class TargetWrapper<TARGET(kXPU)> {
   static void SetDev(int dev_no = 0) {
     const char* dev_env = getenv("LITE_XPU_DEV");
     if (dev_env) {
-      xpu_set_device(atoi(dev_env));
-      return;
+      dev_no = atoi(dev_env);
     }
-    xpu_set_device(dev_no);
+    XPU_CALL(xpu_set_device(dev_no));
   }
   static std::string multi_encoder_precision;  // NOLINT
...
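For reference, XPU_CALL simply CHECKs the runtime's integer status code. The call in SetDev above expands to roughly the following (the stringized expression is what appears in the fatal log message on failure):

// Approximate expansion of XPU_CALL(xpu_set_device(dev_no)).
{
  auto e = (xpu_set_device(dev_no));
  CHECK_EQ(e, 0) << "XPU: (" << "xpu_set_device(dev_no)" << ") returns " << e;
}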
@@ -31,11 +31,14 @@ void XPUEmbeddingWithEltwiseAddCompute::PrepareForRun() {
     CHECK_EQ(table_dims.size(), 2); /* shape like [table_len, embed_dim] */
     table_lens_cpu_.push_back(table_dims[0]);
   }
-  void* lens_ptr = nullptr;
   size_t lens_size = table_lens_cpu_.size() * sizeof(int);
-  xpu_malloc(&lens_ptr, lens_size);
-  xpu_memcpy(lens_ptr, &table_lens_cpu_[0], lens_size, XPU_HOST_TO_DEVICE);
-  table_lens_guard_.reset(lens_ptr);
+  table_lens_guard_ =
+      TargetWrapperXPU::MallocScratchPad(lens_size, false /* use_l3 */);
+  XPU_CALL(xpu_memcpy(table_lens_guard_->addr_,
+                      &table_lens_cpu_[0],
+                      lens_size,
+                      XPU_HOST_TO_DEVICE));
 }
 void XPUEmbeddingWithEltwiseAddCompute::Run() {
@@ -55,16 +58,16 @@ void XPUEmbeddingWithEltwiseAddCompute::Run() {
   int embed_dim = table_dims[1];
   int emb_layer_num = param.Ids.size();
   int r = xdnn::embedding_with_ewadd<float, int64_t, false, false>(
       ctx.GetRawContext(),                         /* context */
       embed_dim,                                   /* embed_dim */
       idx_len,                                     /* idx_len */
       emb_layer_num,                               /* emb_layer_num */
       param.padding_idx,                           /* padding_idx */
       &arg_tables_[0],                             /* tables */
       &arg_ids_[0],                                /* indices */
-      static_cast<int*>(table_lens_guard_.get()),  /* table_lens */
+      static_cast<int*>(table_lens_guard_->addr_), /* table_lens */
       nullptr,                                     /* scale_after_emb */
       nullptr,                                     /* scale_after_ewadd */
       param.Out->mutable_data<float>(TARGET(kXPU)) /* top */);
   CHECK_EQ(r, 0);
 }
...
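The same before/after shape recurs in every kernel this PR touches; schematically (n and host_src are placeholders, the rest are the APIs shown in this diff):

// Before: raw allocation, unchecked calls, ad-hoc deleter.
void* buf = nullptr;
xpu_malloc(&buf, n * sizeof(int));  // return code silently dropped
xpu_memcpy(buf, host_src, n * sizeof(int), XPU_HOST_TO_DEVICE);
std::unique_ptr<void, XPUFreeDeleter> guard(buf);

// After: scratch-pad guard with explicit L3 choice, every call checked.
XPUScratchPadGuard guard2 =
    TargetWrapperXPU::MallocScratchPad(n * sizeof(int), false /* use_l3 */);
XPU_CALL(xpu_memcpy(
    guard2->addr_, host_src, n * sizeof(int), XPU_HOST_TO_DEVICE));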
@@ -14,10 +14,9 @@
 #pragma once
-#include <memory>
 #include <vector>
+#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard
 #include "lite/core/kernel.h"
-#include "lite/kernels/xpu/utils.h"  // XPUFreeDeleter
 namespace paddle {
 namespace lite {
@@ -36,7 +35,7 @@ class XPUEmbeddingWithEltwiseAddCompute
  private:
  std::vector<const int64_t*> arg_ids_;
  std::vector<const float*> arg_tables_;
-  std::unique_ptr<void, XPUFreeDeleter> table_lens_guard_;
+  XPUScratchPadGuard table_lens_guard_;
  std::vector<int> table_lens_cpu_;
 };
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include <vector>
 #include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/core/kernel.h"
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include <vector>
 #include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/core/kernel.h"
...
@@ -22,16 +22,19 @@ namespace kernels {
 namespace xpu {
 void XPUMmdnnSearchAttentionCompute::PrepareForRun() {
-  offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  pad_begin_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  w_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(8 * sizeof(float));
+  offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  pad_begin_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  w_max_xpu_guard_ =
+      TargetWrapperXPU::MallocScratchPad(8 * sizeof(float), false /* use_l3 */);
   buffer_at_l3_guard_ = TargetWrapperXPU::MallocScratchPad(
       5 * L3_SLOT_SIZE * sizeof(float), false /* use_l3 */);
   buffer_at_gm_guard_ = TargetWrapperXPU::MallocScratchPad(
       5 * GM_SLOT_SIZE * sizeof(float), false /* use_l3 */);
-  offset_cpu.reset(new int[64]);
-  pad_begin_cpu.reset(new int[64]);
+  offset_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  pad_begin_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
 }
 void XPUMmdnnSearchAttentionCompute::Run() {
@@ -72,18 +75,18 @@ void XPUMmdnnSearchAttentionCompute::Run() {
   }
   offset_cpu[batch] = offset[batch];
-  xpu_memcpy(offset_xpu_guard_->addr_,
-             offset_cpu.get(),
-             offset.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-  xpu_memcpy(pad_begin_xpu_guard_->addr_,
-             pad_begin_cpu.get(),
-             batch * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-  xpu_memcpy(w_max_xpu_guard_->addr_,
-             maxs_cpu,
-             8 * sizeof(float),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(offset_xpu_guard_->addr_,
+                      offset_cpu.get(),
+                      offset.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  XPU_CALL(xpu_memcpy(pad_begin_xpu_guard_->addr_,
+                      pad_begin_cpu.get(),
+                      batch * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  XPU_CALL(xpu_memcpy(w_max_xpu_guard_->addr_,
+                      maxs_cpu,
+                      8 * sizeof(float),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   int* offset_xpu = reinterpret_cast<int*>(offset_xpu_guard_->addr_);
   int* pad_begin_xpu = reinterpret_cast<int*>(pad_begin_xpu_guard_->addr_);
@@ -115,90 +118,99 @@ void XPUMmdnnSearchAttentionCompute::Run() {
   }
   const auto* bottom_data = X->data<float>();
-  xdnn::search_sequence_pad_depad(ctx.GetRawContext(),
-                                  const_cast<float*>(bottom_data),
-                                  group_padding_output,
-                                  offset_xpu,
-                                  max_seq,
-                                  batch,
-                                  dim1,
-                                  0);  // is_depad = 0
+  int r = 0;
+  r = xdnn::search_sequence_pad_depad(ctx.GetRawContext(),
+                                      const_cast<float*>(bottom_data),
+                                      group_padding_output,
+                                      offset_xpu,
+                                      max_seq,
+                                      batch,
+                                      dim1,
+                                      0);  // is_depad = 0
+  CHECK_EQ(r, 0);
   // do-findmax
-  xdnn::findmax<float>(ctx.GetRawContext(),
-                       group_padding_output,
-                       batch * max_seq * dim1,
-                       maxs_xpu);
-  xdnn::gemm_int16_maxptr<float, int16_t, float>(
-      ctx.GetRawContext(),
-      false,
-      true,  // trans_a, trans_b
-      batch * max_seq,
-      dim1,
-      dim1,  // m, n, k
-      1.0f,
-      group_padding_output,
-      dim1,  // alpha, data_a, lda
-      w_data,
-      dim1,
-      0.0f,  // data_b, ldb, beta
-      seq_fc_output,
-      dim1,
-      b_data,  // data_c, ldc, bias
-      xdnn::Activation_t::LINEAR,
-      maxs_xpu,
-      maxs_xpu + 4,
-      nullptr);  // max_a, max_b, max_c
-  xdnn::search_aligned_mat_mul(ctx.GetRawContext(),
-                               0,
-                               1,
-                               batch,
-                               max_seq,
-                               max_seq,
-                               dim1,
-                               alpha0,
-                               group_padding_output,
-                               dim1,
-                               seq_fc_output,
-                               dim1,
-                               batchgemm0_output,
-                               max_seq);
-  xdnn::search_pad_mask(ctx.GetRawContext(),
-                        batchgemm0_output,
-                        attention_output,
-                        pad_begin_xpu,
-                        batch,
-                        max_seq,
-                        max_seq,
-                        batch,
-                        mask);
-  xdnn::softmax2d_forward(ctx.GetRawContext(),
-                          attention_output,
-                          seq_softmax_output,
-                          batch * max_seq,
-                          max_seq,
-                          true);
-  xdnn::search_aligned_mat_mul(ctx.GetRawContext(),
-                               0,
-                               0,
-                               batch,
-                               max_seq,
-                               dim1,
-                               max_seq,
-                               alpha1,
-                               seq_softmax_output,
-                               max_seq,
-                               group_padding_output,
-                               dim1,
-                               batchgemm1_output,
-                               dim1);
-  xdnn::search_sequence_pad_depad(ctx.GetRawContext(),
-                                  top_data,
-                                  batchgemm1_output,
-                                  offset_xpu,
-                                  max_seq,
-                                  batch,
-                                  dim1,
-                                  1);  // is_depad = 1
+  r = xdnn::findmax<float>(ctx.GetRawContext(),
+                           group_padding_output,
+                           batch * max_seq * dim1,
+                           maxs_xpu);
+  CHECK_EQ(r, 0);
+  r = xdnn::gemm_int16_maxptr<float, int16_t, float>(
+      ctx.GetRawContext(),         /* ctx */
+      false,                       /* trans_a */
+      true,                        /* trans_b */
+      batch * max_seq,             /* m */
+      dim1,                        /* n */
+      dim1,                        /* k */
+      1.0f,                        /* alpha */
+      group_padding_output,        /* data_a */
+      dim1,                        /* lda */
+      w_data,                      /* data_b */
+      dim1,                        /* ldb */
+      0.0f,                        /* beta */
+      seq_fc_output,               /* data_c */
+      dim1,                        /* ldc */
+      b_data,                      /* bias */
+      xdnn::Activation_t::LINEAR,  /* act */
+      maxs_xpu,                    /* max_a */
+      maxs_xpu + 4,                /* max_b */
+      nullptr /* max_c */);
+  CHECK_EQ(r, 0);
+  r = xdnn::search_aligned_mat_mul(ctx.GetRawContext(),
+                                   0,
+                                   1,
+                                   batch,
+                                   max_seq,
+                                   max_seq,
+                                   dim1,
+                                   alpha0,
+                                   group_padding_output,
+                                   dim1,
+                                   seq_fc_output,
+                                   dim1,
+                                   batchgemm0_output,
+                                   max_seq);
+  CHECK_EQ(r, 0);
+  r = xdnn::search_pad_mask(ctx.GetRawContext(),
+                            batchgemm0_output,
+                            attention_output,
+                            pad_begin_xpu,
+                            batch,
+                            max_seq,
+                            max_seq,
+                            batch,
+                            mask);
+  CHECK_EQ(r, 0);
+  r = xdnn::softmax2d_forward(ctx.GetRawContext(),
+                              attention_output,
+                              seq_softmax_output,
+                              batch * max_seq,
+                              max_seq,
+                              true);
+  CHECK_EQ(r, 0);
+  r = xdnn::search_aligned_mat_mul(ctx.GetRawContext(),
+                                   0,
+                                   0,
+                                   batch,
+                                   max_seq,
+                                   dim1,
+                                   max_seq,
+                                   alpha1,
+                                   seq_softmax_output,
+                                   max_seq,
+                                   group_padding_output,
+                                   dim1,
+                                   batchgemm1_output,
+                                   dim1);
+  CHECK_EQ(r, 0);
+  r = xdnn::search_sequence_pad_depad(ctx.GetRawContext(),
+                                      top_data,
+                                      batchgemm1_output,
+                                      offset_xpu,
+                                      max_seq,
+                                      batch,
+                                      dim1,
+                                      1);  // is_depad = 1
+  CHECK_EQ(r, 0);
 }
 }  // namespace xpu
...
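Beyond the XPU_CALL wrapping, the hunk above also shows the LoD-staging idiom that the new XPU_MAX_LOD_SIZE constant bounds. A hedged distillation of that pattern (tensor and member names are illustrative, taken from the kernels in this diff):

// Lite stores LoD offsets as size_t on the host; xdnn kernels want int32 on
// the device, so each kernel stages through a fixed-size CPU array whose
// capacity XPU_MAX_LOD_SIZE documents.
const auto& lod = x->lod()[0];  // e.g. {0, 3, 5} for two sequences
CHECK_LE(lod.size(), static_cast<size_t>(XPU_MAX_LOD_SIZE));
for (size_t i = 0; i < lod.size(); ++i) {
  offset_cpu[i] = static_cast<int>(lod[i]);
}
XPU_CALL(xpu_memcpy(offset_xpu_guard_->addr_,
                    offset_cpu.get(),
                    lod.size() * sizeof(int),
                    XPUMemcpyKind::XPU_HOST_TO_DEVICE));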
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -29,12 +29,13 @@ void LookupTableCompute::Run() {
   int embed_dim = param.W->dims()[1];
   int r = xdnn::embedding<float, int64_t>(
       ctx.GetRawContext(),                          /* context */
       num,                                          /* num */
       param.Ids->data<int64_t>(),                   /* indices */
       embed_dim,                                    /* embed_dim */
       param.W->data<float>(),                       /* table */
-      param.Out->mutable_data<float>(TARGET(kXPU)) /* top */);
+      param.Out->mutable_data<float>(TARGET(kXPU)), /* top */
+      param.padding_idx /* padding_idx */);
   CHECK_EQ(r, 0);
 }
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -23,12 +23,15 @@ namespace kernels {
 namespace xpu {
 void MatchMatrixTensorCompute::PrepareForRun() {
-  wx_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  offset_l_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  offset_r_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  offset_l_cpu.reset(new int[64]);
-  offset_r_cpu.reset(new int[64]);
+  wx_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  offset_l_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  offset_r_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  offset_l_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  offset_r_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
 }
 void MatchMatrixTensorCompute::Run() {
@@ -76,25 +79,25 @@ void MatchMatrixTensorCompute::Run() {
   int* offset_r_xpu = reinterpret_cast<int*>(offset_r_xpu_guard_->addr_);
   int r = xdnn::gemm_int16_tmp_api<float, int16_t, float>(
-      ctx.GetRawContext(), /* ctx */
-      false,
-      false, /* trans_a, trans_b */
-      x->dims()[0],
-      dim_t * dim_in,
-      dim_in, /* m, n, k */
-      1.0f,
-      bottom_l_data,
-      dim_in, /* alpha, data_a, lda */
-      w_data,
-      dim_t * dim_in,
-      0.0f, /* data_b, ldb, beta */
-      bottom_l_trans_data,
-      dim_t * dim_in, /* data_c, ldc */
-      nullptr, /* bias */
-      xdnn::Activation_t::LINEAR,
-      0.0f,
-      w_max,
-      wx_max /* max_a, max_b, max_c */);
+      ctx.GetRawContext(),         /* ctx */
+      false,                       /* trans_a */
+      false,                       /* trans_b */
+      x->dims()[0],                /* m */
+      dim_t * dim_in,              /* n */
+      dim_in,                      /* k */
+      1.0f,                        /* alpha */
+      bottom_l_data,               /* data_a */
+      dim_in,                      /* lda */
+      w_data,                      /* data_b */
+      dim_t * dim_in,              /* ldb */
+      0.0f,                        /* beta */
+      bottom_l_trans_data,         /* data_c */
+      dim_t * dim_in,              /* ldc */
+      nullptr,                     /* bias */
+      xdnn::Activation_t::LINEAR,  /* act */
+      0.0f,                        /* max_a */
+      w_max,                       /* max_b */
+      wx_max /* max_c */);
   CHECK_EQ(r, 0);
   int max_width = 0;
@@ -110,14 +113,14 @@ void MatchMatrixTensorCompute::Run() {
       max_width = offset_r_cpu[i] - offset_r_cpu[i - 1];
     }
   }
-  xpu_memcpy(offset_l_xpu,
-             offset_l_cpu.get(),
-             offset_l.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-  xpu_memcpy(offset_r_xpu,
-             offset_r_cpu.get(),
-             offset_r.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(offset_l_xpu,
+                      offset_l_cpu.get(),
+                      offset_l.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  XPU_CALL(xpu_memcpy(offset_r_xpu,
+                      offset_r_cpu.get(),
+                      offset_r.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   r = xdnn::match_matrix_tensor(ctx.GetRawContext(),
                                 batch_size,
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -23,7 +23,8 @@ namespace kernels {
 namespace xpu {
 void SearchFcCompute::PrepareForRun() {
-  maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(float));
+  maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(float), false /* use_l3 */);
 }
 void SearchFcCompute::Run() {
@@ -59,34 +60,34 @@ void SearchFcCompute::Run() {
   float* maxs_xpu = reinterpret_cast<float*>(maxs_xpu_guard_->addr_);
   float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, w_max, 0.0f, 0.0f, 0.0f};
-  xpu_memcpy(maxs_xpu,
-             &maxs_cpu[0],
-             8 * sizeof(float),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(maxs_xpu,
+                      &maxs_cpu[0],
+                      8 * sizeof(float),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   int r = xdnn::findmax<float>(
       ctx.GetRawContext(), bottom_data, batch * _in, maxs_xpu);
   CHECK_EQ(r, 0);
   r = xdnn::gemm_int16_maxptr<float, int16_t, float>(
-      ctx.GetRawContext(), /* ctx */
-      false,
-      true, /*trans_a, trans_b*/
-      batch,
-      _out,
-      _in, /*m, n, k*/
-      1.0f,
-      bottom_data,
-      _in, /*alpha, data_a, lda*/
-      weights,
-      _in,
-      0.0f, /*data_b, ldb, beta*/
-      top_data,
-      _out,
-      bias_data, /* data_c, ldc, bias*/
-      act,
-      maxs_xpu,
-      maxs_xpu + 4,
-      nullptr /*act, max_a, max_b, max_c*/);
+      ctx.GetRawContext(),  /* ctx */
+      false,                /* trans_a */
+      true,                 /* trans_b */
+      batch,                /* m */
+      _out,                 /* n */
+      _in,                  /* k */
+      1.0f,                 /* alpha */
+      bottom_data,          /* data_a */
+      _in,                  /* lda */
+      weights,              /* data_b */
+      _in,                  /* ldb */
+      0.0f,                 /* beta */
+      top_data,             /* data_c */
+      _out,                 /* ldc */
+      bias_data,            /* bias */
+      act,                  /* act */
+      maxs_xpu,             /* max_a */
+      maxs_xpu + 4,         /* max_b */
+      nullptr /* max_c */);
   CHECK_EQ(r, 0);
 }
...
@@ -24,13 +24,16 @@ namespace kernels {
 namespace xpu {
 void SearchGrnnCompute::PrepareForRun() {
-  offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  new_offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(256 * sizeof(int));
-  maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(16 * sizeof(float));
-  idx_sorted_by_width_data_cpu.reset(new int[64]);
-  offset_cpu.reset(new int[64]);
-  new_offset_cpu.reset(new int[256]);
+  offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  new_offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SEQ_LEN * sizeof(int), false /* use_l3 */);
+  maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(16 * sizeof(float),
+                                                       false /* use_l3 */);
+  idx_sorted_by_width_data_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  offset_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  new_offset_cpu.reset(new int[XPU_MAX_LOD_SEQ_LEN]);
 }
 void SearchGrnnCompute::prepare_layout(const operators::SearchGrnnParam& param,
@@ -96,10 +99,10 @@ void SearchGrnnCompute::prepare_layout(const operators::SearchGrnnParam& param,
     layout_input->Resize({dim0, dim1});
   }
-  xpu_memcpy(idx_sorted_by_width->mutable_data<int>(TARGET(kXPU)),
-             idx_sorted_by_width_data_cpu.get(),
-             idx_sorted_by_width->numel() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(idx_sorted_by_width->mutable_data<int>(TARGET(kXPU)),
+                      idx_sorted_by_width_data_cpu.get(),
+                      idx_sorted_by_width->numel() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
 }
 void SearchGrnnCompute::Run() {
@@ -156,14 +159,14 @@ void SearchGrnnCompute::Run() {
   for (size_t i = 0; i < new_offset.size(); ++i) {
     new_offset_cpu[i] = new_offset[i];
   }
-  xpu_memcpy(offset_xpu,
-             offset_cpu.get(),
-             offset.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-  xpu_memcpy(new_offset_xpu,
-             new_offset_cpu.get(),
-             new_offset.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(offset_xpu,
+                      offset_cpu.get(),
+                      offset.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  XPU_CALL(xpu_memcpy(new_offset_xpu,
+                      new_offset_cpu.get(),
+                      new_offset.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   int r = xdnn::search_seq2batch(ctx.GetRawContext(),
                                  batch,
@@ -200,10 +203,10 @@ void SearchGrnnCompute::Run() {
                       0.0f,
                       0.0f,
                       0.0f};
-  xpu_memcpy(maxs_xpu,
-             maxs_cpu,
-             16 * sizeof(float),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(maxs_xpu,
+                      maxs_cpu,
+                      16 * sizeof(float),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   r = xdnn::findmax<float>(
       ctx.GetRawContext(), new_emb, cap_l * cap_e, maxs_xpu);
   CHECK_EQ(r, 0);
...
@@ -37,44 +37,54 @@ void SequenceArithmeticCompute::Run() {
   const auto* bottom_data1 = bottom1->data<float>();
   auto* top_data = top->mutable_data<float>(TARGET(kXPU));
+  int r = 0;
   switch (op_type) {
     case 1:  // addition: top[0] = bottom[0] + bottom[1]
       if (len1 > len2) {
-        xdnn::elementwise_add(
-            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2);
-        xdnn::memcpy_device(ctx.GetRawContext(),
-                            &top_data[len2],
-                            &bottom_data0[len2],
-                            (len1 - len2) * sizeof(float));
+        r = xdnn::elementwise_add(
+            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2);
+        CHECK_EQ(r, 0);
+        r = xdnn::memcpy_device(ctx.GetRawContext(),
+                                &top_data[len2],
+                                &bottom_data0[len2],
+                                (len1 - len2) * sizeof(float));
+        CHECK_EQ(r, 0);
       } else {
-        xdnn::elementwise_add(
-            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1);
+        r = xdnn::elementwise_add(
+            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1);
+        CHECK_EQ(r, 0);
       }
       break;
     case 2:  // substraction: top[0] = bottom[0] - bottom[1]
       if (len1 > len2) {
-        xdnn::elementwise_sub(
-            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2);
-        xdnn::memcpy_device(ctx.GetRawContext(),
-                            &top_data[len2],
-                            &bottom_data0[len2],
-                            (len1 - len2) * sizeof(float));
+        r = xdnn::elementwise_sub(
+            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2);
+        CHECK_EQ(r, 0);
+        r = xdnn::memcpy_device(ctx.GetRawContext(),
+                                &top_data[len2],
+                                &bottom_data0[len2],
+                                (len1 - len2) * sizeof(float));
+        CHECK_EQ(r, 0);
      } else {
-        xdnn::elementwise_sub(
-            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1);
+        r = xdnn::elementwise_sub(
+            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1);
+        CHECK_EQ(r, 0);
      }
      break;
    case 3:  // multiplication: top[0] = bottom[0] * bottom[1]
      if (len1 > len2) {
-        xdnn::elementwise_mul(
-            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2);
-        xdnn::memcpy_device(ctx.GetRawContext(),
-                            &top_data[len2],
-                            &bottom_data0[len2],
-                            (len1 - len2) * sizeof(float));
+        r = xdnn::elementwise_mul(
+            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2);
+        CHECK_EQ(r, 0);
+        r = xdnn::memcpy_device(ctx.GetRawContext(),
+                                &top_data[len2],
+                                &bottom_data0[len2],
+                                (len1 - len2) * sizeof(float));
+        CHECK_EQ(r, 0);
      } else {
-        xdnn::elementwise_mul(
-            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1);
+        r = xdnn::elementwise_mul(
+            ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1);
+        CHECK_EQ(r, 0);
      }
      break;
    default:
...
@@ -23,11 +23,13 @@ namespace kernels {
 namespace xpu {
 void SequenceConcatCompute::PrepareForRun() {
-  lod0_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  lod1_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  lod0_cpu.reset(new int[64]);
-  lod1_cpu.reset(new int[64]);
+  lod0_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  lod1_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  lod0_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  lod1_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
 }
 template <typename T>
@@ -106,14 +108,14 @@ void SequenceConcatCompute::Run() {
   for (int i = 0; i < lod1.size(); ++i) {
     lod1_cpu[i] = lod1[i];
   }
-  xpu_memcpy(lod0_xpu,
-             lod0_cpu.get(),
-             lod0.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-  xpu_memcpy(lod1_xpu,
-             lod1_cpu.get(),
-             lod1.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(lod0_xpu,
+                      lod0_cpu.get(),
+                      lod0.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  XPU_CALL(xpu_memcpy(lod1_xpu,
+                      lod1_cpu.get(),
+                      lod1.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   int r = xdnn::sequence_concat(ctx.GetRawContext(),
                                 xs[0]->data<float>(),
...
@@ -23,8 +23,9 @@ namespace kernels {
 namespace xpu {
 void XPUSequencePoolCompute::PrepareForRun() {
-  lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  lod_cpu.reset(new int[64]);
+  lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
 }
 void XPUSequencePoolCompute::Run() {
@@ -55,10 +56,10 @@ void XPUSequencePoolCompute::Run() {
     lod_cpu[i] = in_lod[i];
   }
   int* lod_xpu = reinterpret_cast<int*>(lod_xpu_guard_->addr_);
-  xpu_memcpy(lod_xpu,
-             lod_cpu.get(),
-             in_lod.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(lod_xpu,
+                      lod_cpu.get(),
+                      in_lod.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   int r =
       xdnn::sequence_pooling_forward(ctx.GetRawContext(),
...
@@ -23,8 +23,9 @@ namespace xpu {
 template <typename T, PrecisionType PType>
 void SequenceReverseCompute<T, PType>::PrepareForRun() {
-  lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  lod_cpu.reset(new int[64]);
+  lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
 }
 template <typename T, PrecisionType PType>
@@ -58,10 +59,10 @@ void SequenceReverseCompute<T, PType>::Run() {
     lod_cpu[i] = lod[i];
   }
   int* lod_xpu = reinterpret_cast<int*>(lod_xpu_guard_->addr_);
-  xpu_memcpy(lod_xpu,
-             lod_cpu.get(),
-             lod.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(lod_xpu,
+                      lod_cpu.get(),
+                      lod.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   int r = xdnn::sequence_reverse(ctx.GetRawContext(),
                                  batch_size,
...
@@ -23,10 +23,11 @@ namespace kernels {
 namespace xpu {
 void SequenceTopkAvgPoolingCompute::PrepareForRun() {
-  lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(256 * sizeof(int));
-  in_lod_cpu.reset(new int[64]);
-  row_lod_cpu.reset(new int[64]);
-  col_lod_cpu.reset(new int[64]);
+  lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      4 * XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  in_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  row_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  col_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
 }
 void SequenceTopkAvgPoolingCompute::Run() {
@@ -81,22 +82,22 @@ void SequenceTopkAvgPoolingCompute::Run() {
   for (int i = 0; i < col_lod.size(); ++i) {
     col_lod_cpu[i] = col_lod[i];
   }
-  xpu_memcpy(in_lod_xpu,
-             in_lod_cpu.get(),
-             in_lod.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-  xpu_memcpy(row_lod_xpu,
-             row_lod_cpu.get(),
-             row_lod.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-  xpu_memcpy(col_lod_xpu,
-             col_lod_cpu.get(),
-             col_lod.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-  xpu_memcpy(topks_xpu,
-             topks.data(),
-             topks.size() * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(in_lod_xpu,
+                      in_lod_cpu.get(),
+                      in_lod.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  XPU_CALL(xpu_memcpy(row_lod_xpu,
+                      row_lod_cpu.get(),
+                      row_lod.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  XPU_CALL(xpu_memcpy(col_lod_xpu,
+                      col_lod_cpu.get(),
+                      col_lod.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  XPU_CALL(xpu_memcpy(topks_xpu,
+                      topks.data(),
+                      topks.size() * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   int r = xdnn::sequence_topk_avg_pooling(ctx.GetRawContext(),
                                           in_data,
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include "lite/core/kernel.h"
 namespace paddle {
...
@@ -25,9 +25,8 @@ void StackCompute::PrepareForRun() {
   auto& param = this->Param<param_t>();
   int n = param.X.size();
-  void* x_ptr = nullptr;
-  xpu_malloc(&x_ptr, n * 8 /* sizeof(__global__ float*) */);
-  x_ptr_guard_.reset(x_ptr);
+  x_ptr_guard_ = TargetWrapperXPU::MallocScratchPad(
+      n * 8 /* sizeof(__global__ float*) */, false /* use_l3 */);
   x_ptr_cpu_.reserve(n);
 }
@@ -47,14 +46,15 @@ void StackCompute::Run() {
   for (int i = 0; i < n; ++i) {
     x_ptr_cpu_[i] = param.X[i]->data<float>();
   }
-  xpu_memcpy(x_ptr_guard_.get(), &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(
+      x_ptr_guard_->addr_, &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE));
   int r = xdnn::stack_forward(
       ctx.GetRawContext(), /* context */
       height,              /* height */
       width,               /* width */
       n,                   /* n */
-      x_ptr_guard_.get(),  /* x_ptr */
+      x_ptr_guard_->addr_, /* x_ptr */
       param.Out->mutable_data<float>(TARGET(kXPU)) /* out */);
   CHECK_EQ(r, 0);
 }
...
@@ -14,10 +14,9 @@
 #pragma once
-#include <memory>
 #include <vector>
+#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard
 #include "lite/core/kernel.h"
-#include "lite/kernels/xpu/utils.h"  // XPUFreeDeleter
 namespace paddle {
 namespace lite {
@@ -35,7 +34,7 @@ class StackCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
   virtual ~StackCompute() = default;
  private:
-  std::unique_ptr<void, XPUFreeDeleter> x_ptr_guard_;
+  XPUScratchPadGuard x_ptr_guard_;
   std::vector<const float*> x_ptr_cpu_;
 };
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
struct XPUFreeDeleter {
void operator()(void* p) const { xpu_free(p); }
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
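The header above (lite/kernels/xpu/utils.h, judging by its former include sites) appears to be removed by this PR: both of its users now hold an XPUScratchPadGuard instead. For contrast, the replacement deleter from target_wrapper.h earlier in this diff checks the free's return code and respects L3 placement, which XPUFreeDeleter did not:

// Replacement for XPUFreeDeleter (copied from target_wrapper.h above).
struct XPUScratchPadDeleter {
  void operator()(XPUScratchPad* sp) const {
    if (!sp->is_l3_) {
      XPU_CALL(xpu_free(sp->addr_));  // status checked, L3 memory skipped
    }
    delete sp;
  }
};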
@@ -23,10 +23,12 @@ namespace kernels {
 namespace xpu {
 void VarConv2DCompute::PrepareForRun() {
-  offset_x_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  offset_y_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int));
-  offset_x_cpu.reset(new int[64]);
-  offset_y_cpu.reset(new int[64]);
+  offset_x_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  offset_y_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  offset_x_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  offset_y_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
 }
 void VarConv2DCompute::Run() {
@@ -94,14 +96,14 @@ void VarConv2DCompute::Run() {
     offset_x_cpu[i] = offset_x[i];
     offset_y_cpu[i] = offset_y[i];
   }
-  xpu_memcpy(offset_x_xpu,
-             offset_x_cpu.get(),
-             (batch + 1) * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-  xpu_memcpy(offset_y_xpu,
-             offset_y_cpu.get(),
-             (batch + 1) * sizeof(int),
-             XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  XPU_CALL(xpu_memcpy(offset_x_xpu,
+                      offset_x_cpu.get(),
+                      (batch + 1) * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+  XPU_CALL(xpu_memcpy(offset_y_xpu,
+                      offset_y_cpu.get(),
+                      (batch + 1) * sizeof(int),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   int r = xdnn::search_varconv<float, int16_t>(ctx.GetRawContext(),
                                                batch,
...
@@ -88,6 +88,78 @@ bool XPUMmdnnBidEmbGrnnAttOp::AttachImpl(const cpp::OpDesc& op_desc,
   return true;
 }
+bool XPUMmdnnBidEmbGrnnAttOp2::CheckShape() const { return true; }
+bool XPUMmdnnBidEmbGrnnAttOp2::InferShapeImpl() const {
+  auto& id_dims = param_.id0->dims();
+  auto& id_lod = param_.id0->lod()[0];
+  auto& emb_tbl_dims = param_.emb_tbl->dims();
+  auto& grnn_wh_dims = param_.grnn_rv_wh->dims();
+  param_.emb0_out->Resize({id_dims[0], emb_tbl_dims[1]});
+  param_.emb0_out->set_lod({id_lod});
+  param_.grnn_fw_pool_out->Resize(
+      {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]});
+  param_.grnn_rv_pool_out->Resize(
+      {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]});
+  param_.att_pool_out->Resize(
+      {(int64_t)id_lod.size() - 1, 2 * grnn_wh_dims[2]});
+  param_.concat_3in1_out->Resize({id_dims[0], 3 * grnn_wh_dims[2]});
+  param_.concat_3in1_out->set_lod({id_lod});
+  param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]});
+  param_.emb_fw_out->set_lod({id_lod});
+  return true;
+}
+bool XPUMmdnnBidEmbGrnnAttOp2::AttachImpl(const cpp::OpDesc& op_desc,
+                                          lite::Scope* scope) {
+  param_.id0 =
+      scope->FindVar(op_desc.Input("id0").front())->GetMutable<lite::Tensor>();
+  param_.id1 =
+      scope->FindVar(op_desc.Input("id1").front())->GetMutable<lite::Tensor>();
+  param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front())
+                       ->GetMutable<lite::Tensor>();
+  param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front())
+                          ->GetMutable<lite::Tensor>();
+  param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front())
+                          ->GetMutable<lite::Tensor>();
+  param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front())
+                          ->GetMutable<lite::Tensor>();
+  param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front())
+                          ->GetMutable<lite::Tensor>();
+  param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front())
+                        ->GetMutable<lite::Tensor>();
+  param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front())
+                        ->GetMutable<lite::Tensor>();
+  param_.emb0_out = scope->FindVar(op_desc.Output("emb0_out").front())
+                        ->GetMutable<lite::Tensor>();
+  param_.grnn_fw_pool_out =
+      scope->FindVar(op_desc.Output("grnn_fw_pool_out").front())
+          ->GetMutable<lite::Tensor>();
+  param_.grnn_rv_pool_out =
+      scope->FindVar(op_desc.Output("grnn_rv_pool_out").front())
+          ->GetMutable<lite::Tensor>();
+  param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front())
+                            ->GetMutable<lite::Tensor>();
+  param_.concat_3in1_out =
+      scope->FindVar(op_desc.Output("concat_3in1_out").front())
+          ->GetMutable<lite::Tensor>();
+  param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front())
+                          ->GetMutable<lite::Tensor>();
+  param_.grnn_fw_wh_maxs =
+      op_desc.GetAttr<std::vector<float>>("grnn_fw_wh_maxs");
+  param_.grnn_fw_wi_maxs =
+      op_desc.GetAttr<std::vector<float>>("grnn_fw_wi_maxs");
+  param_.grnn_rv_wh_maxs =
+      op_desc.GetAttr<std::vector<float>>("grnn_rv_wh_maxs");
+  param_.grnn_rv_wi_maxs =
+      op_desc.GetAttr<std::vector<float>>("grnn_rv_wi_maxs");
+  param_.att_fc_w_max = op_desc.GetAttr<float>("att_fc_w_max");
+  return true;
+}
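To make the shape inference in XPUMmdnnBidEmbGrnnAttOp2::InferShapeImpl concrete, a worked example with hypothetical sizes:

// Assume id0 lod = {0, 3, 5} (two sequences, so id_dims[0] = 5),
// emb_tbl dims = [vocab, 128], and grnn_rv_wh dims = [?, ?, 128]. Then:
//   emb0_out:         {5, 128}   id_dims[0] x emb_tbl_dims[1]
//   grnn_fw_pool_out: {2, 128}   (id_lod.size() - 1) x grnn_wh_dims[2]
//   grnn_rv_pool_out: {2, 128}
//   att_pool_out:     {2, 256}   (id_lod.size() - 1) x 2 * grnn_wh_dims[2]
//   concat_3in1_out:  {5, 384}   id_dims[0] x 3 * grnn_wh_dims[2]
//   emb_fw_out:       {5, 128}   id_dims[0] x emb_tbl_dims[1]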
 bool XPUMmdnnBidEmbAttOp::CheckShape() const { return true; }
 bool XPUMmdnnBidEmbAttOp::InferShapeImpl() const {
@@ -157,6 +229,7 @@ bool XPUMmdnnMatchConvTopkOp::AttachImpl(const cpp::OpDesc& op_desc,
   param_.input_w_max = op_desc.GetAttr<float>("input_w_max");
   param_.conv_w_max = op_desc.GetAttr<float>("conv_w_max");
   param_.topks = op_desc.GetAttr<std::vector<int>>("topks");
+  param_.output_channel = op_desc.GetAttr<int>("output_channel");
   param_.channel_num = op_desc.GetAttr<int>("channel_num");
   param_.dim_t = op_desc.GetAttr<int>("dim_t");
   return true;
@@ -182,10 +255,10 @@ bool XPUMmdnnMergeAllOp::AttachImpl(const cpp::OpDesc& op_desc,
     auto t = scope->FindVar(name)->GetMutable<lite::Tensor>();
     param_.concat_7in1_x.push_back(t);
   }
-  param_.concat_2in1_x.clear();
-  for (auto& name : op_desc.Input("concat_2in1_x")) {
+  param_.concat_topk_x.clear();
+  for (auto& name : op_desc.Input("concat_topk_x")) {
     auto t = scope->FindVar(name)->GetMutable<lite::Tensor>();
-    param_.concat_2in1_x.push_back(t);
+    param_.concat_topk_x.push_back(t);
   }
   param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front())
                           ->GetMutable<lite::Tensor>();
@@ -231,6 +304,8 @@ bool XPUMmdnnMergeAllOp::AttachImpl(const cpp::OpDesc& op_desc,
 REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att,
                  paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp);
+REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att2,
+                 paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp2);
 REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_att,
                  paddle::lite::operators::XPUMmdnnBidEmbAttOp);
 REGISTER_LITE_OP(__xpu__mmdnn_match_conv_topk,
...
@@ -41,6 +41,29 @@ class XPUMmdnnBidEmbGrnnAttOp : public OpLite {
   mutable XPUMmdnnBidEmbGrnnAttParam param_;
 };
+class XPUMmdnnBidEmbGrnnAttOp2 : public OpLite {
+ public:
+  XPUMmdnnBidEmbGrnnAttOp2() {}
+  explicit XPUMmdnnBidEmbGrnnAttOp2(const std::string &op_type)
+      : OpLite(op_type) {}
+  bool CheckShape() const override;
+  bool InferShapeImpl() const override;
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override {
+    return "XPUMmdnnBidEmbGrnnAttOp2";
+  }
+ private:
+  mutable XPUMmdnnBidEmbGrnnAttParam2 param_;
+};
 class XPUMmdnnBidEmbAttOp : public OpLite {
  public:
  XPUMmdnnBidEmbAttOp() {}
...
@@ -1627,11 +1627,36 @@ struct XPUMmdnnBidEmbGrnnAttParam : ParamBase {
   std::vector<float> grnn_rv_wi_maxs;
   float att_fc_w_max{0.0f};
-  lite::Tensor* grnn_fw_pool_out{};  // 1
-  lite::Tensor* grnn_rv_pool_out{};  // 2
-  lite::Tensor* att_pool_out{};      // 3
-  lite::Tensor* concat_3in1_out{};   // 4
-  lite::Tensor* emb_fw_out{};        // 5
+  lite::Tensor* grnn_fw_pool_out{};
+  lite::Tensor* grnn_rv_pool_out{};
+  lite::Tensor* att_pool_out{};
+  lite::Tensor* concat_3in1_out{};
+  lite::Tensor* emb_fw_out{};
+};
+struct XPUMmdnnBidEmbGrnnAttParam2 : ParamBase {
+  lite::Tensor* id0{};
+  lite::Tensor* id1{};
+  lite::Tensor* emb_tbl{};
+  lite::Tensor* grnn_fw_wh{};
+  lite::Tensor* grnn_fw_wi{};
+  lite::Tensor* grnn_rv_wh{};
+  lite::Tensor* grnn_rv_wi{};
+  lite::Tensor* att_fc_w{};
+  lite::Tensor* att_fc_b{};
+  std::vector<float> grnn_fw_wh_maxs;
+  std::vector<float> grnn_fw_wi_maxs;
+  std::vector<float> grnn_rv_wh_maxs;
+  std::vector<float> grnn_rv_wi_maxs;
+  float att_fc_w_max{0.0f};
+  lite::Tensor* emb0_out{};
+  lite::Tensor* grnn_fw_pool_out{};
+  lite::Tensor* grnn_rv_pool_out{};
+  lite::Tensor* att_pool_out{};
+  lite::Tensor* concat_3in1_out{};
+  lite::Tensor* emb_fw_out{};
 };
 struct XPUMmdnnBidEmbAttParam : ParamBase {
@@ -1643,8 +1668,8 @@ struct XPUMmdnnBidEmbAttParam : ParamBase {
   float att_fc_w_max{0.0f};
-  lite::Tensor* att_pool_out{};  // 1
-  lite::Tensor* emb_fw_out{};    // 2
+  lite::Tensor* att_pool_out{};
+  lite::Tensor* emb_fw_out{};
 };
 struct XPUMmdnnMatchConvTopkParam : ParamBase {
@@ -1656,6 +1681,7 @@ struct XPUMmdnnMatchConvTopkParam : ParamBase {
   float input_w_max{0.0f};
   float conv_w_max{0.0f};
   std::vector<int> topks;
+  int output_channel{0};
   int channel_num{0};
   int dim_t{0};
@@ -1664,7 +1690,7 @@ struct XPUMmdnnMergeAllParam : ParamBase {
 struct XPUMmdnnMergeAllParam : ParamBase {
   std::vector<lite::Tensor*> concat_7in1_x;
-  std::vector<lite::Tensor*> concat_2in1_x;
+  std::vector<lite::Tensor*> concat_topk_x;
   lite::Tensor* grnn_fw_wh{};
   lite::Tensor* grnn_fw_wi{};
   lite::Tensor* grnn_rv_wh{};
...