Unverified commit 83f26d6d authored by H HappyAngel, committed by GitHub

Merge pull request #131 from PaddlePaddle/develop

pull code 
......@@ -59,7 +59,14 @@ Welcome to Paddle-Lite's documentation!
demo_guides/baidu_xpu
demo_guides/rockchip_npu
demo_guides/mediatek_apu
.. toctree::
:maxdepth: 1
:caption: Training examples (preview)
:name: sec-train_demo_guides
demo_guides/cpp_train_demo
.. toctree::
:maxdepth: 1
:caption: API documentation
......
......@@ -49,4 +49,4 @@ $ ./opt \
## 5. Testing Tools
To help you better understand and use the Lite framework, we provide two tools for users with further needs: the [Debug tool](debug#debug) and the [Profile tool](debug#profiler). The Lite Model Debug Tool checks whether the values of corresponding variables in a model differ between the Lite framework and the PaddlePaddle framework during inference, so the problematic Op can be located quickly and issues become easier to reproduce and diagnose. The Profile Monitor Tool reports how much time each Op spends, automatically collecting statistics such as how many times every Op runs and its longest, shortest, and average execution time, which serves as a baseline for performance tuning. See the [related topics](debug) for more details.
To help you better understand and use the Lite framework, we provide two tools for users with further needs: the [Debug tool](debug) and the [Profile tool](debug). The Lite Model Debug Tool checks whether the values of corresponding variables in a model differ between the Lite framework and the PaddlePaddle framework during inference, so the problematic Op can be located quickly and issues become easier to reproduce and diagnose. The Profile Monitor Tool reports how much time each Op spends, automatically collecting statistics such as how many times every Op runs and its longest, shortest, and average execution time, which serves as a baseline for performance tuning. See the [related topics](debug) for more details.
......@@ -15,8 +15,8 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
target_link_libraries(paddle_full_api_shared framework_proto)
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry)
target_link_libraries(paddle_full_api_shared framework_proto op_registry)
if(LITE_WITH_X86)
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
......
......@@ -13,18 +13,30 @@
// limitations under the License.
#include "lite/api/cxx_api.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "lite/api/paddle_use_passes.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
std::vector<std::string> GetAllOps() {
const std::map<std::string, std::string> &op2path =
OpKernelInfoCollector::Global().GetOp2PathDict();
std::vector<std::string> res;
for (const auto &op : op2path) {
res.push_back(op.first);
}
return res;
}
void Predictor::SaveModel(const std::string &dir,
lite_api::LiteModelType model_type,
bool record_info) {
......
......@@ -36,6 +36,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
".tailored_kernels_source_list";
static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
std::vector<std::string> GetAllOps();
/*
* Predictor for inference, input a model, it will optimize and execute it.
*/
......
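As a quick orientation for the two hunks above (not part of the patch): the new GetAllOps() simply walks the op-to-source-path dictionary kept by OpKernelInfoCollector and returns the op names, so a caller linked against the full API could dump the linked operators along these lines. The helper DumpLinkedOps and the use of std::cout are illustrative assumptions, a minimal sketch only:

#include <iostream>
#include "lite/api/cxx_api.h"

// Hypothetical diagnostic helper, not part of the patch.
void DumpLinkedOps() {
  // GetAllOps() returns the op names recorded by OpKernelInfoCollector.
  for (const auto& op_name : paddle::lite::GetAllOps()) {
    std::cout << op_name << "\n";
  }
}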
......@@ -13,6 +13,7 @@ nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
set (
math_cuda
......@@ -25,6 +26,7 @@ set (
cudnn_pool
cuda_gemm
cuda_batched_gemm
cuda_sequence_padding
)
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/sequence_padding.h"
#include "lite/backends/cuda/math/utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
enum CopyType { kSeqToPad, kPadToSeq };
template <typename T, CopyType Type>
__global__ void SequencePadKernel(T* dst,
const T* src,
const T* pad_value,
bool is_constant_pad,
const size_t* seq_offsets,
const int seq_num,
const int pad_seq_len,
const int step_width) {
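// Thread mapping (see the launch configuration below): blockIdx.y selects the
// sequence, blockIdx.x * blockDim.y + threadIdx.y selects the time step inside
// it, and threadIdx.x strides across the step_width elements of that step.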
size_t seq_idx = blockIdx.y;
size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y;
size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width;
size_t pad_data_offset = (seq_idx * pad_seq_len + step_idx) * step_width;
T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset);
const T* src_data =
src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);
if (step_idx < seq_len) {
for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
dst_data[i] = src_data[i];
}
} else if (step_idx < pad_seq_len && Type == kSeqToPad) {
for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i];
}
}
}
template <typename T>
void SequencePadding(T* pad_data,
const T* seq_data,
const T* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream) {
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
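// Illustrative arithmetic (not from the patch): step_width = 100 gives
// block_dim_x = ((((100 + 7) >> 3) + 31) >> 5) << 5 = 32 and block_dim_y = 16;
// step_width = 1000 gives block_dim_x = 128 and block_dim_y = 4.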
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
SequencePadKernel<T, kSeqToPad><<<grid, threads, 0, *stream>>>(
pad_data,
seq_data,
pad_value_data,
is_constant_pad,
seq_offsets_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
template <typename T>
void SequenceUnpadding(T* seq_data,
const T* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream) {
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
SequencePadKernel<T, kPadToSeq><<<grid, threads, 0, *stream>>>(
seq_data,
pad_data,
nullptr,
false,
seq_offsets_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
template void SequencePadding(float* pad_data,
const float* seq_data,
const float* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template void SequenceUnpadding(float* seq_data,
const float* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
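A host-side sketch of driving the new float instantiation of SequencePadding (illustrative only; the helper PadTwoSequences, the buffer names, and the constant sizes are assumptions, not part of the patch). Two sequences of lengths 3 and 2 with step_width 4 are padded to pad_seq_len 3, so seq_offsets is {0, 3, 5}:

#include <cuda_runtime.h>
#include <vector>
#include "lite/backends/cuda/math/sequence_padding.h"

// Hypothetical driver; every pointer handed to SequencePadding must be a
// device pointer, since the kernel reads seq_offsets on the GPU.
void PadTwoSequences(cudaStream_t stream) {
  const int seq_num = 2, pad_seq_len = 3, step_width = 4;
  const std::vector<size_t> offsets = {0, 3, 5};  // sequence lengths 3 and 2
  const size_t seq_bytes = offsets.back() * step_width * sizeof(float);
  const size_t pad_bytes = seq_num * pad_seq_len * step_width * sizeof(float);

  float *d_seq = nullptr, *d_pad = nullptr, *d_pad_value = nullptr;
  size_t* d_offsets = nullptr;
  cudaMalloc(&d_seq, seq_bytes);
  cudaMalloc(&d_pad, pad_bytes);
  cudaMalloc(&d_pad_value, sizeof(float));  // one constant pad value
  cudaMalloc(&d_offsets, offsets.size() * sizeof(size_t));
  cudaMemcpyAsync(d_offsets, offsets.data(), offsets.size() * sizeof(size_t),
                  cudaMemcpyHostToDevice, stream);
  // ... copy the actual sequence data and pad value into d_seq / d_pad_value ...

  paddle::lite::cuda::math::SequencePadding(d_pad, d_seq, d_pad_value,
                                            /*is_constant_pad=*/true, d_offsets,
                                            seq_num, pad_seq_len, step_width,
                                            &stream);
}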
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <string>
#include <vector>
#include "lite/core/context.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
void SequenceUnpadding(T* seq_data,
const T* pad_data,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
template <typename T>
void SequencePadding(T* pad_data,
const T* seq_data,
const T* pad_value_data,
bool is_constant_pad,
const size_t* seq_offsets_data,
int seq_num,
int pad_seq_len,
int step_width,
cudaStream_t* stream);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -174,24 +174,9 @@ void Transpose<T>::transpose(T* dst,
TransposeCUDAImpl<T>(src_dims, axes, src, dst, &Y_dims_, &strides_, stream);
}
// template <typename T>
// void Transpose<T>::transpose(T* dst,
// const T* src,
// const std::vector<int>& src_dims,
// const std::vector<int>& axes,
// cudaStream_t* stream) {
// std::vector<int64_t> _src_dims(src_dims.size(), 0);
// std::transform(
// src_dims.begin(),
// src_dims.end(),
// _src_dims.begin(),
// [](int data) -> int64_t { return static_cast<int64_t>(data); });
// TransposeCUDAImpl<T>(_src_dims, axes, src, dst, &Y_dims_, &strides_,
// stream);
//}
template class Transpose<int8_t>;
template class Transpose<float>;
template class Transpose<half>;
} // namespace math
} // namespace cuda
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include "lite/backends/x86/cupti_lib_path.h"
#include "lite/backends/x86/port.h"
#include "lite/backends/x86/warpctc_lib_path.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
// DEFINE_string(cudnn_dir,
// "",
......@@ -178,7 +178,7 @@ auto error_msg =
#endif // !_WIN32
if (throw_on_error) {
CHECK(dso_handle != nullptr);
// PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno);
// CHECK(nullptr != dso_handle, error_msg, dlPath, errorno);
} else if (nullptr == dso_handle) {
// LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno);
}
......
......@@ -319,8 +319,8 @@ void BenchKernelSgd() {
const T lr = 0.1;
auto UnDuplicatedRandomVec = [](
int n, const int64_t lower, const int64_t upper) -> std::vector<int64_t> {
PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
PADDLE_ENFORCE_GT(n, 0);
CHECK_LE(static_cast<size_t>(upper - lower), n - 1);
CHECK_GT(n, 0);
std::vector<int64_t> all, out;
for (int i = 0; i < n; ++i) {
all.push_back(i);
......
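This is the first of many hunks in the patch that replace the printf-style PADDLE_ENFORCE* assertions with the glog-style CHECK* macros from lite/utils/cp_logging.h. The idiom, sketched on a hypothetical value n (the streamed message mirrors the later hunks; it is illustrative, not a new code path):

// Before: condition plus printf-style format arguments.
//   PADDLE_ENFORCE_GT(n, 0, "n must be positive, got %d", n);
// After: condition only; extra context is streamed and built only on failure.
CHECK_GT(n, 0) << "n must be positive, got " << n;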
......@@ -129,11 +129,11 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const emb_seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.table_height, 0);
PADDLE_ENFORCE_GT(attr.table_width, 0);
PADDLE_ENFORCE_GT(attr.index_height, 0);
PADDLE_ENFORCE_GT(attr.index_width, 0);
PADDLE_ENFORCE_GT(attr.out_width, 0);
CHECK_GT(attr.table_height, 0);
CHECK_GT(attr.table_width, 0);
CHECK_GT(attr.index_height, 0);
CHECK_GT(attr.index_width, 0);
CHECK_GT(attr.out_width, 0);
return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -17,7 +17,7 @@
#include <string>
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......
......@@ -27,7 +27,7 @@ void MatMulJitCode::genCode() {
preCode();
int block, rest;
const auto groups = packed_groups(n_, k_, &block, &rest);
PADDLE_ENFORCE_GT(groups.front(), 0);
CHECK_GT(groups.front(), 0);
const int block_len = sizeof(float) * block;
const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
......@@ -116,9 +116,9 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const matmul_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.m, 0);
PADDLE_ENFORCE_GT(attr.n, 0);
PADDLE_ENFORCE_GT(attr.k, 0);
CHECK_GT(attr.m, 0);
CHECK_GT(attr.n, 0);
CHECK_GT(attr.k, 0);
return make_unique<MatMulJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -19,7 +19,7 @@
#include <vector>
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......@@ -32,7 +32,7 @@ class MatMulJitCode : public JitCode {
size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
CHECK_EQ(m_, 1) << "Only support m==1 yet";
this->genCode();
}
......
......@@ -69,8 +69,8 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.w, 0);
PADDLE_ENFORCE_GT(attr.h, 0);
CHECK_GT(attr.w, 0);
CHECK_GT(attr.h, 0);
return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -17,7 +17,7 @@
#include <string>
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......@@ -125,8 +125,8 @@ class SeqPoolJitCode : public JitCode {
vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
reg_idx++;
}
PADDLE_ENFORCE_EQ(
reg_idx, rest_used_num_regs, "All heights should use same regs");
CHECK_EQ(reg_idx, rest_used_num_regs)
<< "All heights should use same regs";
for (int i = 0; i < reg_idx; ++i) {
vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
}
......
......@@ -17,7 +17,7 @@
#include <memory>
#include <vector>
#include "lite/backends/x86/jit/registry.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -113,9 +113,9 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
}
std::unique_ptr<GenBase> CreateJitCode(
const sgd_attr_t& attr) const override {
PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
CHECK_EQ(attr.param_width, attr.grad_width);
CHECK_LE(attr.selected_rows_size, attr.grad_height);
CHECK_GE(attr.selected_rows_size, 0);
return make_unique<SgdJitCode>(attr, CodeSize(attr));
}
};
......
......@@ -16,7 +16,7 @@
#include <memory>
#include <vector>
#include "lite/backends/x86/jit/registry.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -76,7 +76,7 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
}
std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
PADDLE_ENFORCE_GT(w, 0);
CHECK_GT(w, 0);
return make_unique<VBroadcastJitCode>(w, CodeSize(w));
}
};
......
......@@ -21,8 +21,8 @@
// posix_memalign
#include "lite/backends/x86/cpu_info.h"
#include "lite/backends/x86/jit/macro.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
#ifndef _WIN32
#define posix_memalign_free free
......@@ -62,12 +62,10 @@ void* GenBase::operator new(size_t size) {
#ifdef _WIN32
ptr = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size),
0,
"GenBase Alloc %ld error!",
size);
CHECK_EQ(posix_memalign(&ptr, alignment, size), 0) << "GenBase Alloc " << size
<< " error!";
#endif
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size;
return ptr;
}
......
......@@ -14,9 +14,10 @@
#include "lite/backends/x86/jit/helper.h"
#include <algorithm> // tolower
#include <cstring>
#include <numeric>
#include <string>
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -104,12 +105,12 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
int block, rest;
const auto groups = packed_groups(n, k, &block, &rest);
std::for_each(groups.begin(), groups.end(), [&](int i) {
PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0.");
CHECK_GT(i, 0) << "each element of groups should be larger than 0.";
});
int sum = std::accumulate(groups.begin(), groups.end(), 0);
std::memset(dst, 0, k * sum * block * sizeof(float));
PADDLE_ENFORCE_GE(
sum * block, n, "The packed n should be equal to or larger than n");
CHECK_GE(sum * block, n)
<< "The packed n should be equal to or larger than n";
const int block_len = sizeof(float) * block;
int n_offset = 0;
......
......@@ -23,7 +23,7 @@
#include "lite/backends/x86/jit/kernel_base.h"
#include "lite/backends/x86/jit/kernel_key.h"
#include "lite/backends/x86/jit/kernel_pool.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -78,8 +78,8 @@ inline const Kernel* GetReferKernel() {
auto& ref_pool = ReferKernelPool::Instance().AllKernels();
KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace());
auto ref_iter = ref_pool.find(kkey);
PADDLE_ENFORCE(ref_iter != ref_pool.end(),
"Every Kernel should have reference function.");
CHECK(ref_iter != ref_pool.end())
<< "Every Kernel should have reference function.";
auto& ref_impls = ref_iter->second;
for (auto& impl : ref_impls) {
auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
......@@ -94,7 +94,7 @@ template <typename KernelTuple>
inline typename KernelTuple::func_type GetReferFunc() {
auto ker = GetReferKernel<KernelTuple>();
auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
PADDLE_ENFORCE(p, "The Refer kernel should exist");
CHECK(p) << "The Refer kernel should exist";
return p->GetFunc();
}
......@@ -125,7 +125,7 @@ std::vector<const Kernel*> GetAllCandidateKernels(
// The last implementation should be reference function on CPUPlace.
auto ref = GetReferKernel<KernelTuple>();
PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
CHECK(ref != nullptr) << "Refer Kernel can not be empty.";
res.emplace_back(ref);
return res;
}
......@@ -140,11 +140,11 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
std::string name = k->ImplType();
if (name == "JitCode") {
auto i = dynamic_cast<const GenBase*>(k);
PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
CHECK(i) << "jitcode kernel cast can not fail.";
res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
} else {
auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
PADDLE_ENFORCE(i, "kernel cast can not fail.");
CHECK(i) << "kernel cast can not fail.";
res.emplace_back(std::make_pair(name, i->GetFunc()));
}
}
......@@ -166,7 +166,7 @@ template <typename KernelTuple, typename PlaceType = lite::fluid::CPUPlace>
typename KernelTuple::func_type GetDefaultBestFunc(
const typename KernelTuple::attr_type& attr) {
auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
PADDLE_ENFORCE_GE(funcs.size(), 1UL);
CHECK_GE(funcs.size(), 1UL);
// Here could do some runtime benchmark of this attr and return the best one.
// But yet just get the first one as the default best one,
// which is searched in order and tuned by offline.
......
......@@ -14,7 +14,7 @@
#include "lite/backends/x86/jit/kernel_key.h"
#include <xxhash.h> // XXH64: 13.8 GB/s
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -18,7 +18,7 @@
#include <type_traits>
#include <vector>
#include "lite/backends/x86/jit/kernel_base.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -104,11 +104,11 @@ void EmbSeqPool(const T* table,
const int64_t* idx,
T* out,
const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
CHECK_EQ(attr->table_width * attr->index_width, attr->out_width);
auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(
idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i);
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i]
<< " i: " << i;
CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i;
};
for (int64_t w = 0; w != attr->index_width; ++w) {
......@@ -175,22 +175,22 @@ void Sgd(const T* lr,
const int64_t* rows,
T* out,
const sgd_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
CHECK_EQ(attr->param_width, attr->grad_width);
CHECK_LE(attr->selected_rows_size, attr->grad_height);
T scalar = -lr[0];
int width = attr->grad_width;
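// scalar is the negated learning rate, so the in-place branch below performs
// out += (-lr) * grad via VAXPY, while the copy branch computes
// out = param + (-lr) * grad via VScal followed by VAdd.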
if (out == param) {
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
CHECK_LT(h_idx, attr->param_height);
CHECK_GE(h_idx, 0);
VAXPY(scalar, grad + i * width, out + h_idx * width, width);
}
} else {
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
CHECK_LT(h_idx, attr->param_height);
CHECK_GE(h_idx, 0);
VScal(&scalar, grad + i * width, out + h_idx * width, width);
VAdd(param + h_idx * width,
out + h_idx * width,
......
......@@ -22,7 +22,6 @@
#include "lite/backends/x86/jit/kernel_base.h"
#include "lite/backends/x86/jit/macro.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
namespace lite {
......@@ -480,12 +479,12 @@ void EmbSeqPool(const T* table,
const int64_t* idx,
T* out,
const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
CHECK_EQ(attr->table_width * attr->index_width, attr->out_width);
auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(
idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i);
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i]
<< " i: " << i;
CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i;
};
for (int64_t w = 0; w != attr->index_width; ++w) {
......@@ -527,12 +526,12 @@ void Sgd(const T* lr,
const int64_t* rows,
T* out,
const lite::jit::sgd_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
CHECK_EQ(attr->param_width, attr->grad_width);
CHECK_LE(attr->selected_rows_size, attr->grad_height);
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
CHECK_LT(h_idx, attr->param_height);
CHECK_GE(h_idx, 0);
for (int64_t j = 0; j < attr->grad_width; ++j) {
out[h_idx * attr->grad_width + j] =
param[h_idx * attr->grad_width + j] -
......
......@@ -910,8 +910,8 @@ void TestKernelSgd() {
const T lr = 0.1;
auto UnDuplicatedRandomVec = [](
int n, const int64_t lower, const int64_t upper) -> std::vector<int64_t> {
PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
PADDLE_ENFORCE_GT(n, 0);
CHECK_LE(static_cast<size_t>(upper - lower), n - 1);
CHECK_GT(n, 0);
std::vector<int64_t> all, out;
for (int i = 0; i < n; ++i) {
all.push_back(i);
......
......@@ -116,7 +116,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end());
// if (!lite::fluid::CheckLoD(lod)) {
// //PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
// //LOG(FATAL)<<"lod %s is not right", framework::LoDToString(lod));
//}
selected_ids->set_lod(lod);
selected_scores->set_lod(lod);
......
......@@ -23,7 +23,7 @@ namespace math {
MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
PADDLE_ENFORCE_GT(tensor_dim.size(), 1u);
CHECK_GT(tensor_dim.size(), 1u);
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
......
......@@ -287,22 +287,22 @@ struct CBlas<double> {
template <>
struct CBlas<lite::fluid::float16> {
static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
static void GEMM(...) { LOG(FATAL) << "float16 GEMM not supported on CPU"; }
static void SMM_GEMM(...) {
PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
LOG(FATAL) << "float16 SMM_GEMM not supported on CPU";
}
static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
static void VMUL(...) { LOG(FATAL) << "float16 VMUL not supported on CPU"; }
static void VEXP(...) { LOG(FATAL) << "float16 VEXP not supported on CPU"; }
static void VSQUARE(...) {
PADDLE_THROW("float16 VSQUARE not supported on CPU");
LOG(FATAL) << "float16 VSQUARE not supported on CPU";
}
static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); };
static void VPOW(...) { LOG(FATAL) << "float16 VPOW not supported on CPU"; }
static void DOT(...) { LOG(FATAL) << "float16 DOT not supported on CPU"; };
static void SCAL(...) { LOG(FATAL) << "float16 SCAL not supported on CPU"; };
static void ASUM(...) { LOG(FATAL) << "float16 ASUM not supported on CPU"; };
#ifdef PADDLE_WITH_MKLML
static void GEMM_BATCH(...) {
PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
LOG(FATAL) << "float16 GEMM_BATCH not supported on CPU";
}
#endif
};
......@@ -461,11 +461,11 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
auto dim_a = mat_a.dims();
auto dim_b = mat_b.dims();
auto dim_out = mat_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
// PADDLE_ENFORCE(
// mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target(),
// "The targets of matrices must be same");
CHECK(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2)
<< "The input and output of matmul be matrix";
// CHECK(
// mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target())
// << "The targets of matrices must be same";
int M = dim_out[0];
int N = dim_out[1];
......@@ -746,7 +746,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
T alpha,
lite::Tensor *mat_out,
T beta) const {
PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
CHECK_EQ(dim_a.width_, dim_b.height_);
CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
......@@ -761,8 +761,8 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
beta,
mat_out->template mutable_data<T>());
} else {
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
CHECK(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
dim_b.batch_size_ == 0);
this->template BatchedGEMM<T>(
transA,
transB,
......
......@@ -146,7 +146,7 @@ class ContextProjectFunctor {
}
}
if (padding_trainable) {
PADDLE_ENFORCE(padding_data != nullptr);
CHECK(padding_data != nullptr);
for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
if (lod_level_0[i] == lod_level_0[i + 1]) continue;
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <functional>
#include <string>
#include "lite/backends/x86/cpu_info.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
#ifdef PADDLE_WITH_MKLML
#include "lite/backends/x86/mklml.h"
......@@ -652,7 +652,7 @@ class VecActivations {
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
PADDLE_THROW("Not support type: %s", type);
LOG(FATAL) << "Not support type: " << type;
}
};
......
......@@ -57,7 +57,7 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> {
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < num_remain; j++) {
int lbl = label_data[i * num_remain + j];
PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index);
CHECK((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index);
int index = i * num_classes + lbl * num_remain + j;
int loss_idx = i * num_remain + j;
loss_data[loss_idx] =
......
......@@ -27,7 +27,7 @@ namespace math {
template <typename T>
struct TolerableValue {
HOSTDEVICE T operator()(const T& x) const {
PADDLE_ENFORCE(static_cast<bool>(std::is_floating_point<T>::value));
CHECK(static_cast<bool>(std::is_floating_point<T>::value));
const T kApproInf = 1e20;
if (x == INFINITY) return kApproInf;
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <math.h>
#include <string>
#include "lite/backends/x86/cpu_info.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -46,8 +46,6 @@ inline ActivationType GetActivationType(const std::string &type) {
return ActivationType::kIdentity;
}
LOG(ERROR) << "Not support type " << type;
// PADDLE_ENFORCE(false, "Not support type %s", type);
// PADDLE_THROW("Not support type %s.", type);
return ActivationType();
}
......
......@@ -13,7 +13,7 @@ limitations under the License. */
#include "lite/backends/x86/math/detail/activation_functions.h"
#include "lite/core/context.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "lite/backends/x86/math/im2col.h"
#include <vector>
#include "lite/backends/x86/math/im2col_cfo_cpu.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -38,8 +38,8 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kCFO,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col->dims().size() == 5);
CHECK_EQ(im.dims().size(), 3);
CHECK_EQ(col->dims().size(), 5);
if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
dilation[1] == 1) {
......@@ -72,8 +72,8 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
CHECK_EQ(im->dims().size(), 3);
CHECK_EQ(col.dims().size(), 5);
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
int im_width = im->dims()[2];
......@@ -82,20 +82,20 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
int col_height = col.dims()[3];
int col_width = col.dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
((dilation[0] * (filter_height - 1) + 1))) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
((dilation[1] * (filter_width - 1) + 1))) /
stride[1] +
1,
col_width,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
CHECK_EQ((im_height + padding[0] + padding[2] -
((dilation[0] * (filter_height - 1) + 1))) /
stride[0] +
1,
col_height)
<< "Output_height and padding(padding_up, padding_down) are "
"inconsistent.";
CHECK_EQ((im_width + padding[1] + padding[3] -
((dilation[1] * (filter_width - 1) + 1))) /
stride[1] +
1,
col_width)
<< "Output_height and padding(padding_up, padding_down) are "
"inconsistent.";
int channels_col = im_channels * filter_height * filter_width;
......@@ -150,8 +150,8 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col->dims().size() == 5);
CHECK_EQ(im.dims().size(), 3);
CHECK_EQ(col->dims().size(), 5);
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
......@@ -214,8 +214,8 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
const std::vector<int>& stride,
const std::vector<int>& padding,
lite::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
CHECK_EQ(im->dims().size(), 3);
CHECK_EQ(col.dims().size(), 5);
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
int im_width = im->dims()[2];
......@@ -224,16 +224,16 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
int col_height = col.dims()[0];
int col_width = col.dims()[1];
PADDLE_ENFORCE_EQ(
CHECK_EQ(
(im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
col_height)
<< "Output_height and padding(padding_up, padding_down) are "
"inconsistent.";
CHECK_EQ(
(im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
col_width)
<< "col_width and padding(padding_left, padding_right) are "
"inconsistent.";
T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>();
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "lite/backends/x86/math/detail/activation_functions.h"
#include "lite/core/context.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -121,8 +121,8 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
lite::Tensor* output) {
const auto& in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
CHECK_EQ(vector.numel(), size);
CHECK_EQ(output->dims(), in_dims);
const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>();
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "lite/fluid/float16.h"
#include "lite/utils/paddle_enforce.h"
//#include "lite/tensor_util.h"
#include "lite/utils/cp_logging.h"
// #include "lite/tensor_util.h"
namespace paddle {
namespace lite {
......
......@@ -59,7 +59,7 @@ void ColwiseSum<Target, T>::operator()(const lite::Context<Target>& context,
lite::TensorLite* out) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(), size);
CHECK_EQ(out->numel(), size);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(*out);
......@@ -81,7 +81,7 @@ class ColwiseSum<lite::TargetType::kX86, T> {
auto& in_dims = input.dims();
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
CHECK_EQ(out->numel(), size);
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
......@@ -103,8 +103,8 @@ void RowwiseMean<Target, T>::operator()(const lite::Context<Target>& context,
const lite::TensorLite& input,
lite::TensorLite* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
CHECK_EQ(in_dims.size(), 2U);
CHECK_EQ(out->numel(), in_dims[0]);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(*out);
......@@ -124,10 +124,10 @@ class RowwiseMean<lite::TargetType::kX86, T> {
const lite::TensorLite& input,
lite::TensorLite* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
CHECK_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
CHECK_EQ(out->numel(), height);
auto inv_size = 1.0 / size;
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
......@@ -147,8 +147,8 @@ void RowwiseSum<Target, T>::operator()(const lite::Context<Target>& context,
const lite::TensorLite& input,
lite::TensorLite* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
CHECK_EQ(in_dims.size(), 2U);
CHECK_EQ(out->numel(), in_dims[0]);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(*out);
......@@ -168,10 +168,10 @@ class RowwiseSum<lite::TargetType::kX86, T> {
const lite::TensorLite& input,
lite::TensorLite* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
CHECK_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
CHECK_EQ(out->numel(), height);
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
......
......@@ -273,7 +273,7 @@ TEST(math_funciton, set_constant) {
auto* ctx = new paddle::platform::CPUDeviceContext();
paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
CHECK_EQ(10, t.data<int>()[i]);
}
delete ctx;
}
......
......@@ -32,7 +32,7 @@ namespace math {
class Sampler {
public:
explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
// PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0.");
// CHECK_GT(range, 0, "Range should be greater than 0.");
if (seed == 0) {
std::random_device r;
seed_ = r();
......
......@@ -31,7 +31,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
const fluid::SelectedRows& input2,
fluid::SelectedRows* output) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height());
CHECK_EQ(in1_height, input2.height());
output->set_height(in1_height);
auto& in1_rows = input1.rows();
......@@ -49,8 +49,8 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
CHECK_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
CHECK_EQ(in1_row_numel, out_value->numel() / out_rows.size());
auto* out_data = out_value->template mutable_data<T>();
auto* in1_data = in1_value.data<T>();
......@@ -73,15 +73,15 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
CHECK_EQ(in1_height, in2_dims[0]);
CHECK_EQ(in1_height, out_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
CHECK_EQ(in1_row_numel, input2.numel() / in1_height);
CHECK_EQ(in1_row_numel, output->numel() / in1_height);
SetConstant<lite::TargetType::kX86, T> functor;
functor(context, output, 0.0);
......@@ -113,7 +113,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
const int64_t input2_offset,
fluid::SelectedRows* input2) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
CHECK_EQ(in1_height, input2->height());
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
......@@ -149,7 +149,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
auto& in_rows = (*iter)->rows();
size += in_rows.end() - in_rows.begin();
auto in1_height = (*iter)->height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
CHECK_EQ(in1_height, input2->height());
}
// concat rows
std::vector<int64_t> in2_rows;
......@@ -185,13 +185,13 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
CHECK_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
CHECK_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->template mutable_data<T>();
......@@ -291,12 +291,11 @@ struct MergeAdd<lite::TargetType::kX86, T> {
if (input->rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(input_width,
input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
PADDLE_ENFORCE_EQ(
input_height, input->height(), "all input should have same height");
CHECK_EQ(input_width, input->value().dims()[1])
<< "all input should have same "
"dimension except for the first one";
CHECK_EQ(input_height, input->height())
<< "all input should have same height";
row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
......@@ -376,13 +375,13 @@ struct UpdateToTensor<lite::TargetType::kX86, T> {
lite::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
CHECK_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
CHECK_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->template data<T>();
......
......@@ -30,12 +30,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
const uint64_t* index = index_lod.data();
const auto& src_dims = src.dims();
const auto& dst_dims = dst->dims();
PADDLE_ENFORCE_EQ(
src_dims.size(), 2UL, "The src must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(
dst_dims.size(), 2UL, "The dst must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(
src_dims[1], dst_dims[1], "The width of src and dst must be same.");
CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2.";
CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2.";
CHECK_EQ(src_dims[1], dst_dims[1])
<< "The width of src and dst must be same.";
auto height = dst_dims[0];
auto width = dst_dims[1];
auto* src_data = src.data<T>();
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/eigen.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -66,21 +66,18 @@ class LoDTensor2BatchFunctor {
bool is_reverse = false) const {
if (!is_cal_batch_lod) {
auto lods = batch->lod();
PADDLE_ENFORCE_GT(lods.size(),
2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
lods[1].size(),
static_cast<size_t>(lod_tensor.dims()[0]),
"The LoD information should be consistent with the dims.");
CHECK_GT(lods.size(), 2UL)
<< "The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.";
CHECK_EQ(lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]))
<< "The LoD information should be consistent with the dims.";
CopyMatrixRowsFunctor<Target, T> to_batch;
to_batch(context, lod_tensor, lods[1], batch, true);
return;
}
auto lods = lod_tensor.lod();
PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now.";
const auto& lod = lods[0];
......@@ -165,14 +162,11 @@ class Batch2LoDTensorFunctor {
const lite::Tensor& batch,
lite::Tensor* lod_tensor) const {
auto in_lod = batch.lod();
PADDLE_ENFORCE_GT(in_lod.size(),
2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
in_lod[1].size(),
static_cast<size_t>(lod_tensor->dims()[0]),
"The LoD information should be consistent with the dims.");
CHECK_GT(in_lod.size(), 2UL)
<< "The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.";
CHECK_EQ(in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]))
<< "The LoD information should be consistent with the dims.";
CopyMatrixRowsFunctor<Target, T> to_seq;
to_seq(context, batch, in_lod[1], lod_tensor, false);
}
......
......@@ -37,10 +37,9 @@ void CopyValidData(lite::Tensor* dst_tensor,
layout == kBatchLengthWidth ? step_width : seq_num * step_width;
for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) {
int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
PADDLE_ENFORCE_GE(
pad_seq_len,
valid_seq_len,
"The padded sequence length can not be less than its original length.");
CHECK_GE(pad_seq_len, valid_seq_len) << "The padded sequence length can "
"not be less than its original "
"length.";
int seq_data_offset = seq_offsets[seq_idx] * step_width;
int pad_data_offset = layout == kBatchLengthWidth
? seq_idx * pad_seq_len * step_width
......@@ -108,9 +107,9 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> {
pad_seq_len,
step_width,
layout);
PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
"The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width'.");
CHECK(pad_value.numel() == 1 || pad_value.numel() == step_width)
<< "The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width'.";
// fill padding value
T* pad_data = pad_tensor->template mutable_data<T>();
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/lod.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -46,15 +46,14 @@ inline static void CheckDims(const lite::DDim& seq_tensor_dims,
int64_t padded_seq_len,
int64_t step_width,
const PadLayout& layout) {
PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]),
seq_offset.back(),
"Value of 1st dimension of the sequence tensor should be "
"equal to sum of lengths of all sequences.");
CHECK_EQ(static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back())
<< "Value of 1st dimension of the sequence tensor should be "
"equal to sum of lengths of all sequences.";
PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
seq_tensor_dims.size() == pad_tensor_dims.size(),
"pad_tensor's rank should be 1 greater than seq_tensor's "
"rank, or be equal with it.");
CHECK(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
seq_tensor_dims.size() == pad_tensor_dims.size())
<< "pad_tensor's rank should be 1 greater than seq_tensor's "
"rank, or be equal with it.";
}
/*
......
......@@ -46,12 +46,12 @@ class MaxSeqPoolFunctor {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1u);
CHECK_GT(in_dims.size(), 1u);
CHECK_GT(out_dims.size(), 1u);
for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
CHECK_EQ(in_dims[i], out_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, out_dims);
CHECK_EQ(idx_dims, out_dims);
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
......@@ -95,10 +95,10 @@ class MaxSeqPoolFunctor<T, true> {
lite::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1u);
CHECK_GT(in_dims.size(), 1u);
CHECK_GT(out_dims.size(), 1u);
for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
CHECK_EQ(in_dims[i], out_dims[i]);
}
auto starts = input.lod()[0];
......@@ -136,12 +136,12 @@ class MaxSeqPoolGradFunctor {
auto og_dims = out_grad.dims();
auto ig_dims = in_grad->dims();
auto idx_dims = index.dims();
PADDLE_ENFORCE_GT(og_dims.size(), 1);
PADDLE_ENFORCE_GT(ig_dims.size(), 1);
CHECK_GT(og_dims.size(), 1);
CHECK_GT(ig_dims.size(), 1);
for (size_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
CHECK_EQ(og_dims[i], ig_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, og_dims);
CHECK_EQ(idx_dims, og_dims);
const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>();
......@@ -236,7 +236,7 @@ class SumSeqPoolGradFunctor {
auto lod = in_grad->lod()[0];
int64_t out_w = out_grad.numel() / out_grad.dims()[0];
int64_t in_w = in_grad->numel() / in_grad->dims()[0];
PADDLE_ENFORCE(in_w == out_w);
CHECK(in_w == out_w);
const T* out_g_data = out_grad.data<T>();
T* in_g_data = in_grad->template mutable_data<T>(TARGET(kX86));
auto blas = math::GetBlas<TARGET(kX86), T>(context);
......@@ -330,7 +330,7 @@ class SequencePoolFunctor<TARGET(kX86), T> {
out_e.device(eigen_device) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h));
} else {
PADDLE_THROW("unsupported pooling pooltype");
LOG(FATAL) << "unsupported pooling pooltype";
}
}
}
......@@ -389,7 +389,7 @@ class SequencePoolGradFunctor<TARGET(kX86), T> {
} else if (pooltype == "FIRST") {
in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v;
} else {
PADDLE_THROW("unsupported pooling pooltype");
LOG(FATAL) << "unsupported pooling pooltype";
}
}
}
......
......@@ -50,9 +50,9 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
in_grad.mutable_data<T>(in_dims, context->GetPlace());
// check tensor contruction result
PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size());
CHECK_EQ(in_grad.dims().size(), out_grad.dims().size());
for (int64_t i = 1; i < out_grad.dims().size(); ++i) {
PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]);
CHECK_EQ(in_grad.dims()[i], out_grad.dims()[i]);
}
// call functor
......
......@@ -55,7 +55,7 @@ void Tree2ColUtil::construct_tree(const lite::Tensor &EdgeSet,
std::vector<std::vector<int>> *tr,
size_t *node_count) {
auto edge_set_dims = EdgeSet.dims();
PADDLE_ENFORCE_EQ(edge_set_dims[1], 2);
CHECK_EQ(edge_set_dims[1], 2);
int64_t edge_count = EdgeSet.numel();
const int *edge_data = EdgeSet.data<int>();
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/x86/math/unpooling.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -41,7 +41,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> {
for (int c = 0; c < output_channels; ++c) {
for (int i = 0; i < input_feasize; ++i) {
int index = indices_data[i];
PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
CHECK(index < output_feasize) << "err index in unpooling!";
output_data[index] = input_data[i];
}
input_data += input_feasize;
......@@ -77,7 +77,7 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> {
for (int c = 0; c < output_channels; ++c) {
for (int i = 0; i < input_feasize; ++i) {
int index = indices_data[i];
PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
CHECK(index < output_feasize) << "err index in unpooling!";
input_grad_data[i] = output_grad_data[index];
}
input_grad_data += input_feasize;
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "lite/backends/x86/math/vol2col.h"
#include <vector>
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -36,8 +36,8 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings,
lite::Tensor* col) const {
PADDLE_ENFORCE(vol.dims().size() == 4);
PADDLE_ENFORCE(col->dims().size() == 7);
CHECK_EQ(vol.dims().size(), 4);
CHECK_EQ(col->dims().size(), 7);
int input_channels = vol.dims()[0];
int input_depth = vol.dims()[1];
......@@ -52,27 +52,27 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> {
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
CHECK_EQ((input_depth + 2 * paddings[0] -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth)
<< "input_depth and output_depth are "
"mismatching.";
CHECK_EQ((input_height + 2 * paddings[1] -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height)
<< "input_height and output_height are "
"mismatching.";
CHECK_EQ((input_width + 2 * paddings[2] -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width)
<< "input_width and output_width are "
"mismatching.";
const T* vol_data = vol.data<T>();
T* col_data = col->template mutable_data<T>();
......@@ -122,8 +122,8 @@ class Col2VolFunctor<lite::TargetType::kX86, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings,
lite::Tensor* vol) const {
PADDLE_ENFORCE(vol->dims().size() == 4);
PADDLE_ENFORCE(col.dims().size() == 7);
CHECK_EQ(vol->dims().size(), 4);
CHECK_EQ(col.dims().size(), 7);
int input_channels = vol->dims()[0];
int input_depth = vol->dims()[1];
......@@ -138,27 +138,27 @@ class Col2VolFunctor<lite::TargetType::kX86, T> {
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
CHECK_EQ((input_depth + 2 * paddings[0] -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth)
<< "input_depth and output_depth are "
"mismatching.";
CHECK_EQ((input_height + 2 * paddings[1] -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height)
<< "input_height and output_height are "
"mismatching.";
CHECK_EQ((input_width + 2 * paddings[2] -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width)
<< "input_width and output_width are "
"mismatching.";
T* vol_data = vol->template mutable_data<T>();
const T* col_data = col.data<T>();
......
......@@ -157,12 +157,11 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
///////////////////////////////////////////////////////////////////////////////
if (enable_int8) {
std::string weight_name = conv_op_desc->Input("Filter").front();
PADDLE_ENFORCE(conv_op_desc->HasInputScale(weight_name),
"INT8 mode: Conv should has weight_scale attr");
CHECK(conv_op_desc->HasInputScale(weight_name))
<< "INT8 mode: Conv should has weight_scale attr";
auto conv_weight_d = conv_weight_t->mutable_data<int8_t>();
// compute new conv_weight for int8
auto weight_scale =
conv_op_desc->GetInputScale<std::vector<float>>(weight_name);
auto weight_scale = conv_op_desc->GetInputScale(weight_name);
if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3];
......
......@@ -18,7 +18,7 @@
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......
......@@ -40,5 +40,4 @@ REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU), TARGET(kX86)})
.ExcludeTargets({TARGET(kBM)})
.ExcludeTargets({TARGET(kCUDA)})
.BindKernel("fc");
......@@ -74,22 +74,15 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) {
auto op_desc = *matched.at("mul")->stmt()->op_info();
// Get the input scale from mul
float x_scale{};
std::vector<float> x_scale_vct;
std::vector<float> y_scale_vct;
auto y_var_node = matched.at("W")->arg();
auto input_x_name = op_desc.Input("X").front();
auto input_y_name = op_desc.Input("Y").front();
bool is_quantized_op = op_desc.HasInputScale(input_x_name) &&
op_desc.HasInputScale(input_y_name);
if (is_quantized_op) {
x_scale = op_desc.GetInputScale<float>(input_x_name);
if (y_var_node->is_weight) { // the scale of y is a vector
y_scale_vct =
op_desc.GetInputScale<std::vector<float>>(op_desc.Input("Y").front());
} else {
y_scale_vct.push_back( // the scale of y is scalar
op_desc.GetInputScale<float>(op_desc.Input("Y").front()));
}
x_scale_vct = op_desc.GetInputScale(input_x_name);
y_scale_vct = op_desc.GetInputScale(op_desc.Input("Y").front());
}
op_desc.mutable_inputs()->clear();
......@@ -108,12 +101,8 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) {
// Set the input scale into fc
if (is_quantized_op) {
op_desc.SetInputScale(matched.at("x")->arg()->name, x_scale);
if (y_var_node->is_weight) {
op_desc.SetInputScale(matched.at("W")->arg()->name, y_scale_vct);
} else {
op_desc.SetInputScale(matched.at("W")->arg()->name, y_scale_vct.front());
}
op_desc.SetInputScale(matched.at("x")->arg()->name, x_scale_vct);
op_desc.SetInputScale(matched.at("W")->arg()->name, y_scale_vct);
}
return op_desc;
......
......@@ -64,7 +64,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph,
for (auto* quantized_node : outlinks) {
// save input scale in quantized op by input argname + index
auto op_desc = *quantized_node->stmt()->mutable_op_info();
op_desc.SetInputScale(out_act_name, scale_value);
op_desc.SetInputScale(out_act_name, {scale_value});
op_desc.SetAttr<int>("bit_length", bit_length);
op_desc.UpdateAllInputs(out_act_name, in_act_name);
quantized_node->stmt()->ResetOp(op_desc, graph->valid_places());
......@@ -150,7 +150,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
auto quantized_weight_t =
scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
std::vector<float> weight_scale;
int weight_scale_size;
int weight_scale_size = 0;
if (quantized_op_type_ == "conv2d" ||
quantized_op_type_ == "depthwise_conv2d") {
op_desc.SetInput("Input", {quantized_op_input->arg()->name});
......@@ -348,7 +348,7 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
// Save quantization info in op_info attr
auto op_info = *quantized_node->stmt()->op_info();
op_info.SetAttr<int>("bit_length", bit_length);
op_info.SetInputScale(output_act_name, scale_value);
op_info.SetInputScale(output_act_name, {scale_value});
op_info.UpdateAllInputs(output_act_name, input_act_name);
quantized_node->stmt()->ResetOp(op_info, graph->valid_places());
......
......@@ -38,12 +38,12 @@ void QuantizedOpAttributesInferencePass::Apply(
auto op_info = inst.op_info();
auto op_type = op_info->Type();
// Check only if all of the inputs of the op have scale value
bool has_input_scale = true;
// Check if any of the inputs of the op has a scale value
bool has_input_scale = false;
for (auto in_var_node : op_node->inlinks) {
CHECK(in_var_node->IsArg());
auto in_var_node_name = in_var_node->arg()->name;
has_input_scale &= op_info->HasInputScale(in_var_node_name);
has_input_scale |= op_info->HasInputScale(in_var_node_name);
}
if (!has_input_scale) continue;
......@@ -52,31 +52,31 @@ void QuantizedOpAttributesInferencePass::Apply(
bool is_quantized = true;
for (auto out_var_node : op_node->outlinks) {
CHECK(out_var_node->IsArg());
bool found = false;
float output_scale;
std::vector<float> output_scale;
bool has_output_scale = false;
auto out_var_node_name = out_var_node->arg()->name;
for (auto out_op_node : out_var_node->outlinks) {
CHECK(out_op_node->IsStmt());
auto& out_inst = out_op_node->AsStmt();
auto out_op_info = out_inst.op_info();
if (!out_op_info->HasInputScale(out_var_node_name)) continue;
auto input_scale = out_op_info->GetInputScale<float>(out_var_node_name);
if (!found) {
found = true;
auto input_scale = out_op_info->GetInputScale(out_var_node_name);
if (!has_output_scale) {
output_scale = input_scale;
has_output_scale = true;
} else {
CHECK_EQ(output_scale, input_scale);
CHECK_EQ(output_scale.size(), input_scale.size());
}
}
if (found) {
if (has_output_scale) {
inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale);
} else if (op_info->HasAttr("out_threshold")) {
// Only consider one output; there is only one out_threshold
int bit_length = op_info->GetAttr<int>("bit_length");
int range = (1 << (bit_length - 1)) - 1;
output_scale = op_info->GetAttr<float>("out_threshold");
inst.mutable_op_info()->SetOutputScale(out_var_node_name,
output_scale / range);
output_scale = std::vector<float>{
op_info->GetAttr<float>("out_threshold") / range};
inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale);
} else {
is_quantized = false;
}
......
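The loop above now keeps an op for further processing as long as at least one of its inputs carries a scale, rather than requiring every input to. A minimal self-contained sketch of the two folds, with hypothetical input names and flags:

#include <vector>

int main() {
  // Suppose only the "X" input of an op has a recorded scale; "W" does not.
  std::vector<bool> input_has_scale = {true /* X */, false /* W */};
  bool all_have_scale = true;   // old behaviour: this op would be skipped
  bool any_has_scale = false;   // new behaviour: this op is still processed
  for (bool s : input_has_scale) {
    all_have_scale = all_have_scale && s;  // ends up false
    any_has_scale = any_has_scale || s;    // ends up true
  }
  return (any_has_scale && !all_have_scale) ? 0 : 1;
}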
......@@ -119,7 +119,7 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
instruct.mutable_op_info()->SetOutputScale(
out_node_name,
one_adj_instruct.op_info()->GetInputScale<float>(out_node_name));
one_adj_instruct.op_info()->GetInputScale(out_node_name));
auto update_desc = *instruct.mutable_op_info();
instruct.ResetOp(update_desc, graph->valid_places());
......
......@@ -452,39 +452,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names",
output_var_names);
// Set input/output scale values of input/output var nodes for
// type_precision_cast_pass.
std::vector<float> input_data_scales;
std::vector<float> output_data_scales;
for (auto &var_node : input_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->outlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasInputScale(var_node_name)) {
input_data_scales.push_back(
any_inst.op_info()->GetInputScale<float>(var_node_name));
}
}
for (auto &var_node : output_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->inlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasOutputScale(var_node_name)) {
output_data_scales.push_back(
any_inst.op_info()->GetOutputScale<float>(var_node_name));
}
}
if (input_data_scales.size() > 0) {
subgraph_op_desc.SetAttr<std::vector<float>>("input_data_scales",
input_data_scales);
}
if (output_data_scales.size() > 0) {
subgraph_op_desc.SetAttr<std::vector<float>>("output_data_scales",
output_data_scales);
}
// Set all of the inputs and outputs to the target subgraph op
// To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram()
for (auto &var_node : weight_var_nodes) {
......@@ -504,6 +471,29 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
subgraph_op->Attach(subgraph_op_desc, any_op->scope());
// Export the scale values of the input/output var nodes of the inner op nodes
// only for type_precision_cast_pass.
for (auto &var_node : input_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->outlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasInputScale(var_node_name)) {
subgraph_op->mutable_op_info()->SetInputScale(
var_node_name, any_inst.op_info()->GetInputScale(var_node_name));
}
}
for (auto &var_node : output_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->inlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasOutputScale(var_node_name)) {
subgraph_op->mutable_op_info()->SetOutputScale(
var_node_name, any_inst.op_info()->GetOutputScale(var_node_name));
}
}
// Create and add a new subgraph node into the graph
auto subgraph_op_node =
graph->GraphCreateInstructNode(subgraph_op, any_op->valid_places());
......
......@@ -66,65 +66,30 @@ void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) {
}
}
// Infer the scale value for the new calib op from the subgraph op
static bool InferScaleFromSubgraph(std::string var_name,
const OpInfo* op_info,
float* scale,
bool reverse = false) {
std::string attr_name = reverse ? "output_data_names" : "input_data_names";
if (!op_info->HasAttr(attr_name)) return false;
auto input_or_output_names =
op_info->GetAttr<std::vector<std::string>>(attr_name);
attr_name = reverse ? "output_data_scales" : "input_data_scales";
if (!op_info->HasAttr(attr_name)) return false;
auto input_or_output_scales = op_info->GetAttr<std::vector<float>>(attr_name);
auto size = input_or_output_names.size();
CHECK(size == input_or_output_scales.size());
for (size_t i = 0; i < size; i++) {
if (input_or_output_names[i] == var_name) {
*scale = input_or_output_scales[i];
return true;
}
}
return false;
}
// Infer the scale value for the new calib op from the input_scale of the
// current op and output_scale of the previous op.
// case 1: prev_op->var_node->op_node(int8->any op, with input_scale).
// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with
// input_data_scales).
// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any,
// case 2: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any,
// without input_scale).
// case 4: prev_op(any->int8, subgraph_op, with
// output_data_scales)->var_node->op_node(fp32->any, without input_scale).
static bool InferScale(Node* var_node, Node* op_node, float* scale) {
bool found = false;
auto& inst = op_node->AsStmt();
auto op_info = inst.op_info();
auto op_type = op_info->Type();
auto var_name = var_node->AsArg().name;
if (op_type == "subgraph") {
found = InferScaleFromSubgraph(var_name, op_info, scale, false);
if (op_info->HasInputScale(var_name)) {
*scale = op_info->GetInputScale(var_name)[0];
found = true;
} else {
if (op_info->HasInputScale(var_name)) {
*scale = op_info->GetInputScale<float>(var_name);
// Obtain the output_scale from one of its previous Ops
auto prev_op_node = var_node->inlinks.front();
CHECK(prev_op_node->IsStmt());
auto& prev_inst = prev_op_node->AsStmt();
auto prev_op_info = prev_inst.op_info();
auto prev_op_type = prev_op_info->Type();
if (prev_op_info->HasOutputScale(var_name)) {
*scale = prev_op_info->GetOutputScale(var_name)[0];
found = true;
} else {
// Obtain the output_scale from one of its previous Ops
auto prev_op_node = var_node->inlinks.front();
CHECK(prev_op_node->IsStmt());
auto& prev_inst = prev_op_node->AsStmt();
auto prev_op_info = prev_inst.op_info();
auto prev_op_type = prev_op_info->Type();
if (prev_op_type == "subgraph") {
found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true);
} else {
if (prev_op_info->HasOutputScale(var_name)) {
*scale = prev_op_info->GetOutputScale<float>(var_name);
found = true;
}
}
}
}
return found;
......
......@@ -22,7 +22,7 @@
namespace paddle {
namespace lite {
std::string int2string(int index) {
static std::string int2string(int index) {
const int BUFFER_LENGTH = 30;
char buffer[BUFFER_LENGTH];
int num = snprintf(buffer, sizeof(buffer), "%d", index);
......@@ -262,17 +262,6 @@ bool OpInfo::HasOutputScale(const std::string &output_name) const {
}
}
template <>
void OpInfo::SetInputScale(const std::string &input_name,
const float &scale_value) {
std::string argname;
int index;
CHECK(GetInputArgname(input_name, &argname));
CHECK(GetInputIndex(input_name, &index));
SetAttr<float>(argname + int2string(index) + "_scale", scale_value);
}
template <>
void OpInfo::SetInputScale(const std::string &input_name,
const std::vector<float> &scale_value) {
std::string argname;
......@@ -283,25 +272,31 @@ void OpInfo::SetInputScale(const std::string &input_name,
scale_value);
}
template <>
void OpInfo::SetOutputScale(const std::string &output_name,
const float &scale_value) {
const std::vector<float> &scale_value) {
std::string argname;
int index;
CHECK(GetOutputArgname(output_name, &argname));
CHECK(GetOutputIndex(output_name, &index));
SetAttr<float>(argname + int2string(index) + "_scale", scale_value);
SetAttr<std::vector<float>>(argname + int2string(index) + "_scale",
scale_value);
}
template <>
void OpInfo::SetOutputScale(const std::string &output_name,
const std::vector<float> &scale_value) {
std::vector<float> OpInfo::GetInputScale(const std::string &input_name) const {
std::string argname;
int index;
CHECK(GetInputArgname(input_name, &argname));
CHECK(GetInputIndex(input_name, &index));
return GetAttr<std::vector<float>>(argname + int2string(index) + "_scale");
}
std::vector<float> OpInfo::GetOutputScale(
const std::string &output_name) const {
std::string argname;
int index;
CHECK(GetOutputArgname(output_name, &argname));
CHECK(GetOutputIndex(output_name, &index));
SetAttr<std::vector<float>>(argname + int2string(index) + "_scale",
scale_value);
return GetAttr<std::vector<float>>(argname + int2string(index) + "_scale");
}
} // namespace lite
......
......@@ -30,8 +30,6 @@
namespace paddle {
namespace lite {
std::string int2string(int index);
// For registry factory.
struct Registry {
void Touch() {}
......@@ -231,38 +229,6 @@ class OpInfo : public cpp::OpDesc {
return OutputArgumentNames();
}
bool GetInputArgname(const std::string &value_name, std::string *out) const;
bool GetOutputArgname(const std::string &value_name, std::string *out) const;
bool GetInputIndex(const std::string &input_name, int *out) const;
bool GetOutputIndex(const std::string &output_name, int *out) const;
bool HasInputScale(const std::string &input_name) const;
bool HasOutputScale(const std::string &output_name) const;
template <typename T>
void SetInputScale(const std::string &input_name, const T &scale_value);
template <typename T>
void SetOutputScale(const std::string &output_name, const T &scale_value);
template <typename T>
T GetInputScale(const std::string &input_name) const {
std::string argname;
int index;
CHECK(GetInputArgname(input_name, &argname));
CHECK(GetInputIndex(input_name, &index));
return GetAttr<T>(argname + int2string(index) + "_scale");
}
template <typename T>
T GetOutputScale(const std::string &output_name) const {
std::string argname;
int index;
CHECK(GetOutputArgname(output_name, &argname));
CHECK(GetOutputIndex(output_name, &index));
return GetAttr<T>(argname + int2string(index) + "_scale");
}
void UpdateAllInputs(const std::string &from, const std::string &to) {
for (auto &item : inputs_) {
for (auto &var : item.second) {
......@@ -278,6 +244,26 @@ class OpInfo : public cpp::OpDesc {
}
}
}
bool GetInputArgname(const std::string &value_name, std::string *out) const;
bool GetOutputArgname(const std::string &value_name, std::string *out) const;
bool GetInputIndex(const std::string &input_name, int *out) const;
bool GetOutputIndex(const std::string &output_name, int *out) const;
bool HasInputScale(const std::string &input_name) const;
bool HasOutputScale(const std::string &output_name) const;
void SetInputScale(const std::string &input_name,
const std::vector<float> &scale_value);
void SetOutputScale(const std::string &output_name,
const std::vector<float> &scale_value);
// For conv2d, depthwise_conv2d and mul, the weight scales form a vector.
// Otherwise, all input and output scales are scalars, but we still store them
// as vectors.
std::vector<float> GetInputScale(const std::string &input_name) const;
std::vector<float> GetOutputScale(const std::string &output_name) const;
};
} // namespace lite
......
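The scale accessors declared above are now vector-only: per-channel weight scales and scalar activation scales share one interface, with scalars stored as one-element vectors and read back via index 0. A minimal usage sketch under that assumption; the tensor names "x0" and "w0" are hypothetical and must already be registered as inputs of the op, and the include is whichever header declares lite::OpInfo in this tree:

#include <vector>

// Sketch only: exercises the vector-based scale API shown above.
void SetAndReadScales(paddle::lite::OpInfo* info) {
  info->SetInputScale("x0", {0.017f});               // activation: one-element vector
  info->SetInputScale("w0", {0.01f, 0.02f, 0.03f});  // conv/mul weight: per-channel vector
  if (info->HasInputScale("x0") && info->HasInputScale("w0")) {
    float x_scale = info->GetInputScale("x0")[0];    // scalar scales are read via [0]
    std::vector<float> w_scale = info->GetInputScale("w0");
    VLOG(3) << "x_scale: " << x_scale << ", w_scale size: " << w_scale.size();
  }
}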
......@@ -67,7 +67,7 @@ framework::proto::VarType::Type ToDataType(std::type_index type) {
if (it != gDataTypeMap().cpp_to_proto_.end()) {
return it->second;
}
PADDLE_THROW("Not support %s as tensor type", type.name());
LOG(FATAL) << "Not support " << type.name() << " as tensor type";
return static_cast<framework::proto::VarType::Type>(-1);
}
......@@ -76,8 +76,8 @@ std::type_index ToTypeIndex(framework::proto::VarType::Type type) {
if (it != gDataTypeMap().proto_to_cpp_.end()) {
return it->second;
}
PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type",
static_cast<int>(type));
LOG(FATAL) << "Not support framework::proto::VarType::Type("
<< static_cast<int>(type) << ") as tensor type";
return std::type_index(typeid(void));
}
......@@ -86,8 +86,8 @@ std::string DataTypeToString(const framework::proto::VarType::Type type) {
if (it != gDataTypeMap().proto_to_str_.end()) {
return it->second;
}
PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type",
static_cast<int>(type));
LOG(FATAL) << "Not support framework::proto::VarType::Type("
<< static_cast<int>(type) << ") as tensor type";
return std::string();
}
......@@ -96,7 +96,8 @@ size_t SizeOfType(framework::proto::VarType::Type type) {
if (it != gDataTypeMap().proto_to_size_.end()) {
return it->second;
}
PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type).c_str());
LOG(FATAL) << "Not support " << DataTypeToString(type).c_str()
<< " as tensor type";
return 0;
}
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <typeindex>
#include "lite/core/framework.pb.h"
#include "lite/fluid/float16.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -72,7 +72,7 @@ inline void VisitDataType(framework::proto::VarType::Type type,
_ForEachDataType_(VisitDataTypeCallback);
#undef VisitDataTypeCallback
PADDLE_THROW("Not supported %d", type);
LOG(FATAL) << "Not supported " << type;
}
extern std::string DataTypeToString(const framework::proto::VarType::Type type);
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <vector>
#include "lite/core/tensor.h"
#include "lite/fluid/float16.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
......@@ -30,7 +30,7 @@ struct EigenDim {
using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
static Type From(const lite::DDim& dims) {
PADDLE_ENFORCE_EQ(dims.size(), D, "D must match DDim::size");
CHECK_EQ(dims.size(), D) << "D must match DDim::size";
Type ret;
for (size_t d = 0; d < dims.size(); d++) {
ret[d] = dims[d];
......@@ -39,7 +39,7 @@ struct EigenDim {
}
static Type From(const DDim::value_type length) {
PADDLE_ENFORCE_EQ(D, 1, "D must be 1.");
CHECK_EQ(D, 1) << "D must be 1.";
Type ret;
ret[0] = length;
return ret;
......@@ -84,16 +84,16 @@ struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT
int num_col_dims) {
int rank = tensor.dims().size();
PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
"`num_col_dims` must be between (0, rank_of_tensor).");
CHECK(num_col_dims > 0 && num_col_dims < rank)
<< "`num_col_dims` must be between (0, rank_of_tensor).";
return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims));
}
static typename EigenMatrix::ConstType Reshape(const Tensor& tensor,
int num_col_dims) {
int rank = tensor.dims().size();
PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
"`num_col_dims` must be between (0, rank_of_tensor).");
CHECK(num_col_dims > 0 && num_col_dims < rank)
<< "`num_col_dims` must be between (0, rank_of_tensor).";
return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims));
}
};
......
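For orientation, Reshape above only checks 0 < num_col_dims < rank and then delegates to DDim::Flatten2D; assuming Flatten2D(k) collapses the first k dimensions into the row count and the remaining dimensions into the column count, a tensor of shape [2, 3, 4, 5] reshaped with num_col_dims = 2 becomes a 6 x 20 Eigen matrix (2 * 3 = 6 rows, 4 * 5 = 20 columns).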
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#endif // !_WIN32
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
......@@ -33,17 +33,15 @@ struct RWLock {
~RWLock() { pthread_rwlock_destroy(&lock_); }
inline void RDLock() {
PADDLE_ENFORCE_EQ(
pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed");
CHECK_EQ(pthread_rwlock_rdlock(&lock_), 0) << "acquire read lock failed";
}
inline void WRLock() {
PADDLE_ENFORCE_EQ(
pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed");
CHECK_EQ(pthread_rwlock_wrlock(&lock_), 0) << "acquire write lock failed";
}
inline void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
CHECK_EQ(pthread_rwlock_unlock(&lock_), 0) << "unlock failed";
}
private:
......
......@@ -119,7 +119,7 @@ void DeserializeFromStream(
// the 1st field, unit32_t version for SelectedRows
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
CHECK_EQ(version, 0U) << "Only version 0 is supported";
}
{
// the 2st field, rows information
......@@ -163,24 +163,22 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key,
if (iter == id_to_index_.end()) {
rwlock_->UNLock();
if (!auto_grown) {
PADDLE_THROW("key %ld not found", key);
LOG(FATAL) << "key " << key << " not found";
}
rwlock_->WRLock();
auto map_size = id_to_index_.size();
auto vector_size = rows_.size();
if (map_size != vector_size) {
rwlock_->UNLock();
PADDLE_THROW(
"id_to_index_ size %lu should have the same size with rows_ %lu",
map_size,
vector_size);
LOG(FATAL) << "id_to_index_ size " << map_size
<< " should have the same size with rows_ " << vector_size;
}
auto write_iter = id_to_index_.find(key);
if (write_iter == id_to_index_.end()) {
int row_num = rows_.size();
if (row_num == value_->dims()[0]) {
rwlock_->UNLock();
PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
LOG(FATAL) << "selected rows is full, then length exceed " << row_num;
}
// key logic to put a key into id_to_index_
rows_.push_back(key);
......@@ -213,16 +211,14 @@ void SelectedRows::Get(const lite::Tensor& ids,
lite::Tensor* value,
bool auto_grown,
bool is_test) {
PADDLE_ENFORCE(value->IsInitialized(),
"The value tensor should be initialized.");
CHECK(value->IsInitialized()) << "The value tensor should be initialized.";
if (ids.numel() == 0) {
VLOG(3) << "keys is empty, please check data!";
} else {
int64_t value_width = value_->numel() / value_->dims()[0];
PADDLE_ENFORCE_EQ(value_width,
value->numel() / value->dims()[0],
"output tensor should have the same shape with table "
"except the dims[0].");
CHECK_EQ(value_width, value->numel() / value->dims()[0])
<< "output tensor should have the same shape with table "
"except the dims[0].";
for (int i = 0; i < ids.numel(); ++i) {
auto id = ids.data<int64_t>()[i];
int64_t index = AutoGrownIndex(id, auto_grown, is_test);
......
......@@ -82,7 +82,7 @@ class SelectedRows {
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
PADDLE_THROW("id %ld not in table", key);
LOG(FATAL) << "id " << key << " not in table";
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
......
......@@ -35,6 +35,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
int neuron_errCode;
VLOG(3) << "[APU] Converting [" << op_type << "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
......@@ -94,34 +97,18 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
input_dims,
filter_dims);
float input_scale;
float output_scale;
std::vector<float> weight_scale;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto input_name = op_info->Input("Input").front();
auto filter_name = op_info->Input("Filter").front();
auto output_name = op_info->Output("Output").front();
if (op_info->HasInputScale(input_name))
input_scale = op_info->GetInputScale<float>(input_name);
if (op_info->HasInputScale(filter_name))
weight_scale = op_info->GetInputScale<std::vector<float>>(filter_name);
if (op_info->HasOutputScale(output_name)) {
output_scale = op_info->GetOutputScale<float>(output_name);
}
VLOG(3) << "has output scale:" << output_scale;
} else {
return FAILED;
}
} else {
return FAILED;
}
CHECK(op_info->HasInputScale(input_name));
auto input_scale = op_info->GetInputScale(input_name)[0];
CHECK(op_info->HasInputScale(filter_name));
auto filter_scale = op_info->GetInputScale(filter_name);
CHECK(op_info->HasOutputScale(output_name));
auto output_scale = op_info->GetOutputScale(output_name)[0];
VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups
<< " ,dilations: " << dilations[0] << ":" << dilations[1];
VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type;
VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims
<< " ,weight_scale size: " << weight_scale.size();
<< " ,filter_scale size: " << filter_scale.size();
VLOG(3) << "filter_dims: " << filter_dims
<< " ,memory_size: " << filter->memory_size()
<< " ,data_size: " << filter->data_size();
......@@ -220,10 +207,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType filterType;
NeuronOperandType channelFilterType;
NeuronSymmPerChannelQuantParams symmPerChannelQuantParams;
if (1 == weight_scale.size()) {
if (1 == filter_scale.size()) {
// Per layer type
filterType.type = NEURON_TENSOR_QUANT8_ASYMM;
filterType.scale = weight_scale[0];
filterType.scale = filter_scale[0];
filterType.zeroPoint = 128;
filterType.dimensionCount = filter_dims.size();
filterType.dimensions = &dims_filter[0];
......@@ -241,17 +228,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
symmPerChannelQuantParams.channelDim = 3;
else
symmPerChannelQuantParams.channelDim = 0;
symmPerChannelQuantParams.scaleCount = weight_scale.size();
symmPerChannelQuantParams.scales = weight_scale.data();
symmPerChannelQuantParams.scaleCount = filter_scale.size();
symmPerChannelQuantParams.scales = filter_scale.data();
biasType.scale = 0;
}
std::shared_ptr<Node> filter_node = nullptr;
if (1 == weight_scale.size()) {
if (1 == filter_scale.size()) {
NeuronModel_addOperand(model, &filterType); // 1: filter
filter_node = graph->Add(filter_name, dims_filter);
VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]"
<< weight_scale[0] << ": filterType: " << filterType.dimensions[0]
VLOG(3) << "filter node idx: " << filter_node->index() << "filter_scale[0]"
<< filter_scale[0] << ": filterType: " << filterType.dimensions[0]
<< ":" << filterType.dimensions[1] << ":"
<< filterType.dimensions[2] << ":" << filterType.dimensions[3];
memcpy(filter->mutable_data<int8_t>(),
......@@ -267,8 +254,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronModel_addOperand(model, &channelFilterType); // 1: filter
filter_node = graph->Add(filter_name, dims_filter);
VLOG(3) << "chennel filter node idx: " << filter_node->index()
<< " ,scale_count:" << weight_scale.size()
<< " weight_scale[0]:" << weight_scale.data()[0]
<< " ,scale_count:" << filter_scale.size()
<< " filter_scale[0]:" << filter_scale.data()[0]
<< " ,channelFilterType: " << channelFilterType.dimensions[0] << ":"
<< channelFilterType.dimensions[1] << ":"
<< channelFilterType.dimensions[2] << ":"
......@@ -302,7 +289,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias_type = kernel->GetInputDeclType("Bias");
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
......@@ -368,10 +354,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
if (graph->IsOutput(output_name))
outType.scale = output_scale / 127;
else
outType.scale = output_scale;
outType.scale = output_scale;
outType.zeroPoint = 128;
outType.dimensionCount = output_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)output_dims[0],
......@@ -405,7 +388,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
int32_t* int32_bias_data =
reinterpret_cast<int32_t*>(bias->mutable_data<float>());
float2int32(
bias->data<float>(), input_scale, weight_scale, int32_bias_data);
bias->data<float>(), input_scale, filter_scale, int32_bias_data);
VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : "
<< int32_bias_data[1] << " : " << int32_bias_data[2] << " : "
......
......@@ -31,6 +31,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
......@@ -52,26 +56,12 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
<< " out_dims: " << out_dims << " m: " << m << " k: " << k
<< " n: " << n;
float input_scale = 1.0f;
float out_scale = 1.0f;
std::vector<float> w_scale;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto input_name = op_info->Input("Input").front();
auto weight_name = op_info->Input("W").front();
auto out_name = op_info->Output("Out").front();
if (op_info->HasInputScale(input_name))
input_scale = op_info->GetInputScale<float>(input_name);
if (op_info->HasInputScale(weight_name))
w_scale = op_info->GetInputScale<std::vector<float>>(weight_name);
if (op_info->HasOutputScale(out_name))
out_scale = op_info->GetOutputScale<float>(out_name);
} else {
return FAILED;
}
} else {
return FAILED;
}
CHECK(op_info->HasInputScale(input_name));
auto input_scale = op_info->GetInputScale(input_name)[0];
CHECK(op_info->HasInputScale(w_name));
auto w_scale = op_info->GetInputScale(w_name);
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
// Add input tensor type
NeuronOperandType inType;
......
......@@ -32,6 +32,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "] ";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
......@@ -87,24 +90,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
ksize);
// Add x tensor type
float x_scale = 1.0f;
float out_scale = 1.0f;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto x_name = op_info->Input("X").front();
auto out_name = op_info->Output("Out").front();
if (op_info->HasInputScale(x_name))
x_scale = op_info->GetInputScale<float>(x_name);
if (op_info->HasOutputScale(out_name))
out_scale = op_info->GetOutputScale<float>(out_name);
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
CHECK(op_info->HasInputScale(x_name));
auto x_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
NeuronOperandType xType;
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
......
......@@ -31,6 +31,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
......@@ -45,24 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis += x_rank;
}
float input_scale = 1.0f;
float out_scale = 1.0f;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto x_name = op_info->Input("X").front();
auto out_name = op_info->Output("Out").front();
if (op_info->HasInputScale(x_name))
input_scale = op_info->GetInputScale<float>(x_name);
if (op_info->HasOutputScale(out_name))
out_scale = op_info->GetOutputScale<float>(out_name);
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
CHECK(op_info->HasInputScale(x_name));
auto input_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
// Check output scale
NeuronOperandType xType;
......@@ -106,14 +95,14 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add out operand
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale / 127;
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = x_dims.size();
outType.dimensions = &dims_x[0];
NeuronModel_addOperand(model, &outType); // 3: output
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_x);
VLOG(3) << "output_scale: " << out_scale;
VLOG(3) << "out_scale: " << out_scale;
float beta_val[] = {1.0f};
NeuronModel_setOperandValue(
......
......@@ -153,18 +153,15 @@ int SubgraphEngine::LaunchDeviceProgram() {
}
// Set input buffer
Tensor input_temp;
for (size_t i = 0; i < origin_itensors_.size(); i++) {
input_temp.Resize({origin_idims_[i]});
uint8_t* input_data = input_temp.mutable_data<uint8_t>();
memcpy(input_data,
origin_itensors_[i]->raw_data(),
origin_itensors_[i]->memory_size());
auto origin_data = origin_itensors_[i]->mutable_data<int8_t>();
auto converted_data = reinterpret_cast<uint8_t*>(origin_data);
for (int j = 0; j < origin_itensors_[i]->data_size(); j++) {
input_data[j] += (uint8_t)128;
converted_data[j] =
static_cast<uint8_t>(static_cast<int16_t>(origin_data[j]) + 128);
}
NeuronExecution_setInput(
run, i, NULL, input_data, origin_itensors_[i]->memory_size());
run, i, NULL, converted_data, origin_itensors_[i]->memory_size());
}
// Set output buffer
......@@ -184,10 +181,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
}
for (size_t i = 0; i < origin_otensors_.size(); i++) {
int8_t* output_data = origin_otensors_[i]->mutable_data<int8_t>();
VLOG(3) << "output size:" << origin_otensors_[i]->memory_size();
auto converted_data = origin_otensors_[i]->mutable_data<int8_t>();
auto origin_data = reinterpret_cast<uint8_t*>(converted_data);
for (int j = 0; j < origin_otensors_[i]->data_size(); j++) {
output_data[j] -= (int8_t)128;
converted_data[j] =
static_cast<int8_t>(static_cast<int16_t>(origin_data[j]) - 128);
}
}
NeuronExecution_free(run);
......
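The input and output loops above shift int8 data into the uint8 range the APU expects (asymmetric quantization with zeroPoint = 128), widening to int16 first so the +128 / -128 shift cannot overflow. A standalone sketch of the same round trip, with made-up buffers:

#include <cstdint>
#include <vector>

// Shift int8 [-128, 127] into uint8 [0, 255] with zero point 128, and back.
std::vector<uint8_t> Int8ToUint8(const std::vector<int8_t>& in) {
  std::vector<uint8_t> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = static_cast<uint8_t>(static_cast<int16_t>(in[i]) + 128);
  }
  return out;
}

std::vector<int8_t> Uint8ToInt8(const std::vector<uint8_t>& in) {
  std::vector<int8_t> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = static_cast<int8_t>(static_cast<int16_t>(in[i]) - 128);
  }
  return out;
}

int main() {
  std::vector<int8_t> q = {-128, -1, 0, 127};
  std::vector<uint8_t> u = Int8ToUint8(q);    // {0, 127, 128, 255}
  std::vector<int8_t> back = Uint8ToInt8(u);  // round-trips to the original values
  return back == q ? 0 : 1;
}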
......@@ -6,6 +6,7 @@ message(STATUS "compile with lite CUDA kernels")
# basic kernels
add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(fc_compute_cuda CUDA basic SRCS fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps})
add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps})
......@@ -34,7 +35,10 @@ add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.
add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm ${math_cuda})
add_kernel(sequence_reverse_compute_cuda CUDA extra SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pad_compute_cuda CUDA extra SRCS sequence_pad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(sequence_unpad_compute_cuda CUDA extra SRCS sequence_unpad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(sequence_concat_compute_cuda CUDA extra SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_mask_compute_cuda CUDA extra SRCS sequence_mask_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_arithmetic_compute_cuda CUDA extra SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps})
add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps})
add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps})
......@@ -45,6 +49,7 @@ add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_ma
add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm)
add_kernel(var_conv_2d_compute_cuda CUDA extra SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(topk_pooling_compute_cuda CUDA extra SRCS topk_pooling_compute.cu DEPS ${lite_kernel_deps})
add_kernel(assign_value_compute_cuda CUDA extra SRCS assign_value_compute.cu DEPS ${lite_kernel_deps})
# unit test
lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda)
......@@ -61,7 +66,8 @@ nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute
nv_test(elementwise_compute_cuda_test SRCS elementwise_compute_test.cc DEPS elementwise_compute_cuda)
nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda)
#nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda)
nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda)
nv_test(fc_compute_cuda_test SRCS fc_compute_test.cc DEPS fc_compute_cuda)
nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda )
nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda)
#nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda)
......@@ -75,10 +81,14 @@ if(LITE_BUILD_EXTRA)
nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda)
nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda)
nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda)
nv_test(sequence_pad_compute_cuda_test SRCS sequence_pad_compute_test.cc DEPS sequence_pad_compute_cuda)
nv_test(sequence_unpad_compute_cuda_test SRCS sequence_unpad_compute_test.cc DEPS sequence_unpad_compute_cuda)
nv_test(sequence_mask_compute_cuda_test SRCS sequence_mask_compute_test.cc DEPS sequence_mask_compute_cuda)
nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda)
#nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda)
#nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda)
nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda)
#nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda)
nv_test(topk_pooling_compute_cuda_test SRCS topk_pooling_compute_test.cc DEPS topk_pooling_compute_cuda)
nv_test(assign_value_compute_cuda_test SRCS assign_value_compute_test.cc DEPS assign_value_compute_cuda)
endif()
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "lite/backends/cuda/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/types.h"
#include "lite/kernels/cuda/assign_value_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <class T>
void TensorFromVector(const std::vector<T>& src,
lite::Tensor* dst,
cudaStream_t* stream) {
auto* src_ptr = static_cast<const void*>(src.data());
auto* dst_ptr = static_cast<void*>(dst->mutable_data<T>(TARGET(kCUDA)));
auto size = src.size() * sizeof(T);
TargetWrapperCuda::MemcpyAsync(
dst_ptr, src_ptr, size, IoDirection::HtoD, *stream);
}
void AssignValueCompute::Run() {
auto& param = Param<operators::AssignValueParam>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
int dtype = param.dtype;
std::vector<float> fp32_values = param.fp32_values;
std::vector<int> int32_values = param.int32_values;
std::vector<int64_t> int64_values = param.int64_values;
std::vector<int> bool_values = param.bool_values;
auto* out = param.Out;
if (dtype == static_cast<int>(lite::core::FluidType::INT32)) {
TensorFromVector(int32_values, out, &stream);
} else if (dtype == static_cast<int>(lite::core::FluidType::FP32)) {
TensorFromVector(fp32_values, out, &stream);
} else if (dtype == static_cast<int>(lite::core::FluidType::INT64)) {
TensorFromVector(int64_values, out, &stream);
} else if (dtype == static_cast<int>(lite::core::FluidType::BOOL)) {
TensorFromVector(bool_values, out, &stream);
} else {
LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype;
}
return;
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(assign_value,
kCUDA,
kAny,
kNCHW,
paddle::lite::kernels::cuda::AssignValueCompute,
def)
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kAny))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,28 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines PADDLE_ENFORCE_xx, which helps to adapt the legacy fluid
* codes.
*/
#pragma once
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#include "lite/core/kernel.h"
#define PADDLE_ENFORCE(cond, ...) \
CHECK((cond)) << paddle::lite::string_format("" __VA_ARGS__);
#define PADDLE_ENFORCE_EQ(a, b, ...) \
CHECK_EQ((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
#define PADDLE_ENFORCE_LE(a, b, ...) \
CHECK_LE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
#define PADDLE_ENFORCE_LT(a, b, ...) \
CHECK_LT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
#define PADDLE_ENFORCE_GE(a, b, ...) \
CHECK_GE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
#define PADDLE_ENFORCE_GT(a, b, ...) \
CHECK_GT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
class AssignValueCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::AssignValueParam;
#ifndef PADDLE_THROW
#define PADDLE_THROW(...) printf("" __VA_ARGS__);
#endif
void Run() override;
virtual ~AssignValueCompute() = default;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/assign_value_compute.h"
#include <gtest/gtest.h>
#include <functional>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "lite/api/test_helper.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class AssignValueTest : public ::testing::Test {
protected:
AssignValueTest() : dtype(5), shape({1}) {
int num =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
fp32_values.resize(num);
int32_values.resize(num);
int64_values.resize(num);
bool_values.resize(num);
for (int i = 0; i < num; ++i) {
fp32_values[i] = i + 5;
int32_values[i] = i;
int64_values[i] = i;
bool_values[i] = i;
}
std::vector<int64_t> out_shape(shape.size(), 0);
for (size_t i = 0; i < shape.size(); ++i) out_shape[i] = shape[i];
Out_ref.Resize(lite::DDim(out_shape));
Out_gpu.Resize(Out_ref.dims());
Out_cpu.Resize(Out_ref.dims());
cpu_base(&Out_ref);
device_init();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.shape = shape;
param.dtype = dtype;
param.fp32_values = fp32_values;
param.int32_values = int32_values;
param.int64_values = int64_values;
param.bool_values = bool_values;
param.Out = &Out_gpu;
}
void float_data_init() {}
void half_data_init() {}
void cpu_base(lite::Tensor* Out) {
if (dtype == static_cast<int>(lite::core::FluidType::INT32)) {
for (size_t i = 0; i < int32_values.size(); ++i) {
Out->mutable_data<int>()[i] = int32_values[i];
}
} else if (dtype == static_cast<int>(lite::core::FluidType::FP32)) {
for (size_t i = 0; i < fp32_values.size(); ++i) {
Out->mutable_data<float>()[i] = fp32_values[i];
}
} else if (dtype == static_cast<int>(lite::core::FluidType::INT64)) {
for (size_t i = 0; i < int64_values.size(); ++i) {
Out->mutable_data<int64_t>()[i] = int64_values[i];
}
} else if (dtype == static_cast<int>(lite::core::FluidType::BOOL)) {
for (size_t i = 0; i < bool_values.size(); ++i) {
Out->mutable_data<bool>()[i] = bool_values[i];
}
} else {
LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype;
}
}
int dtype;
std::vector<int> shape;
std::vector<float> fp32_values;
std::vector<int> int32_values;
std::vector<int64_t> int64_values;
std::vector<int> bool_values;
lite::Tensor Out_ref;
lite::Tensor Out_gpu;
lite::Tensor Out_cpu;
operators::AssignValueParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
};
TEST_F(AssignValueTest, fp32) {
float_data_init();
AssignValueCompute kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
cudaDeviceSynchronize();
}
auto start = GetCurrentUS();
kernel.PrepareForRun();
for (int i = 0; i < FLAGS_repeats; ++i) {
kernel.Run();
}
cudaDeviceSynchronize();
auto duration = (GetCurrentUS() - start) / 1000.0;
LOG(INFO) << "fp32, warmup: " << FLAGS_warmup
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/fc_compute.h"
#include <string>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T>
struct FcTypeTraits;
template <>
struct FcTypeTraits<float> {
typedef float4 Type;
};
template <typename T>
__global__ void bias_v4(const int num, const T* bias, T* data, int K) {
CUDA_KERNEL_LOOP(index, num) {
int bias_idx = index % K;
const T bias_ptr = bias[bias_idx];
const T in_ptr = data[index];
T packed_val;
packed_val.x = in_ptr.x + bias_ptr.x;
packed_val.y = in_ptr.y + bias_ptr.y;
packed_val.z = in_ptr.z + bias_ptr.z;
packed_val.w = in_ptr.w + bias_ptr.w;
data[index] = packed_val;
}
}
template <typename T>
__global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) {
CUDA_KERNEL_LOOP(index, num) {
int bias_idx = index % K;
const T bias_ptr = bias[bias_idx];
const T in_ptr = data[index];
T packed_val;
packed_val.x = fmaxf(0.f, in_ptr.x + bias_ptr.x);
packed_val.y = fmaxf(0.f, in_ptr.y + bias_ptr.y);
packed_val.z = fmaxf(0.f, in_ptr.z + bias_ptr.z);
packed_val.w = fmaxf(0.f, in_ptr.w + bias_ptr.w);
data[index] = packed_val;
}
}
template <typename T>
__global__ void general_bias(const int num, const T* bias, T* data) {
int offset = blockIdx.x * num;
for (int i = threadIdx.x; i < num; i += blockDim.x) {
T temp;
#if __CUDA_ARCH__ >= 350
temp = __ldg(data + offset + i) + __ldg(bias + i);
#else
temp = data[offset + i] + bias[i];
#endif
data[offset + i] = temp;
}
}
template <typename T>
__global__ void general_relu_bias(const int num, const T* bias, T* data) {
int offset = blockIdx.x * num;
for (int i = threadIdx.x; i < num; i += blockDim.x) {
T temp;
#if __CUDA_ARCH__ >= 350
temp = __ldg(data + offset + i) + __ldg(bias + i);
#else
temp = data[offset + i] + bias[i];
#endif
data[offset + i] = static_cast<int>(temp > 0) * temp;
}
}
template <typename T, PrecisionType PType>
void FcCompute<T, PType>::PrepareForRun() {
gemm_impl_.reset(new lite::cuda::math::Gemm<T, T>);
}
template <typename T, PrecisionType PType>
void FcCompute<T, PType>::Run() {
auto& context = this->ctx_->template As<CUDAContext>();
auto stream = context.exec_stream();
auto& param = this->template Param<param_t>();
const auto* x_data = param.input->template data<T>();
const auto* w_data = param.w->template data<T>();
const auto* b_data = param.bias ? param.bias->template data<T>() : nullptr;
auto out_vec = param.output->dims().Vectorize();
out_vec.back() = param.w->dims()[1];
param.output->Resize(out_vec);
auto* out_data = param.output->template mutable_data<T>(TARGET(kCUDA));
int in_num_col_dims = param.in_num_col_dims;
int M = static_cast<int>(
param.input->dims().Slice(0, param.in_num_col_dims).production());
int K = static_cast<int>(
param.input->dims()
.Slice(param.in_num_col_dims, param.input->dims().size())
.production());
int K2 = static_cast<int>(param.w->dims()[0]);
int N = static_cast<int>(param.w->dims()[1]);
CHECK_EQ(K, K2) << "x_w must be equal with y_h";
CHECK(gemm_impl_->init(false, false, M, N, K, &context));
gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &context);
if (b_data == nullptr) {
return;
}
std::string activation_type = param.activation_type;
if (N % 4 == 0) {
const int threads = 256;
const int num = M * N / 4;
const int blocks = (num + threads - 1) / threads;
typedef typename FcTypeTraits<T>::Type trans_type;
const auto* bias_ptr_v4 = reinterpret_cast<const trans_type*>(b_data);
auto* data_ptr_v4 = reinterpret_cast<trans_type*>(out_data);
if (activation_type == "relu") {
bias_relu_v4<trans_type><<<blocks, threads, 0, stream>>>(
num, bias_ptr_v4, data_ptr_v4, N / 4);
} else if (activation_type == "") {
bias_v4<trans_type><<<blocks, threads, 0, stream>>>(
num, bias_ptr_v4, data_ptr_v4, N / 4);
} else {
LOG(FATAL) << "not supported activation type: " << activation_type;
}
} else {
const int threads = 256;
const int blocks = M;
if (activation_type == "relu") {
general_relu_bias<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
} else if (activation_type == "") {
general_bias<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
} else {
LOG(FATAL) << "not supported activation type: " << activation_type;
}
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
using FcFp32 = paddle::lite::kernels::cuda::FcCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(fc, kCUDA, kFloat, kNCHW, FcFp32, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
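For reference, with shapes like those used by the test below (input [128, 512], W [512, 64], in_num_col_dims = 1), the kernel above computes M = 128, K = 512, N = 64. Since N = 64 is a multiple of 4, the bias is added through the vectorized float4 path: num = M * N / 4 = 2048 elements, blocks = (2048 + 255) / 256 = 8, and the last kernel argument is N / 4 = 16; with activation_type == "relu" this selects bias_relu_v4, otherwise bias_v4.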
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "lite/backends/cuda/math/gemm.h"
#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T, PrecisionType PType>
class FcCompute : public KernelLite<TARGET(kCUDA), PType> {
public:
using param_t = operators::FcParam;
void PrepareForRun() override;
void Run() override;
virtual ~FcCompute() = default;
private:
std::unique_ptr<lite::cuda::math::Gemm<T, T>> gemm_impl_{nullptr};
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/fc_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "lite/api/test_helper.h"
#include "lite/utils/float16.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class FcTest : public ::testing::Test {
protected:
FcTest()
: m(128),
k(512),
n(64),
in_num_col_dims(1),
act_type("relu"),
x_shape({m, k}),
w_shape({k, n}),
b_shape({n}),
out_shape({m, n}) {
X_gpu.Resize(lite::DDim(x_shape));
X_ref.Resize(lite::DDim(x_shape));
W_gpu.Resize(lite::DDim(w_shape));
W_ref.Resize(lite::DDim(w_shape));
b_gpu.Resize(lite::DDim(b_shape));
b_ref.Resize(lite::DDim(b_shape));
auto x_ref_data = X_ref.mutable_data<float>();
auto w_ref_data = W_ref.mutable_data<float>();
auto b_ref_data = b_ref.mutable_data<float>();
// prepare input
for (int64_t i = 0; i < X_ref.numel(); i++) {
x_ref_data[i] = static_cast<float>(i % 10 * 0.2);
}
for (int64_t i = 0; i < W_ref.numel(); i++) {
w_ref_data[i] = static_cast<float>(i % 10 * 0.2);
}
for (int64_t i = 0; i < b_ref.numel(); i++) {
b_ref_data[i] = static_cast<float>(i % 10 * 0.2);
}
Out_ref.Resize(lite::DDim(out_shape));
Out_cpu.Resize(Out_ref.dims());
Out_gpu.Resize(Out_ref.dims());
fc_cpu_base(&X_ref, &W_ref, &b_ref, &Out_ref);
device_init();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.input = &X_gpu;
param.w = &W_gpu;
param.bias = &b_gpu;
param.in_num_col_dims = in_num_col_dims;
param.activation_type = act_type;
param.output = &Out_gpu;
}
void float_data_init() {
X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
X_gpu.dims());
W_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(W_ref.data<float>(),
W_gpu.dims());
b_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(b_ref.data<float>(),
b_gpu.dims());
}
void half_data_init() {
X_half.Resize(lite::DDim(x_shape));
auto x_half_data = X_half.mutable_data<half>();
for (int64_t i = 0; i < X_half.numel(); i++) {
x_half_data[i] = half(lite::float16(X_ref.data<float>()[i]));
}
X_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, X_gpu.dims());
W_half.Resize(W_ref.dims());
auto w_half_data = W_half.mutable_data<half>();
for (int64_t i = 0; i < W_half.numel(); i++) {
w_half_data[i] = half(lite::float16(W_ref.data<float>()[i]));
}
W_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(w_half_data, W_gpu.dims());
b_half.Resize(b_ref.dims());
auto b_half_data = b_half.mutable_data<half>();
for (int64_t i = 0; i < b_half.numel(); i++) {
b_half_data[i] = half(lite::float16(b_ref.data<float>()[i]));
}
b_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(b_half_data, b_gpu.dims());
}
void fc_cpu_base(const lite::Tensor* X,
const lite::Tensor* W,
const lite::Tensor* b,
lite::Tensor* Out) {
const float* data_in = X->data<float>();
const float* bias = b->data<float>();
const float* weights = W->data<float>();
float* data_out = Out->mutable_data<float>();
int out_rows = X->dims()[0];
int in_cols = X->numel() / out_rows;
int out_cols = W->numel() / in_cols;
int index_out;
for (int i = 0; i < out_rows; i++) {
for (int j = 0; j < out_cols; j++) {
index_out = i * out_cols + j;
data_out[index_out] = bias ? bias[j] : 0;
for (int k = 0; k < in_cols; k++) {
data_out[index_out] +=
data_in[i * in_cols + k] * weights[k * out_cols + j];
}
if (act_type == "relu") {
data_out[index_out] *= static_cast<int>(data_out[index_out] > 0);
}
}
}
}
int m, k, n, in_num_col_dims;
std::string act_type;
std::vector<int64_t> x_shape, w_shape, b_shape, out_shape;
lite::Tensor X_ref, W_ref, b_ref, Out_ref;
lite::Tensor X_gpu, W_gpu, b_gpu;
lite::Tensor X_half, W_half, b_half;
lite::Tensor Out_cpu, Out_gpu;
operators::FcParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
};
TEST_F(FcTest, TestFP32) {
float_data_init();
FcCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
cudaDeviceSynchronize();
}
auto start = GetCurrentUS();
kernel.PrepareForRun();
for (int i = 0; i < FLAGS_repeats; ++i) {
kernel.Run();
}
cudaDeviceSynchronize();
auto duration = (GetCurrentUS() - start) / 1000.0;
LOG(INFO) << "fp32, warmup: " << FLAGS_warmup
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
float res = Out_cpu.data<float>()[i];
float ref = Out_ref.data<float>()[i];
EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/sequence_mask_compute.h"
#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
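// One thread per output element: src holds the per-sequence lengths, dst is a
// row-major [num_sequences, maxlen] mask, and dst[i * maxlen + j] is 1 when
// j < src[i] and 0 otherwise.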
template <typename T>
__global__ void SequenceMaskKernel(T* dst,
const int64_t* src,
int count,
int maxlen) {
CUDA_KERNEL_LOOP(index, count) {
int src_idx = index / maxlen;
int inner_idx = index % maxlen;
dst[index] = static_cast<T>(inner_idx < src[src_idx] ? 1 : 0);
}
}
template <typename T, PrecisionType Ptype>
void SequenceMaskCompute<T, Ptype>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
const auto* x = param.X;
auto* x_data = x->template data<int64_t>();
auto* y = param.Y;
int maxlen = param.maxlen;
if (param.MaxLenTensor) {
auto* len_tensor_data = param.MaxLenTensor->template data<int32_t>();
int32_t len_data{0};
TargetWrapperCuda::MemcpySync(
&len_data, len_tensor_data, sizeof(int32_t), IoDirection::DtoH);
maxlen = len_data;
}
  if (maxlen < 0) {
    // Fall back to the longest sequence length; wrap the raw device pointer so
    // thrust dispatches the reduction to the CUDA backend.
    auto x_ptr = thrust::device_pointer_cast(x_data);
    maxlen = static_cast<int>(thrust::reduce(
        x_ptr, x_ptr + x->numel(), static_cast<int64_t>(0),
        thrust::maximum<int64_t>()));
  }
auto y_dim = x->dims().Vectorize();
y_dim.push_back(maxlen);
y->Resize(y_dim);
const int count = y->numel();
auto* dst_data = y->template mutable_data<float>(TARGET(kCUDA));
if (param.out_dtype == 5) {
SequenceMaskKernel<
float><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
dst_data, x_data, count, maxlen);
} else {
LOG(FATAL) << "not supported out_dtype: " << param.out_dtype;
}
CUDA_POST_KERNEL_CHECK;
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
using SeqMaskFp32 =
paddle::lite::kernels::cuda::SequenceMaskCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(sequence_mask, kCUDA, kFloat, kNCHW, SeqMaskFp32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))})
.BindInput("MaxLenTensor", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
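// A minimal host-side sketch of the sequence_mask semantics registered above,
// kept as a standalone snippet for reference; the helper name
// `sequence_mask_ref` and its std::vector interface are illustrative
// assumptions, not part of the Lite API. Given per-sequence lengths and a
// maxlen, it produces the same row-major [num_sequences, maxlen] mask as the
// CUDA kernel, including the fallback to the longest length when maxlen < 0.
#include <algorithm>
#include <vector>

inline std::vector<float> sequence_mask_ref(
    const std::vector<int64_t>& lengths, int maxlen) {
  if (maxlen < 0 && !lengths.empty()) {
    maxlen =
        static_cast<int>(*std::max_element(lengths.begin(), lengths.end()));
  }
  std::vector<float> mask(lengths.size() * maxlen, 0.f);
  for (size_t i = 0; i < lengths.size(); ++i) {
    for (int j = 0; j < maxlen; ++j) {
      mask[i * maxlen + j] = j < lengths[i] ? 1.f : 0.f;
    }
  }
  return mask;
}
// Example: lengths {3, 2, 1, 0} with maxlen 4 give rows {1,1,1,0}, {1,1,0,0},
// {1,0,0,0} and {0,0,0,0}, matching cpu_base in the test below.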
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T, PrecisionType Ptype>
class SequenceMaskCompute : public KernelLite<TARGET(kCUDA), Ptype> {
public:
using param_t = operators::SequenceMaskParam;
void Run() override;
virtual ~SequenceMaskCompute() = default;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "lite/api/test_helper.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/kernels/cuda/sequence_mask_compute.h"
// #include "lite/utils/float16.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class SequenceMaskTest : public ::testing::Test {
protected:
SequenceMaskTest()
: maxlen(4),
out_dtype(5),
x_data({3, 2, 1, 0}),
out_shape({static_cast<int64_t>(x_data.size()), maxlen}) {
X_ref.Resize(lite::DDim({static_cast<int64_t>(x_data.size())}));
X_gpu.Resize(X_ref.dims());
auto* x_ref_data = X_ref.mutable_data<int64_t>();
// prepare input
for (size_t i = 0; i < x_data.size(); i++) {
x_ref_data[i] = x_data[i];
}
Out_ref.Resize(lite::DDim(out_shape));
Out_gpu.Resize(Out_ref.dims());
Out_cpu.Resize(Out_ref.dims());
cpu_base(&X_ref, &Out_ref);
device_init();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.X = &X_gpu;
param.Y = &Out_gpu;
param.maxlen = maxlen;
param.out_dtype = out_dtype;
}
void float_data_init() {
X_gpu.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(X_ref.data<int64_t>(),
X_gpu.dims());
}
void half_data_init() {}
void cpu_base(const lite::Tensor* X, lite::Tensor* Out) {
auto* out_data = Out->mutable_data<float>();
for (size_t i = 0; i < x_data.size(); ++i) {
for (int j = 0; j < maxlen; ++j) {
out_data[i * maxlen + j] = j < x_data[i] ? 1 : 0;
}
}
}
int maxlen, out_dtype;
std::vector<int64_t> x_data, out_shape;
lite::Tensor X_ref, Out_ref;
lite::Tensor X_gpu, Out_gpu;
lite::Tensor Out_cpu;
operators::SequenceMaskParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
};
TEST_F(SequenceMaskTest, fp32) {
float_data_init();
SequenceMaskCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
cudaDeviceSynchronize();
}
auto start = GetCurrentUS();
kernel.PrepareForRun();
for (int i = 0; i < FLAGS_repeats; ++i) {
kernel.Run();
}
cudaDeviceSynchronize();
auto duration = (GetCurrentUS() - start) / 1000.0;
LOG(INFO) << "fp32, warmup: " << FLAGS_warmup
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/sequence_padding.h"
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
#include "lite/kernels/cuda/sequence_pad_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T, PrecisionType Ptype>
void SequencePadCompute<T, Ptype>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
const auto* x = param.X;
const auto* pad_value = param.PadValue;
auto* out = param.Out;
auto* len_t = param.Length;
int padded_length = param.padded_length;
int seq_num = x->lod()[0].size() - 1;
int max_seq_len = 0;
int step_width = x->numel() / x->dims()[0];
  // Compute per-sequence lengths (for param.Length) and LoD offsets on the host.
seq_len_.resize(seq_num);
seq_offsets_vec_.resize(x->lod()[0].size());
for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(
max_seq_len, static_cast<int>(x->lod()[0][i + 1] - x->lod()[0][i]));
seq_len_[i] = x->lod()[0][i + 1] - x->lod()[0][i];
seq_offsets_vec_[i] = x->lod()[0][i];
}
seq_offsets_vec_[seq_num] = x->lod()[0][seq_num];
TargetWrapperCuda::MemcpyAsync(
len_t->template mutable_data<int64_t>(TARGET(kCUDA)),
seq_len_.data(),
sizeof(int64_t) * seq_len_.size(),
IoDirection::HtoD,
stream);
seq_offsets_.Resize({static_cast<int64_t>(x->lod()[0].size())});
TargetWrapperCuda::MemcpyAsync(
seq_offsets_.mutable_data<size_t>(TARGET(kCUDA)),
seq_offsets_vec_.data(),
sizeof(size_t) * seq_offsets_vec_.size(),
IoDirection::HtoD,
stream);
const T* seq_data = x->template data<T>();
T* pad_data = out->template mutable_data<T>(TARGET(kCUDA));
const T* pad_value_data = pad_value->template data<T>();
lite::cuda::math::SequencePadding(pad_data,
seq_data,
pad_value_data,
pad_value->numel() == 1,
seq_offsets_.data<size_t>(),
seq_num,
padded_length,
step_width,
&stream);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
using SeqPadFp32 =
paddle::lite::kernels::cuda::SequencePadCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(sequence_pad, kCUDA, kFloat, kNCHW, SeqPadFp32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("PadValue", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Length", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
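// A minimal host-side sketch of the padding layout produced by the kernel
// registered above. The helper `sequence_pad_ref`, its flat std::vector
// interface and the offset-style `lod` argument are illustrative assumptions,
// not the Lite API: each LoD sequence of `step_width`-wide time steps is
// copied into a slot of `padded_length` steps, and the remaining steps are
// filled from the pad value (broadcast when it has a single element).
#include <vector>

inline std::vector<float> sequence_pad_ref(const std::vector<float>& seq_data,
                                           const std::vector<size_t>& lod,
                                           const std::vector<float>& pad_value,
                                           int padded_length,
                                           int step_width) {
  const int seq_num = static_cast<int>(lod.size()) - 1;
  std::vector<float> out(static_cast<size_t>(seq_num) * padded_length *
                         step_width);
  for (int i = 0; i < seq_num; ++i) {
    const int seq_len = static_cast<int>(lod[i + 1] - lod[i]);
    for (int step = 0; step < padded_length; ++step) {
      float* dst = &out[(i * padded_length + step) * step_width];
      for (int k = 0; k < step_width; ++k) {
        dst[k] = step < seq_len
                     ? seq_data[(lod[i] + step) * step_width + k]
                     : pad_value[pad_value.size() == 1 ? 0 : k];
      }
    }
  }
  return out;
}
// With lod {0, 2, 5}, step_width 2, padded_length 3 and pad_value {0, 1}, this
// reproduces the hard-coded expectations in SequencePadTest::cpu_base.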
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T, PrecisionType Ptype>
class SequencePadCompute : public KernelLite<TARGET(kCUDA), Ptype> {
public:
using param_t = operators::SequencePadParam;
void Run() override;
virtual ~SequencePadCompute() = default;
private:
lite::Tensor seq_offsets_;
std::vector<int64_t> seq_len_;
std::vector<size_t> seq_offsets_vec_;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/sequence_pad_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "lite/api/test_helper.h"
#include "lite/backends/cuda/cuda_utils.h"
// #include "lite/utils/float16.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class SequencePadTest : public ::testing::Test {
protected:
SequencePadTest()
: batch(5),
features(2),
padded_length(3),
x_lod({{0, 2, 5}}),
x_shape({batch, features}),
pad_value_shape({features}),
out_shape({static_cast<int64_t>(x_lod[0].size() - 1),
padded_length,
features}) {
X_ref.Resize(lite::DDim(x_shape));
X_ref.set_lod(x_lod);
X_gpu.Resize(X_ref.dims());
PadValue_ref.Resize(lite::DDim(pad_value_shape));
PadValue_gpu.Resize(PadValue_ref.dims());
    Length_ref.Resize(lite::DDim({static_cast<int64_t>(x_lod[0].size() - 1)}));
    Length_gpu.Resize(Length_ref.dims());
    Length_cpu.Resize(Length_ref.dims());
auto x_ref_data = X_ref.mutable_data<float>();
auto pad_value_ref_data = PadValue_ref.mutable_data<float>();
// prepare input
for (int64_t i = 0; i < X_ref.numel(); i++) {
x_ref_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < PadValue_ref.numel(); i++) {
pad_value_ref_data[i] = static_cast<float>(i);
}
Out_ref.Resize(lite::DDim(out_shape));
Out_gpu.Resize(Out_ref.dims());
Out_cpu.Resize(Out_ref.dims());
cpu_base(&X_ref, &PadValue_ref, &Out_ref, &Length_ref);
device_init();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.X = &X_gpu;
param.PadValue = &PadValue_gpu;
param.Length = &Length_gpu;
param.Out = &Out_gpu;
param.padded_length = padded_length;
}
void float_data_init() {
X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
X_gpu.dims());
X_gpu.set_lod(X_ref.lod());
PadValue_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(
PadValue_ref.data<float>(), PadValue_gpu.dims());
}
void half_data_init() {}
void cpu_base(const lite::Tensor* X,
const lite::Tensor* PadValue,
lite::Tensor* Out,
lite::Tensor* Length) {
auto* length_data = Length->mutable_data<int64_t>();
auto* out_data = Out->mutable_data<float>();
length_data[0] = 2;
length_data[1] = 3;
for (size_t i = 0; i < 4; ++i) {
out_data[i] = i;
}
out_data[4] = 0;
out_data[5] = 1;
for (size_t i = 4; i < 10; ++i) {
out_data[2 + i] = i;
}
}
int batch, features, padded_length;
LoD x_lod;
std::vector<int64_t> x_shape, pad_value_shape, out_shape;
lite::Tensor X_ref, PadValue_ref, Out_ref, Length_ref;
lite::Tensor X_gpu, PadValue_gpu, Out_gpu, Length_gpu;
lite::Tensor Out_cpu, Length_cpu;
operators::SequencePadParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
};
TEST_F(SequencePadTest, fp32) {
float_data_init();
SequencePadCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
cudaDeviceSynchronize();
}
auto start = GetCurrentUS();
kernel.PrepareForRun();
for (int i = 0; i < FLAGS_repeats; ++i) {
kernel.Run();
}
cudaDeviceSynchronize();
auto duration = (GetCurrentUS() - start) / 1000.0;
LOG(INFO) << "fp32, warmup: " << FLAGS_warmup
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
IoDirection::DtoH);
CopySync<TARGET(kCUDA)>(Length_cpu.mutable_data<int64_t>(),
Length_gpu.data<int64_t>(),
sizeof(int64_t) * Length_gpu.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
}
for (int i = 0; i < Length_gpu.numel(); ++i) {
EXPECT_NEAR(
Length_cpu.data<int64_t>()[i], Length_ref.data<int64_t>()[i], 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "lite/backends/cuda/math/sequence_padding.h"
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
#include "lite/kernels/cuda/sequence_unpad_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T, PrecisionType Ptype>
void SequenceUnpadCompute<T, Ptype>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
const auto* pad_tensor = param.X;
const auto* len_t = param.Length;
auto* seq_tensor = param.Out;
int padded_length = pad_tensor->dims()[1];
int seq_num = seq_tensor->lod()[0].size() - 1;
int max_seq_len = 0;
int step_width = seq_tensor->numel() / seq_tensor->dims()[0];
seq_offsets_vec_.resize(seq_tensor->lod()[0].size());
for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len,
static_cast<int>(seq_tensor->lod()[0][i + 1] -
seq_tensor->lod()[0][i]));
seq_offsets_vec_[i] = seq_tensor->lod()[0][i];
}
seq_offsets_vec_[seq_num] = seq_tensor->lod()[0][seq_num];
seq_offsets_.Resize({static_cast<int64_t>(seq_tensor->lod()[0].size())});
TargetWrapperCuda::MemcpyAsync(
seq_offsets_.mutable_data<size_t>(TARGET(kCUDA)),
seq_offsets_vec_.data(),
sizeof(size_t) * seq_offsets_vec_.size(),
IoDirection::HtoD,
stream);
const T* pad_data = pad_tensor->template data<T>();
T* seq_data = seq_tensor->template mutable_data<T>(TARGET(kCUDA));
lite::cuda::math::SequenceUnpadding(seq_data,
pad_data,
seq_offsets_.data<size_t>(),
seq_num,
padded_length,
step_width,
&stream);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
using SeqUnadFp32 =
paddle::lite::kernels::cuda::SequenceUnpadCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(sequence_unpad, kCUDA, kFloat, kNCHW, SeqUnadFp32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("Length", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
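// A minimal host-side sketch of the inverse operation registered above:
// sequence_unpad copies only the first lengths[i] time steps of each padded
// slot back into one contiguous LoD tensor. The helper `sequence_unpad_ref`
// and its flat std::vector interface are illustrative assumptions, not the
// Lite API.
#include <vector>

inline std::vector<float> sequence_unpad_ref(
    const std::vector<float>& pad_data,
    const std::vector<int64_t>& lengths,
    int padded_length,
    int step_width) {
  std::vector<float> out;
  for (size_t i = 0; i < lengths.size(); ++i) {
    for (int64_t step = 0; step < lengths[i]; ++step) {
      const float* src = &pad_data[(i * padded_length + step) * step_width];
      out.insert(out.end(), src, src + step_width);
    }
  }
  return out;
}
// With pad_data 0..11 viewed as [2, 3, 2], lengths {2, 3}, padded_length 3 and
// step_width 2, the result is {0,1,2,3, 6,7,8,9,10,11}, matching
// SequenceUnpadTest::cpu_base.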
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T, PrecisionType Ptype>
class SequenceUnpadCompute : public KernelLite<TARGET(kCUDA), Ptype> {
public:
using param_t = operators::SequenceUnpadParam;
void Run() override;
virtual ~SequenceUnpadCompute() = default;
private:
lite::Tensor seq_offsets_;
std::vector<size_t> seq_offsets_vec_;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/sequence_unpad_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "lite/api/test_helper.h"
#include "lite/backends/cuda/cuda_utils.h"
// #include "lite/utils/float16.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class SequenceUnpadTest : public ::testing::Test {
protected:
SequenceUnpadTest()
: batch(5),
features(2),
padded_length(3),
out_lod({{0, 2, 5}}),
x_shape({static_cast<int64_t>(out_lod[0].size() - 1),
padded_length,
features}),
out_shape({batch, features}) {
X_ref.Resize(lite::DDim(x_shape));
X_gpu.Resize(X_ref.dims());
Length_ref.Resize(
lite::DDim({static_cast<int64_t>(out_lod[0].size() - 1)}));
Length_gpu.Resize(Length_ref.dims());
auto* x_ref_data = X_ref.mutable_data<float>();
auto* length_ref_data = Length_ref.mutable_data<int64_t>();
// prepare input
for (int64_t i = 0; i < X_ref.numel(); i++) {
x_ref_data[i] = static_cast<float>(i);
}
for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
length_ref_data[i] = out_lod[0][i + 1] - out_lod[0][i];
}
Out_ref.Resize(lite::DDim(out_shape));
Out_ref.set_lod(out_lod);
Out_gpu.Resize(Out_ref.dims());
Out_gpu.set_lod(Out_ref.lod());
Out_cpu.Resize(Out_ref.dims());
Out_cpu.set_lod(Out_ref.lod());
cpu_base(&X_ref, &Length_ref, &Out_ref);
device_init();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.X = &X_gpu;
param.Length = &Length_gpu;
param.Out = &Out_gpu;
}
void float_data_init() {
X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
X_gpu.dims());
Length_gpu.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(
Length_ref.data<int64_t>(), Length_gpu.dims());
}
void half_data_init() {}
void cpu_base(const lite::Tensor* X,
const lite::Tensor* Length,
lite::Tensor* Out) {
auto* out_data = Out->mutable_data<float>();
for (size_t i = 0; i < 4; ++i) {
out_data[i] = i;
}
for (size_t i = 6; i < 12; ++i) {
out_data[i - 2] = i;
}
}
int batch, features, padded_length;
LoD out_lod;
std::vector<int64_t> x_shape, out_shape;
lite::Tensor X_ref, Out_ref, Length_ref;
lite::Tensor X_gpu, Out_gpu, Length_gpu;
lite::Tensor Out_cpu, Length_cpu;
  operators::SequenceUnpadParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
};
TEST_F(SequenceUnpadTest, fp32) {
float_data_init();
SequenceUnpadCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
cudaDeviceSynchronize();
}
auto start = GetCurrentUS();
kernel.PrepareForRun();
for (int i = 0; i < FLAGS_repeats; ++i) {
kernel.Run();
}
cudaDeviceSynchronize();
auto duration = (GetCurrentUS() - start) / 1000.0;
LOG(INFO) << "fp32, warmup: " << FLAGS_warmup
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -13,17 +13,20 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/kernels/cuda/transpose_compute.h"
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/transpose_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
void TransposeCompute::Run() {
auto& param = this->Param<param_t>();
template <typename T, PrecisionType Ptype>
void TransposeCompute<T, Ptype>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
......@@ -31,8 +34,8 @@ void TransposeCompute::Run() {
lite::Tensor* Out = param.output;
std::vector<int> axes = param.axis;
const float* in = X->data<float>();
float* out = Out->mutable_data<float>(TARGET(kCUDA));
const T* in = X->template data<T>();
T* out = Out->mutable_data<T>(TARGET(kCUDA));
int ndim = X->dims().size();
std::vector<int64_t> dims = X->dims().data();
......@@ -65,34 +68,31 @@ void TransposeCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(transpose,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::TransposeCompute,
def)
using TransFp32 =
paddle::lite::kernels::cuda::TransposeCompute<float, PRECISION(kFloat)>;
using TransFp16 =
paddle::lite::kernels::cuda::TransposeCompute<half, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(transpose, kCUDA, kFloat, kNCHW, TransFp32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
REGISTER_LITE_KERNEL(transpose2,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::TransposeCompute,
def)
REGISTER_LITE_KERNEL(transpose2, kCUDA, kFloat, kNCHW, TransFp32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// REGISTER_LITE_KERNEL(transpose2,
// kCUDA,
// kFloat,
// kNCHW,
// paddle::lite::kernels::cuda::TransposeCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
// .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))})
// .Finalize();
REGISTER_LITE_KERNEL(transpose, kCUDA, kFP16, kNCHW, TransFp16, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
.Finalize();
REGISTER_LITE_KERNEL(transpose2, kCUDA, kFP16, kNCHW, TransFp16, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
.BindOutput("XShape",
{LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
.Finalize();
......@@ -21,7 +21,8 @@ namespace lite {
namespace kernels {
namespace cuda {
class TransposeCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
template <typename Dtype, PrecisionType Ptype>
class TransposeCompute : public KernelLite<TARGET(kCUDA), Ptype> {
public:
using param_t = operators::TransposeParam;
......@@ -29,7 +30,7 @@ class TransposeCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
virtual ~TransposeCompute() = default;
private:
lite::cuda::math::Transpose<float> trans;
lite::cuda::math::Transpose<Dtype> trans;
};
} // namespace cuda
......
......@@ -13,11 +13,16 @@
// limitations under the License.
#include "lite/kernels/cuda/transpose_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include <vector>
#include "lite/api/test_helper.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/utils/float16.h"
namespace paddle {
namespace lite {
namespace kernels {
......@@ -89,7 +94,7 @@ void nhwc2nchw_ref(lite::Tensor* input,
}
}
void transpose_ref(lite::Tensor* input,
void transpose_ref(const lite::Tensor* input,
lite::Tensor* output,
const std::vector<int> axes) {
auto* input_data = input->data<float>();
......@@ -123,7 +128,7 @@ void transpose_ref(lite::Tensor* input,
} // namespace
TEST(transpose_nchw, normal) {
TransposeCompute transpose_kernel;
TransposeCompute<float, PRECISION(kFloat)> transpose_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
......@@ -177,7 +182,7 @@ TEST(transpose_nchw, normal) {
}
TEST(transpose_nhwc, normal) {
TransposeCompute transpose_kernel;
TransposeCompute<float, PRECISION(kFloat)> transpose_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
......@@ -228,54 +233,139 @@ TEST(transpose_nhwc, normal) {
}
}
TEST(transpose, normal) {
TransposeCompute transpose_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
class TransposeTest : public ::testing::Test {
protected:
TransposeTest()
: C(3),
H(128),
W(64),
axes({1, 2, 0}),
x_shape({C, H, W}),
out_shape({H, W, C}) {
X_ref.Resize(lite::DDim(x_shape));
X_gpu.Resize(X_ref.dims());
auto x_ref_data = X_ref.mutable_data<float>();
// prepare input
for (int64_t i = 0; i < X_ref.numel(); i++) {
x_ref_data[i] = static_cast<float>(i);
}
operators::TransposeParam param;
Out_ref.Resize(lite::DDim(out_shape));
Out_gpu.Resize(Out_ref.dims());
Out_cpu.Resize(Out_ref.dims());
cpu_base(&X_ref, &Out_ref);
lite::Tensor x, x_cpu, x_ref;
lite::Tensor out, out_cpu, out_ref;
int C = 3, H = 128, W = 128;
std::vector<int> axes({2, 0, 1});
x.Resize({C, H, W});
out.Resize({W, C, H});
device_init();
}
x_cpu.Resize({C, H, W});
out_cpu.Resize({W, C, H});
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.x = &X_gpu;
param.output = &Out_gpu;
param.axis = axes;
}
x_ref.Resize({C, H, W});
out_ref.Resize({W, C, H});
void float_data_init() {
X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
X_gpu.dims());
}
auto* x_cpu_data = x_cpu.mutable_data<float>();
auto* out_cpu_data = out_cpu.mutable_data<float>();
auto* x_ref_data = x_ref.mutable_data<float>();
void half_data_init() {
X_half.Resize(lite::DDim(X_ref.dims()));
auto x_half_data = X_half.mutable_data<half>();
for (int64_t i = 0; i < X_half.numel(); i++) {
x_half_data[i] = half(lite::float16(X_ref.data<float>()[i]));
}
X_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, X_gpu.dims());
}
for (int i = 0; i < x_cpu.numel(); ++i) {
x_cpu_data[i] = i + 1;
x_ref_data[i] = i + 1;
void cpu_base(const lite::Tensor* X, lite::Tensor* Out) {
transpose_ref(X, Out, axes);
}
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
param.x = &x;
param.output = &out;
param.axis = axes;
transpose_kernel.SetParam(param);
int C, H, W;
std::vector<int> axes;
std::vector<int64_t> x_shape, out_shape;
lite::Tensor X_ref, Out_ref;
lite::Tensor X_gpu, Out_gpu;
lite::Tensor X_half;
lite::Tensor Out_cpu;
operators::TransposeParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
transpose_kernel.SetContext(std::move(ctx));
transpose_kernel.Launch();
};
TEST_F(TransposeTest, fp32) {
float_data_init();
TransposeCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
cudaDeviceSynchronize();
}
auto start = GetCurrentUS();
kernel.PrepareForRun();
for (int i = 0; i < FLAGS_repeats; ++i) {
kernel.Run();
}
cudaDeviceSynchronize();
auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
CopySync<TARGET(kCUDA)>(
out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
transpose_ref(&x_ref, &out_ref, axes);
auto* out_ref_data = out_ref.mutable_data<float>();
for (int i = 0; i < out.numel(); i++) {
EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5);
auto duration = (GetCurrentUS() - start) / 1000.0;
LOG(INFO) << "fp32, warmup: " << FLAGS_warmup
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
}
}
TEST_F(TransposeTest, TestFP16) {
half_data_init();
TransposeCompute<half, PRECISION(kFP16)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
cudaDeviceSynchronize();
}
auto start = GetCurrentUS();
kernel.PrepareForRun();
for (int i = 0; i < FLAGS_repeats; ++i) {
kernel.Run();
}
cudaDeviceSynchronize();
auto duration = (GetCurrentUS() - start) / 1000.0;
LOG(INFO) << "fp16, warmup: " << FLAGS_warmup
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
const half* out_gpu_data = Out_gpu.data<half>();
half* out_cpu_data = Out_cpu.mutable_data<half>();
CopySync<TARGET(kCUDA)>(out_cpu_data,
out_gpu_data,
sizeof(half) * Out_gpu.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_cpu.numel(); ++i) {
float res = static_cast<float>(lite::float16(out_cpu_data[i]));
float ref = Out_ref.data<float>()[i];
EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2);
}
}
......
......@@ -22,7 +22,6 @@ limitations under the License. */
#include "lite/fluid/for_range.h"
#include "lite/fluid/transform.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/variant.h"
namespace paddle {
......@@ -66,9 +65,8 @@ inline void get_mid_dims(const lite::DDim &x_dims,
for (size_t i = 0; i < y_dims.size(); ++i) {
if (x_dims[i + axis] != y_dims[i]) {
// only support single y_dims[i] = 1 now.
PADDLE_ENFORCE_EQ(
*mid_flag, 0, "Broadcast support y_dims with single 1.");
PADDLE_ENFORCE_EQ(y_dims[i], 1, "Broadcast dimension mismatch.");
CHECK_EQ(*mid_flag, 0) << "Broadcast support y_dims with single 1.";
CHECK_EQ(y_dims[i], 1) << "Broadcast dimension mismatch.";
// m*n*k m*1*k
for (size_t j = 0; j < i; ++j) {
(*pre) *= y_dims[j];
......@@ -95,8 +93,7 @@ inline void get_mid_dims(const lite::DDim &x_dims,
}
for (size_t i = 0; i < y_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(
x_dims[i + axis], y_dims[i], "Broadcast dimension mismatch.");
CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch.";
(*n) *= y_dims[i];
}
......@@ -314,17 +311,16 @@ void ElementwiseComputeEx(const lite::Context<Target> &ctx,
TransformFunctor<Functor, T, Target, OutType> functor(x, y, z, ctx, func);
auto x_dims = x->dims();
auto y_dims_untrimed = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(),
y_dims_untrimed.size(),
"Rank of first input must >= rank of second input.");
CHECK_GE(x_dims.size(), y_dims_untrimed.size())
<< "Rank of first input must >= rank of second input.";
if (x_dims == y_dims_untrimed) {
functor.Run();
return;
}
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < static_cast<int>(x_dims.size()),
"Axis should be in range [0, x_dims)");
CHECK(axis >= 0 && axis < static_cast<int>(x_dims.size()))
<< "Axis should be in range [0, x_dims)";
auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post, mid_flag = 0;
......@@ -560,9 +556,8 @@ void FusedElemwiseAndActComputeEx(const lite::Context<Target> &ctx,
lite::Tensor *out,
lite::Tensor *intermediate_out) {
if (KeepIntermediateOut) {
PADDLE_ENFORCE(intermediate_out,
"The save_intermediate_out is opened, "
"intermediate_out should not be nullptr.");
CHECK(intermediate_out) << "The save_intermediate_out is opened, "
"intermediate_out should not be nullptr.";
}
const lite::DDim &x_dim = x.dims();
......
......@@ -63,10 +63,10 @@ class LayerNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
out.ShareDataWith(*y);
out.Resize(matrix_shape);
PADDLE_ENFORCE_EQ(Mean->numel(), left);
PADDLE_ENFORCE_EQ(Var->numel(), left);
PADDLE_ENFORCE_EQ(Scale->numel(), right);
PADDLE_ENFORCE_EQ(Bias->numel(), right);
CHECK_EQ(Mean->numel(), left);
CHECK_EQ(Var->numel(), left);
CHECK_EQ(Scale->numel(), right);
CHECK_EQ(Bias->numel(), right);
auto ker = paddle::lite::jit::KernelFuncs<jit::LayerNormTuple<T>,
lite::fluid::CPUPlace>::Cache()
......
......@@ -41,8 +41,8 @@ class SGDCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto *param_out = &sgd_param.ParamOut->raw_tensor();
auto sz = param_out->numel();
PADDLE_ENFORCE_EQ(param->numel(), sz);
PADDLE_ENFORCE_EQ(grad->numel(), sz);
CHECK_EQ(param->numel(), sz);
CHECK_EQ(grad->numel(), sz);
paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->template data<T>();
......
......@@ -60,7 +60,7 @@ inline void TransCompute(const int dim,
trans6(context, in, out, axis);
break;
default:
PADDLE_THROW("Tensors with rank at most 6 are supported");
LOG(FATAL) << "Tensors with rank at most 6 are supported";
}
}
......
......@@ -9,3 +9,6 @@ lite_fbs_library(fbs_op_desc SRCS op_desc.cc FBS_DEPS framework_fbs_header)
lite_fbs_library(fbs_var_desc SRCS var_desc.cc FBS_DEPS framework_fbs_header)
lite_fbs_library(fbs_block_desc SRCS block_desc.cc FBS_DEPS framework_fbs_header)
lite_fbs_library(fbs_program_desc SRCS program_desc.cc FBS_DEPS framework_fbs_header)
lite_fbs_library(vector_view SRCS vector_view.cc FBS_DEPS framework_fbs_header)
lite_cc_test(test_vector_view SRCS vector_view_test.cc DEPS vector_view)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/model_parser/flatbuffers/vector_view.h"
namespace paddle {
namespace lite {} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <type_traits>
#include <vector>
#include "flatbuffers/flatbuffers.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace fbs {
struct Flatbuffers {};
struct Standand {};
template <typename T, typename U = void>
struct ElementTraits {
typedef T element_type;
};
template <typename T>
struct ElementTraits<T*,
typename std::enable_if<std::is_class<T>::value>::type> {
typedef flatbuffers::Offset<T> element_type;
};
template <>
struct ElementTraits<std::string, void> {
typedef flatbuffers::Offset<flatbuffers::String> element_type;
};
template <typename T, typename U>
struct VectorTraits;
template <typename T>
struct VectorTraits<T, Flatbuffers> {
typedef flatbuffers::Vector<typename ElementTraits<T>::element_type>
vector_type;
typedef typename vector_type::const_iterator const_iterator;
typedef typename const_iterator::value_type value_type;
typedef const typename const_iterator::reference const_reference;
typedef value_type subscript_return_type;
};
template <typename T>
struct VectorTraits<T, Standand> {
typedef std::vector<T> vector_type;
typedef typename vector_type::const_iterator const_iterator;
typedef typename vector_type::const_reference const_reference;
typedef const_reference subscript_return_type;
};
template <typename T, typename U = Flatbuffers>
class VectorView {
public:
typedef VectorTraits<T, U> Traits;
explicit VectorView(typename Traits::vector_type const* cvec) {
cvec_ = cvec;
}
typename Traits::subscript_return_type operator[](size_t i) const {
return cvec_->operator[](i);
}
typename Traits::const_iterator begin() const { return cvec_->begin(); }
typename Traits::const_iterator end() const { return cvec_->end(); }
size_t size() const { return cvec_->size(); }
operator std::vector<T>() {
VLOG(10) << "Copying elements out of VectorView will damage performance.";
std::vector<T> tmp;
tmp.reserve(cvec_->size());
for (auto val : *cvec_) {
tmp.push_back(val);
}
return tmp;
}
~VectorView() = default;
private:
typename Traits::vector_type const* cvec_;
};
struct FBSStrIterator {
typedef flatbuffers::VectorIterator<
flatbuffers::Offset<flatbuffers::String>,
typename flatbuffers::IndirectHelper<
flatbuffers::Offset<flatbuffers::String>>::return_type>
VI;
explicit FBSStrIterator(const VI& iter) { iter_ = iter; }
const VI& raw_iter() const { return iter_; }
bool operator==(const FBSStrIterator& other) const {
return iter_ == other.raw_iter();
}
bool operator<(const FBSStrIterator& other) const {
return iter_ < other.raw_iter();
}
bool operator!=(const FBSStrIterator& other) const {
return iter_ != other.raw_iter();
}
ptrdiff_t operator-(const FBSStrIterator& other) const {
return iter_ - other.raw_iter();
}
std::string operator*() const { return iter_.operator*()->str(); }
std::string operator->() const { return iter_.operator->()->str(); }
FBSStrIterator& operator++() {
iter_++;
return *this;
}
FBSStrIterator& operator--() {
iter_--;
return *this;
}
FBSStrIterator operator+(const size_t& offset) {
return FBSStrIterator(iter_ + offset);
}
FBSStrIterator operator-(const size_t& offset) {
return FBSStrIterator(iter_ - offset);
}
private:
VI iter_;
};
template <>
class VectorView<std::string, Flatbuffers> {
public:
typedef VectorTraits<std::string, Flatbuffers> Traits;
explicit VectorView(typename Traits::vector_type const* cvec) {
cvec_ = cvec;
}
std::string operator[](size_t i) const { return cvec_->operator[](i)->str(); }
FBSStrIterator begin() const { return FBSStrIterator(cvec_->begin()); }
FBSStrIterator end() const { return FBSStrIterator(cvec_->end()); }
size_t size() const { return cvec_->size(); }
operator std::vector<std::string>() {
VLOG(10) << "Copying elements out of VectorView will damage performance.";
std::vector<std::string> tmp;
tmp.reserve(cvec_->size());
for (auto val : *cvec_) {
tmp.push_back(val->str());
}
return tmp;
}
~VectorView() = default;
private:
typename Traits::vector_type const* cvec_;
};
} // namespace fbs
} // namespace lite
} // namespace paddle
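// A minimal standalone usage sketch for VectorView, assuming only that the
// header above is on the include path. It exercises the Standand
// (std::vector-backed) specialization of VectorTraits, which shares its
// read-only interface with the flatbuffers-backed default; `SumView` is an
// illustrative helper, not part of the library.
#include <vector>

#include "lite/model_parser/flatbuffers/vector_view.h"

int SumView(const std::vector<int>& raw) {
  paddle::lite::fbs::VectorView<int, paddle::lite::fbs::Standand> view(&raw);
  int sum = 0;
  for (size_t i = 0; i < view.size(); ++i) {
    sum += view[i];  // operator[] returns a const reference, no copy
  }
  std::vector<int> copied = view;  // explicit copy path; logged as slow
  return sum + static_cast<int>(copied.size());
}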