未验证 提交 ea42e431 编写于 作者: Q qingqing01 提交者: GitHub

Speed unit testing. (#16978)

* Speed affine_channel_op unit testing
* Add check in tensor_py
* Fix ONLY_CPU Compiling
上级 ae7a2cb8
......@@ -65,6 +65,9 @@ class AffineChannelCUDAKernel : public framework::OpKernel<T> {
int block = 1024;
int grid = (num + block - 1) / block;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
grid = std::min(std::max(max_threads / block, 1), grid);
if (layout == framework::DataLayout::kNCHW) {
KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
true><<<grid, block, 0, dev_ctx.stream()>>>(
......@@ -162,7 +165,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
}
} else {
if (dx) {
KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
KeAffineChannelCUDA<T, framework::DataLayout::kNHWC,
false><<<grid1, block, 0, dev_ctx.stream()>>>(
dy_d, s_d, nullptr, C, HxW, num, dx_d);
}
......
......@@ -138,25 +138,33 @@ inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
// Read a single element of `self` at flat index `offset` and return it by
// value.  CPU tensors are indexed directly; in CUDA builds a GPU-resident
// element is copied device->host with a single sizeof(T) memory::Copy,
// avoiding the old full-tensor TensorCopySync round trip.
//
// NOTE(review): the copy passes a nullptr stream — presumably that selects a
// synchronous copy; confirm against paddle::memory::Copy's contract.
// NOTE(review): in a CPU-only build a non-CPU place silently falls through
// and returns the zero-initialized `b`; consider enforcing instead.
template <typename T>
T TensorGetElement(const framework::Tensor &self, size_t offset) {
  PADDLE_ENFORCE_LT(offset, self.numel());
  T b = static_cast<T>(0);
  if (platform::is_cpu_place(self.place())) {
    b = self.data<T>()[offset];
#ifdef PADDLE_WITH_CUDA
  } else {
    const T *a = self.data<T>();
    auto p = boost::get<platform::CUDAPlace>(self.place());
    paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
                         nullptr);
#endif
  }
  return b;
}
// Write `elem` into `self` at flat index `offset`.  CPU tensors are written
// in place; in CUDA builds a single sizeof(T) host->device memory::Copy is
// issued, replacing the old copy-whole-tensor-to-CPU / modify / copy-back
// round trip (the redundant allocate-and-free the old TODO complained about).
//
// NOTE(review): nullptr stream — presumably a synchronous copy; confirm
// against paddle::memory::Copy's contract.
// NOTE(review): in a CPU-only build a non-CPU place silently does nothing;
// consider enforcing instead.
template <typename T>
void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
  PADDLE_ENFORCE_LT(offset, self->numel());
  if (platform::is_cpu_place(self->place())) {
    self->mutable_data<T>(self->place())[offset] = elem;
#ifdef PADDLE_WITH_CUDA
  } else {
    auto p = boost::get<platform::CUDAPlace>(self->place());
    T *a = self->mutable_data<T>(p);
    paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
                         nullptr);
#endif
  }
}
......
......@@ -70,6 +70,12 @@ class TestAffineChannelNHWC(TestAffineChannelOp):
self.C = 32
self.layout = 'NHWC'
def test_check_grad_stopgrad_dx(self):
return
def test_check_grad_stopgrad_dscale_dbias(self):
return
class TestAffineChannel2D(TestAffineChannelOp):
def init_test_case(self):
......@@ -77,10 +83,16 @@ class TestAffineChannel2D(TestAffineChannelOp):
self.C = 64
self.layout = 'NCHW'
def test_check_grad_stopgrad_dx(self):
return
def test_check_grad_stopgrad_dscale_dbias(self):
return
class TestAffineChannelNCHWLargeShape(TestAffineChannelOp):
def init_test_case(self):
    # Configure the NCHW "large shape" case.  The batch dimension was
    # shrunk (64 -> 4) to speed up the unit test; the stale duplicate
    # `self.shape = [64, 128, 112, 112]` diff-residue line is removed —
    # only the post-commit shape is kept.
    self.shape = [4, 128, 112, 112]
    self.C = 128
    self.layout = 'NCHW'
......@@ -95,9 +107,9 @@ class TestAffineChannelNCHWLargeShape(TestAffineChannelOp):
pass
class TestAffineChannelNHWCLargeShape(TestAffineChannelNCHWLargeShape):
    # NHWC variant of the large-shape test.  The scrape left two class
    # headers (the old version reused — and shadowed — the NCHW class name)
    # and two shape lines; only the renamed class with the reduced
    # spatial size (112x112 -> 32x32, for test speed) is kept.
    def init_test_case(self):
        self.shape = [64, 32, 32, 512]
        self.C = 512
        self.layout = 'NHWC'
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册