Unverified commit ea42e431, authored by qingqing01, committed via GitHub.

Speed up unit testing. (#16978)

* Speed up the affine_channel_op unit test
* Add a bounds check in tensor_py
* Fix CPU-only compilation

Parent commit: ae7a2cb8
...@@ -65,6 +65,9 @@ class AffineChannelCUDAKernel : public framework::OpKernel<T> { ...@@ -65,6 +65,9 @@ class AffineChannelCUDAKernel : public framework::OpKernel<T> {
int block = 1024; int block = 1024;
int grid = (num + block - 1) / block; int grid = (num + block - 1) / block;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
grid = std::min(std::max(max_threads / block, 1), grid);
if (layout == framework::DataLayout::kNCHW) { if (layout == framework::DataLayout::kNCHW) {
KeAffineChannelCUDA<T, framework::DataLayout::kNCHW, KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
true><<<grid, block, 0, dev_ctx.stream()>>>( true><<<grid, block, 0, dev_ctx.stream()>>>(
...@@ -162,7 +165,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> { ...@@ -162,7 +165,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
} }
} else { } else {
if (dx) { if (dx) {
KeAffineChannelCUDA<T, framework::DataLayout::kNCHW, KeAffineChannelCUDA<T, framework::DataLayout::kNHWC,
false><<<grid1, block, 0, dev_ctx.stream()>>>( false><<<grid1, block, 0, dev_ctx.stream()>>>(
dy_d, s_d, nullptr, C, HxW, num, dx_d); dy_d, s_d, nullptr, C, HxW, num, dx_d);
} }
......
...@@ -138,25 +138,33 @@ inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { ...@@ -138,25 +138,33 @@ inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
template <typename T> template <typename T>
T TensorGetElement(const framework::Tensor &self, size_t offset) { T TensorGetElement(const framework::Tensor &self, size_t offset) {
PADDLE_ENFORCE_LT(offset, self.numel());
T b = static_cast<T>(0);
if (platform::is_cpu_place(self.place())) { if (platform::is_cpu_place(self.place())) {
return self.data<T>()[offset]; b = self.data<T>()[offset];
#ifdef PADDLE_WITH_CUDA
} else { } else {
std::shared_ptr<framework::Tensor> dst(new framework::Tensor); const T *a = self.data<T>();
framework::TensorCopySync(self, platform::CPUPlace(), dst.get()); auto p = boost::get<platform::CUDAPlace>(self.place());
return dst->data<T>()[offset]; paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
nullptr);
#endif
} }
return b;
} }
// TODO(dzhwinter) : fix the redundant Tensor allocate and free
template <typename T> template <typename T>
void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
if (platform::is_gpu_place(self->place())) { PADDLE_ENFORCE_LT(offset, self->numel());
framework::Tensor dst; if (platform::is_cpu_place(self->place())) {
framework::TensorCopySync(*self, platform::CPUPlace(), &dst);
dst.mutable_data<T>(platform::CPUPlace())[offset] = elem;
framework::TensorCopySync(dst, self->place(), self);
} else if (platform::is_cpu_place(self->place())) {
self->mutable_data<T>(self->place())[offset] = elem; self->mutable_data<T>(self->place())[offset] = elem;
#ifdef PADDLE_WITH_CUDA
} else {
auto p = boost::get<platform::CUDAPlace>(self->place());
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
nullptr);
#endif
} }
} }
......
...@@ -70,6 +70,12 @@ class TestAffineChannelNHWC(TestAffineChannelOp): ...@@ -70,6 +70,12 @@ class TestAffineChannelNHWC(TestAffineChannelOp):
self.C = 32 self.C = 32
self.layout = 'NHWC' self.layout = 'NHWC'
def test_check_grad_stopgrad_dx(self):
    # Intentional no-op: disables the inherited stop-gradient-on-x
    # gradient check for this configuration — presumably to keep the
    # unit test fast, per the commit's stated intent; TODO confirm.
    return
def test_check_grad_stopgrad_dscale_dbias(self):
    # Intentional no-op: disables the inherited stop-gradient-on-
    # scale/bias gradient check for this configuration — presumably to
    # keep the unit test fast, per the commit's stated intent; TODO confirm.
    return
class TestAffineChannel2D(TestAffineChannelOp): class TestAffineChannel2D(TestAffineChannelOp):
def init_test_case(self): def init_test_case(self):
...@@ -77,10 +83,16 @@ class TestAffineChannel2D(TestAffineChannelOp): ...@@ -77,10 +83,16 @@ class TestAffineChannel2D(TestAffineChannelOp):
self.C = 64 self.C = 64
self.layout = 'NCHW' self.layout = 'NCHW'
def test_check_grad_stopgrad_dx(self):
    # Intentional no-op: disables the inherited stop-gradient-on-x
    # gradient check for this configuration — presumably to keep the
    # unit test fast, per the commit's stated intent; TODO confirm.
    return
def test_check_grad_stopgrad_dscale_dbias(self):
    # Intentional no-op: disables the inherited stop-gradient-on-
    # scale/bias gradient check for this configuration — presumably to
    # keep the unit test fast, per the commit's stated intent; TODO confirm.
    return
class TestAffineChannelNCHWLargeShape(TestAffineChannelOp): class TestAffineChannelNCHWLargeShape(TestAffineChannelOp):
def init_test_case(self): def init_test_case(self):
self.shape = [64, 128, 112, 112] self.shape = [4, 128, 112, 112]
self.C = 128 self.C = 128
self.layout = 'NCHW' self.layout = 'NCHW'
...@@ -95,9 +107,9 @@ class TestAffineChannelNCHWLargeShape(TestAffineChannelOp): ...@@ -95,9 +107,9 @@ class TestAffineChannelNCHWLargeShape(TestAffineChannelOp):
pass pass
class TestAffineChannelNCHWLargeShape(TestAffineChannelNCHWLargeShape): class TestAffineChannelNHWCLargeShape(TestAffineChannelNCHWLargeShape):
def init_test_case(self): def init_test_case(self):
self.shape = [64, 112, 112, 512] self.shape = [64, 32, 32, 512]
self.C = 512 self.C = 512
self.layout = 'NHWC' self.layout = 'NHWC'
......
Markdown is supported.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register or sign in.