提交 a7855d3e 编写于 作者: H hedaoyuan

Lazy Assignment

上级 3a5d60bc
......@@ -87,6 +87,12 @@ if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
if(${CUDA_VERSION_MAJOR} GREATER 6)
if(COMPILER_SUPPORT_CXX11)
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
endif()
endif()
# TODO(yuyang18): Change it to remove std=c++11 in cuda compile.
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
if(NOT CUDNN_FOUND)
......
/**
* TensorAssign.h
*
* Author: hedaoyuan (hedaoyuan@baidu.com)
* Created on: 2016-10-08
*
* Copyright (c) Baidu.com, Inc. All Rights Reserved
*
*/
#pragma once
#include <algorithm>
#include "paddle/utils/Logging.h"
namespace paddle {
template<typename LhsType, typename RhsType, class T>
class TensorAssignOp {
public:
explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
: lhs_(lhs), rhs_(rhs) {
#ifndef __CUDA_ARCH__
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
}
INLINE void apply(const int i, const int j) {
lhs_.applyRef(i, j) = rhs_.apply(i, j);
}
INLINE void apply(const int index) {
lhs_.applyRef(index) = rhs_.apply(index);
}
INLINE size_t getWidth() const { return lhs_.getWidth(); }
INLINE size_t getHeight() const { return rhs_.getHeight(); }
INLINE bool isContiguous() const {
return lhs_.isContiguous() && rhs_.isContiguous();
}
INLINE bool useGpu() const { return lhs_.useGpu(); }
private:
TensorApply<LhsType, T> lhs_;
TensorApply<const RhsType, T> rhs_;
};
template <typename Assign, typename... AssignOp>
void AssignCpuEvaluate(int height, int width, bool isContiguous,
Assign&& assign, AssignOp&& ... args) {
if (isContiguous) {
int size = height * width;
for (int index = 0; index < size; index++) {
assign.apply(index);
__attribute__((unused)) int dummy[] = { (((args)).apply(index), 0)... };
}
} else {
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
assign.apply(i, j);
__attribute__((unused)) int dummy[] = { (((args)).apply(i, j), 0)... };
}
}
}
}
#ifdef __NVCC__
template <typename Assign, typename... AssignOp>
__global__
void AssignGpuEvaluate1(const int border, Assign assign, AssignOp ... args) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < border) {
assign.apply(idx);
__attribute__((unused)) int dummy[] = { (((args)).apply(idx), 0)... };
}
}
template <typename Assign, typename... AssignOp>
__global__
void AssignGpuEvaluate2(const int height, const int width,
Assign assign, AssignOp ... args) {
const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
assign.apply(i, j);
__attribute__((unused)) int dummy[] = { (((args)).apply(i, j), 0)... };
}
}
}
#endif
// At least one assignment expression is required
template <typename Assign, typename... AssignOp>
void AssignEvaluate(Assign&& assign, AssignOp&& ... args) {
const bool useGpu_ = assign.useGpu();
bool isContiguous_ = assign.isContiguous();
const size_t height = assign.getHeight();
const size_t width = assign.getWidth();
const int packSize = sizeof...(args);
const bool packUseGpu[] = { ((args)).useGpu()... };
const bool packIsContiguous[] = { ((args)).isContiguous()... };
const size_t packHeight[] = { ((args)).getHeight()... };
const size_t packWidth[] = { ((args)).getWidth()... };
for (int i = 0; i < packSize; i++) {
CHECK_EQ(useGpu_, packUseGpu[i]);
CHECK_EQ(height, packHeight[i]);
CHECK_EQ(width, packWidth[i]);
isContiguous_ = isContiguous_ && packIsContiguous[i];
}
if (useGpu_) {
#ifdef __NVCC__
if (isContiguous_) {
int size = height * width;
int blockSize = size <= 1024 ? size : 1024;
int gridSize = (size + 1024 - 1) / 1024;
AssignGpuEvaluate1
<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(size, assign, args...);
} else {
int blockSizeY = std::min(32, (int)height);
int blockSizeX = (32 / blockSizeY) * 32;
int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
dim3 threads(blockSizeX, blockSizeY);
dim3 grid(gridSizeX, gridSizeY);
AssignGpuEvaluate2
<<<grid, threads, 0, STREAM_DEFAULT>>>(height, width, assign, args...);
}
CHECK_SYNC("AssignEvaluate failed");
#endif
} else {
AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
}
}
} // namespace paddle
......@@ -27,14 +27,16 @@ inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
int height = lhs_.getHeight();
int width = lhs_.getWidth();
if (lhs_.isContiguous() && rhs_.isContiguous()) {
int size = lhs_.getHeight() * lhs_.getWidth();
int size = height * width;
for (int index = 0; index < size; index++) {
lhs_.applyRef(index) = rhs_.apply(index);
}
} else {
for (size_t i = 0; i < lhs_.getHeight(); i++) {
for (size_t j = 0; j < lhs_.getWidth(); j++) {
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
lhs_.applyRef(i, j) = rhs_.apply(i, j);
}
}
......
......@@ -27,6 +27,8 @@ typename ExprType2,
typename ExprType3,
class T> class TensorTernaryOp;
template<typename LhsType, typename RhsType, class T> class TensorAssignOp;
/**
* \brief Tensor base class.
*
......@@ -318,6 +320,12 @@ public:
(hppl::unary::constant<T>(p), derived());
}
template<typename ExpressionType>
TensorAssignOp<Derived, ExpressionType, T>
lazyAssign(const ExpressionType& expr) const {
return TensorAssignOp<Derived, ExpressionType, T> (derived(), expr);
}
protected:
const Derived& derived() const { return *static_cast<const Derived*>(this); }
};
......
......@@ -12,6 +12,175 @@
#include "BaseMatrix.h"
#include "TrainingAlgorithmOp.h"
#if __cplusplus > 199711L
#include "TensorAssign.h"
namespace paddle {
void sparseMomentumApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& momU,
BaseMatrix& momV,
real alpha,
real beta,
real gamma,
real tau,
real learningRate) {
auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
auto expr2 = momV.lazyAssign(
momV + (tau * alpha * gamma * learningRate) * grad);
auto expr3 = value.lazyAssign(
(tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV);
AssignEvaluate(expr1, expr2, expr3);
}
void adadeltaApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum,
BaseMatrix& accum_update,
BaseMatrix& lr,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate) {
auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
auto expr2 = lr.lazyAssign(
((accum_update + epsilon) / (accum + epsilon)).sqrt());
auto expr3 = accum_update.lazyAssign(
rou * accum_update + ((real)1 - rou) * (grad * lr).square());
auto expr4 = mom.lazyAssign(
mom * momentum - learningRate * lr * (grad + value * decayRate));
auto expr5 = value.lazyAssign(value + mom);
AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
}
void adagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum_buffer,
BaseMatrix& accum,
BaseMatrix& lr,
real epsilon,
real learningRate,
real momentum,
real decayRate) {
auto expr1 = accum.lazyAssign(accum + grad.square());
auto expr2 = lr.lazyAssign(
(accum_buffer + accum + epsilon).sqrt().reciprocal());
auto expr3 = mom.lazyAssign(
mom * momentum - learningRate * lr * (grad + value * decayRate));
auto expr4 = value.lazyAssign(value + mom);
AssignEvaluate(expr1, expr2, expr3, expr4);
}
void rmspropApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& g,
BaseMatrix& f,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
auto expr4 = mom.lazyAssign(
mom * momentum - learningRate * lr * (grad + value * decayRate));
auto expr5 = value.lazyAssign(value + mom);
if (firstTime) {
auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
} else {
auto expr1 = g.lazyAssign(
accumulatedRou * g + ((real)1 - rou) * grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
}
}
void decayedAdagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
auto expr3 = mom.lazyAssign(
mom * momentum - learningRate * lr * (grad + value * decayRate));
auto expr4 = value.lazyAssign(value + mom);
if (firstTime) {
auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4);
} else {
auto expr1 = accum.lazyAssign(
accumulatedRou * accum + ((real)1 - rou) * grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4);
}
}
void adamApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom, // firse moment
BaseMatrix& v, // second moment
real beta1,
real beta2,
real beta1_power,
real beta2_power,
real epsilon,
real learningRate) {
real alpha = learningRate *
std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
auto expr3 = value.lazyAssign(
value - (mom * alpha) / (v.sqrt() + epsilon));
AssignEvaluate(expr1, expr2, expr3);
}
void adamaxApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom, // firse moment
BaseMatrix& u, // weighted infinity norm
real beta1,
real beta2,
int64_t step,
real alpha) {
auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
auto expr2 = u.lazyAssign(
(beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
auto expr3 = value.lazyAssign(
value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
AssignEvaluate(expr1, expr2, expr3);
}
} // namespace paddle
#else
namespace paddle {
void sparseMomentumApply(BaseMatrix& value,
......@@ -180,3 +349,6 @@ void adamaxApply(BaseMatrix& value,
}
} // namespace paddle
#endif
......@@ -15,13 +15,16 @@ add_simple_unittest(test_perturbation)
add_simple_unittest(test_CpuGpuVector)
add_simple_unittest(test_Allocator)
if(WITH_GPU)
CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
link_paddle_test(test_Tensor)
if(COMPILER_SUPPORT_CXX11)
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
link_paddle_test(test_Tensor)
CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
link_paddle_test(test_lazyAssign)
endif()
else()
compile_cu_as_cpp(test_Tensor.cu)
add_unittest(test_Tensor test_Tensor.cu)
compile_cu_as_cpp(test_lazyAssign.cu)
add_unittest(test_lazyAssign test_lazyAssign.cu)
endif(WITH_GPU)
/**
* test_Tensor.cpp
*
* Author: hedaoyuan (hedaoyuan@baidu.com)
* Created on: 2016-06-06
*
* Copyright (c) Baidu.com, Inc. All Rights Reserved
*/
#include <gtest/gtest.h>
#include "paddle/math/Matrix.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
template<typename Tensor>
extern void TensorCheckEqual(const Tensor& tensor1, const Tensor& tensor2);
void TensorCheckEqual(const CpuMatrix& matrix1, const CpuMatrix& matrix2) {
CHECK(matrix1.getHeight() == matrix2.getHeight());
CHECK(matrix1.getWidth() == matrix2.getWidth());
int height = matrix1.getHeight();
int width = matrix1.getWidth();
const real* data1 = matrix1.getData();
const real* data2 = matrix2.getData();
int count = 0;
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
if (data1[i * width + j] != data2[i * width + j]) {
count++;
}
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different element.";
}
void TensorCheckEqual(const GpuMatrix& matrix1, const GpuMatrix& matrix2) {
CpuMatrix cpu1(matrix1.getHeight(), matrix1.getWidth());
CpuMatrix cpu2(matrix2.getHeight(), matrix2.getWidth());
cpu1.copyFrom(matrix1);
cpu2.copyFrom(matrix2);
TensorCheckEqual(cpu1, cpu2);
}
void TensorCheckErr(const CpuMatrix& matrix1, const CpuMatrix& matrix2) {
CHECK(matrix1.getHeight() == matrix2.getHeight());
CHECK(matrix1.getWidth() == matrix2.getWidth());
#ifndef PADDLE_TYPE_DOUBLE
real err = 1e-5;
#else
real err = 1e-10;
#endif
int height = matrix1.getHeight();
int width = matrix1.getWidth();
const real* data1 = matrix1.getData();
const real* data2 = matrix2.getData();
int count = 0;
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
real a = data1[i * width + j];
real b = data2[i * width + j];
if (fabs(a - b) > err) {
if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
count++;
}
}
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different element.";
}
void TensorCheckErr(const GpuMatrix& matrix1, const GpuMatrix& matrix2) {
CpuMatrix cpu1(matrix1.getHeight(), matrix1.getWidth());
CpuMatrix cpu2(matrix2.getHeight(), matrix2.getWidth());
cpu1.copyFrom(matrix1);
cpu2.copyFrom(matrix2);
TensorCheckErr(cpu1, cpu2);
}
template<class T>
void TensorCheckEqual(const CpuVectorT<T>& vector1,
const CpuVectorT<T>& vector2) {
CHECK(vector1.getSize() == vector2.getSize());
const T* data1 = vector1.getData();
const T* data2 = vector2.getData();
size_t size = vector1.getSize();
int count = 0;
for (size_t i = 0; i < size; i++) {
if (data1[i] != data2[i]) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different element.";
}
template<class T>
void TensorCheckEqual(const GpuVectorT<T>& vector1,
const GpuVectorT<T>& vector2) {
CpuVectorT<T> cpu1(vector1.getSize());
CpuVectorT<T> cpu2(vector2.getSize());
cpu1.copyFrom(vector1);
cpu2.copyFrom(vector2);
TensorCheckEqual(cpu1, cpu2);
}
int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
CHECK(vector1.getSize() == vector2.getSize());
const real* data1 = vector1.getData();
const real* data2 = vector2.getData();
size_t size = vector1.getSize();
int count = 0;
for (size_t i = 0; i < size; i++) {
real a = data1[i];
real b = data2[i];
if (fabs(a - b) > FLAGS_max_diff) {
if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) {
count++;
}
}
}
return count;
}
#define INIT_UNARY(A1, A2) \
Tensor A1(height, width); \
Tensor A2(height, width); \
A1.randomizeUniform(); \
A2.copyFrom(A1)
#define INIT_BINARY(A1, A2, B) \
INIT_UNARY(A1, A2); \
Tensor B(height, width); \
B.randomizeUniform()
#define INIT_TERNARY(A1, A2, B, C) \
INIT_BINARY(A1, A2, B); \
Tensor C(height, width); \
C.randomizeUniform()
#define INIT_QUATERNARY(A1, A2, B, C, D) \
INIT_TERNARY(A1, A2, B, C); \
Tensor D(height, width); \
D.randomizeUniform()
// Performance Check
#ifdef PADDLE_DISABLE_TIMER
#define CHECK_VECTORPTR(vector1, vector2) \
EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
#define EXPRESSION_PERFORMANCE(expression) \
expression;
#else
#include "paddle/utils/Stat.h"
#define CHECK_VECTORPTR(vector1, vector2)
#define EXPRESSION_PERFORMANCE(expression) \
do {\
char expr[30];\
strncpy(expr, #expression, 30);\
if (expr[29] != '\0') {\
expr[27] = '.'; expr[28] = '.'; expr[29] = '\0';\
}\
expression;\
for (int i = 0; i < 20; i++) {\
REGISTER_TIMER(expr);\
expression;\
}\
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')\
<< *globalStat.getStat(expr);\
globalStat.reset();\
} while (0)
#endif
......@@ -11,6 +11,7 @@
#include "paddle/utils/Util.h"
#include "paddle/math/TrainingAlgorithmOp.h"
#include "OriginalOptimizerApi.h"
#include "TensorCheck.h"
using namespace paddle; // NOLINT
......@@ -33,26 +34,6 @@ private:
double max_diff_;
};
int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
CHECK(vector1.getSize() == vector2.getSize());
const real* data1 = vector1.getData();
const real* data2 = vector2.getData();
size_t size = vector1.getSize();
int count = 0;
for (size_t i = 0; i < size; i++) {
real a = data1[i];
real b = data2[i];
if (fabs(a - b) > FLAGS_max_diff) {
if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) {
count++;
}
}
}
return count;
}
#define COPY_VECTOR_TO_CPU(cpuVec, vector) \
do {\
if (vector->useGpu()) {\
......@@ -71,39 +52,6 @@ int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
return VectorCheckErr(*tmp1, *tmp2);
}
#ifdef PADDLE_DISABLE_TIMER
#define CHECK_VECTORPTR(vector1, vector2) \
EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
#define EXPRESSION_PERFORMANCE(expression) \
expression;
#else
#include "paddle/utils/Stat.h"
#define CHECK_VECTORPTR(vector1, vector2)
#define EXPRESSION_PERFORMANCE(expression) \
do {\
char expr[30];\
strncpy(expr, #expression, 30);\
if (expr[29] != '\0') {\
expr[27] = '.'; expr[28] = '.'; expr[29] = '\0';\
}\
expression;\
for (int i = 0; i < 20; i++) {\
REGISTER_TIMER(expr);\
expression;\
}\
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')\
<< *globalStat.getStat(expr);\
globalStat.reset();\
} while (0)
#endif
typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
void testCase(testMatrixFunc matrixFunc) {
......
/**
* test_lazyAssign.cpp
*
* Author: hedaoyuan (hedaoyuan@baidu.com)
* Created on: 2016-10-15
*
* Copyright (c) Baidu.com, Inc. All Rights Reserved
*/
#include <gtest/gtest.h>
#include "paddle/math/Matrix.h"
#include "paddle/math/TensorAssign.h"
#include "TensorCheck.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
typedef std::function<void(int height, int width)> testMatrixFunc;
void testMatrixCase(testMatrixFunc matrixFunc) {
for (auto height : {1}) {
for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072,
262144, 524288, 1048576, 2097152, 4194304, 8388608}) {
matrixFunc(height, width);
}
}
}
template<typename Tensor>
void testLazyAssign(int height, int width) {
INIT_QUATERNARY(A1, A2, B, C, D);
EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
EXPRESSION_PERFORMANCE(
auto expr1 = A2.lazyAssign(B + C);
auto expr2 = A2.lazyAssign(A2 * D);
AssignEvaluate(expr1, expr2););
TensorCheckErr(A1, A2);
}
TEST(lazyAssign, CPU) {
testMatrixCase(testLazyAssign<CpuMatrix>);
}
#ifndef PADDLE_ONLY_CPU
TEST(lazyAssign, GPU) {
testMatrixCase(testLazyAssign<GpuMatrix>);
}
#endif
template<typename Tensor>
void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D,
real p1, real p2, real p3) {
C = C * p2 - D * (B + A * p3) * p1;
A += C;
}
void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B,
BaseMatrix& C, BaseMatrix& D,
real p1, real p2, real p3) {
auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
auto expr2 = A.lazyAssign(A + C);
AssignEvaluate(expr1, expr2);
}
template<typename Tensor>
void testSgdUpdate(int height, int width) {
Tensor A1(height, width);
Tensor A2(height, width);
Tensor A3(height, width);
A1.randomizeUniform();
A2.copyFrom(A1);
A3.copyFrom(A1);
Tensor B(height, width);
B.randomizeUniform();
Tensor C1(height, width);
Tensor C2(height, width);
Tensor C3(height, width);
C1.randomizeUniform();
C2.copyFrom(C1);
C3.copyFrom(C1);
Tensor D(height, width);
D.randomizeUniform();
real p1 = 0.2;
real p2 = 0.3;
real p3 = 0.5;
/**
* c = p2 * c - p1 * (b + p3 * a);
* a = a + c;
*/
// BaseMatrix API
EXPRESSION_PERFORMANCE(
A1.sgdUpdate(B, C1, D, p1, p2, p3););
// Tensor expression
EXPRESSION_PERFORMANCE(
sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
// lazyAssign
EXPRESSION_PERFORMANCE(
sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
TensorCheckErr(A1, A2);
TensorCheckErr(A1, A3);
TensorCheckErr(C1, C2);
TensorCheckErr(C1, C3);
}
TEST(sgdUpdate, CPU) {
testMatrixCase(testSgdUpdate<CpuMatrix>);
}
#ifndef PADDLE_ONLY_CPU
TEST(sgdUpdate, GPU) {
testMatrixCase(testSgdUpdate<GpuMatrix>);
}
#endif
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
hl_start();
hl_init(0);
return RUN_ALL_TESTS();
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册