Commit 903d5c7e authored by He, committed by Yu Yang

bug fix for hl_matrix_classification_error

Parent fdd40e55
@@ -266,25 +266,21 @@ template<int blockSize>
 __global__ void KeMatrixClassificationError(real* in_A,
                                             int* in_B,
                                             real* out_C,
-                                            int dimM,
                                             int dimN) {
   __shared__ real max_s[blockSize];
   __shared__ int max_l[blockSize];
-  int cnt = (dimN + blockSize -1) / blockSize;
-  int tid = threadIdx.x;
-  int lmt = tid;
-  int index = 0;
-  real t;
+  const int tid = threadIdx.x;
+  const int rowId = blockIdx.x;
+
   max_s[tid] = -1e30f;
-  for (int ii = 0; ii < cnt && lmt < dimN; ii++) {
-    index = blockIdx.y*dimN + lmt;
-    t = in_A[index];
-    if (max_s[tid] < t) {
-      max_s[tid] = t;
-      max_l[tid] = lmt;
+  in_A += rowId * dimN;
+  real tmp;
+  for (int colId = tid; colId < dimN; colId += blockSize) {
+    tmp = in_A[colId];
+    if (max_s[tid] < tmp) {
+      max_s[tid] = tmp;
+      max_l[tid] = colId;
     }
-    lmt += blockSize;
   }
   __syncthreads();
@@ -300,7 +296,7 @@ __global__ void KeMatrixClassificationError(real* in_A,
   __syncthreads();
   if (tid == 0) {
-    out_C[blockIdx.y] = (max_l[0] == in_B[blockIdx.y] ? 0 : 1.0f);
+    out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
   }
 }
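Note on the rewrite: the new kernel assigns one thread block per row (sample); each thread scans a strided slice of the row, and a shared-memory tree reduction then combines the per-thread maxima. That reduction sits between the two hunks above and is unchanged by this commit, so it does not appear in the diff. Below is a minimal, self-contained sketch of the full pattern; the kernel name, standalone signature, and float element type are illustrative, not PaddlePaddle API, and blockSize is assumed to be a power of two (the real launch uses 1024).

// Sketch of the one-block-per-row argmax pattern used by the new kernel.
template <int blockSize>
__global__ void rowArgMaxSketch(const float* in, int* argMax, int dimN) {
  __shared__ float max_s[blockSize];
  __shared__ int max_l[blockSize];
  const int tid = threadIdx.x;
  const int rowId = blockIdx.x;

  // Phase 1: each thread keeps the best value seen in its strided slice.
  max_s[tid] = -1e30f;
  max_l[tid] = 0;
  in += rowId * dimN;
  for (int colId = tid; colId < dimN; colId += blockSize) {
    if (max_s[tid] < in[colId]) {
      max_s[tid] = in[colId];
      max_l[tid] = colId;
    }
  }
  __syncthreads();

  // Phase 2: tree reduction in shared memory; thread 0 ends up holding
  // the row maximum and its column index.
  for (int stride = blockSize / 2; stride > 0; stride >>= 1) {
    if (tid < stride && max_s[tid + stride] > max_s[tid]) {
      max_s[tid] = max_s[tid + stride];
      max_l[tid] = max_l[tid + stride];
    }
    __syncthreads();
  }
  if (tid == 0) argMax[rowId] = max_l[0];
}

A launch like rowArgMaxSketch<1024><<<numRows, 1024>>>(in, argMax, dimN) mirrors the fixed host-side call in the next hunk.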
@@ -313,12 +309,9 @@ void hl_matrix_classification_error(real* A_d,
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);
-  int blocksX = 1;
-  int blocksY = dimM;
-  dim3 threads(1024, 1);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixClassificationError<1024><<< grid, threads, 0, STREAM_DEFAULT >>>
-           (A_d, B_d, C_d, dimM, dimN);
+  // each sample is calculated by one block
+  KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
+           (A_d, B_d, C_d, dimN);
   CHECK_SYNC("hl_matrix_classification_error");
 }
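Why the launch changed (a reviewer's reading; the commit message says only "bug fix"): CUDA caps gridDim.y at 65535 on all hardware, so the old dim3 grid(1, dimM) launch fails with "invalid configuration argument" whenever a batch has more than 65535 samples. Putting the sample index on blockIdx.x instead allows up to 2^31 - 1 blocks on compute capability 3.0+, and the new 70000-sample test below exercises exactly that regime. A tiny standalone program to observe the difference (assumes a CC >= 3.0 device):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void noop() {}

int main() {
  noop<<<dim3(1, 70000), 1024>>>();  // mirrors the old blocksY = dimM launch
  printf("2-D grid: %s\n", cudaGetErrorString(cudaGetLastError()));
  noop<<<70000, 1024>>>();           // 1-D grid, as in the fixed launch
  printf("1-D grid: %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}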
...
@@ -1697,7 +1697,6 @@ TEST(Matrix, cosSimDerivate) {
   }
 }

-
 void testParamReluForward(int height, int width, int w_height,
                           int w_width) {
   MatrixPtr output = CpuMatrix::create(height, width, false, false);
@@ -1736,7 +1735,6 @@ TEST(Matrix, paramReluForward) {
   }
 }

-
 void testParamReluBackwardW(int height, int width, int w_height,
                             int w_width) {
   MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
@@ -1775,7 +1773,6 @@ TEST(Matrix, paramReluBackwardW) {
   }
 }

-
 void testParamReluBackwardDiff(int height, int width, int w_height,
                                int w_width) {
   MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
@@ -1819,6 +1816,36 @@ TEST(Matrix, paramReluBackwardDiff) {
   }
 }

+void testClassificationError(int numSamples, int dim) {
+  MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
+  MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
+  IVectorPtr cpuLabel = std::make_shared<CpuIVector>(numSamples);
+  IVectorPtr gpuLabel = std::make_shared<GpuIVector>(numSamples);
+
+  cpuOutput->randomizeUniform();
+  cpuLabel->rand(dim);
+  gpuOutput->copyFrom(*cpuOutput);
+  gpuLabel->copyFrom(*cpuLabel);
+
+  cpuError->classificationError(cpuOutput, cpuLabel);
+  gpuError->classificationError(gpuOutput, gpuLabel);
+
+  MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, 1);
+  check->copyFrom(*gpuError);
+  MatrixCheckEqual(*cpuError, *check);
+}
+
+TEST(Matrix, classificationError) {
+  for (auto numSamples : {1, 10, 100, 1000, 70000}) {
+    for (auto dim : {1, 10, 100, 1000}) {
+      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
+      testClassificationError(numSamples, dim);
+    }
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
...
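For reference, the semantics the new test verifies, written as a plain CPU loop: per sample, take the argmax over the dim outputs and emit 0 if it matches the label, else 1. This is a hedged sketch of what the kernel above implies, not the actual CpuMatrix::classificationError implementation; note that ties here keep the lowest column index, which may differ from the GPU reduction's tie-breaking (the test's uniform random inputs make ties unlikely).

#include <cstdio>
#include <vector>

using real = float;  // PaddlePaddle's `real` defaults to float

// Hypothetical reference routine, not PaddlePaddle API.
void classificationErrorRef(const std::vector<real>& output,  // numSamples x dim, row-major
                            const std::vector<int>& label,    // numSamples
                            std::vector<real>& error,         // numSamples
                            int numSamples, int dim) {
  for (int i = 0; i < numSamples; ++i) {
    int argMax = 0;
    for (int j = 1; j < dim; ++j) {
      if (output[i * dim + j] > output[i * dim + argMax]) argMax = j;
    }
    error[i] = (argMax == label[i]) ? 0.0f : 1.0f;
  }
}

int main() {
  std::vector<real> output = {0.1f, 0.7f, 0.2f,   // sample 0: argmax = 1
                              0.9f, 0.0f, 0.1f};  // sample 1: argmax = 0
  std::vector<int> label = {1, 2};
  std::vector<real> error(2);
  classificationErrorRef(output, label, error, 2, 3);
  printf("%.0f %.0f\n", error[0], error[1]);  // prints: 0 1
  return 0;
}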