checksum_kernel_union4.mlu 2.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
/**
 * \file dnn/src/cambricon/checksum/checksum_kernel_union4.mlu
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "checksum.mlu.h"
#include "cnsccl.h"
#include "mlu.h"

// Length of the partial_sum_recv array in the kernel below, and the number of
// entries summed in its final reduction.
#define CLUSTER_DIM 4
// Length of the partial_sum_send array in the kernel below, and the number of
// per-core entries that core 0 folds together.
#define CORE_DIM 4
// Number of uint32_t elements in the kernel's NRAM staging buffer; also the
// chunk size each task processes per loop iteration.
#define STRIDE 1024
__mlu_entry__ void checksum_kernel_union4(uint32_t* dst, uint32_t* src,
                                          int nr_elems) {
    __nram__ uint32_t sum = 0;
    __nram__ uint32_t val[STRIDE];
    __mlu_shared__ uint32_t partial_sum_send[CORE_DIM];
    __mlu_shared__ uint32_t partial_sum_recv[CLUSTER_DIM];

    int task_stride = STRIDE;
    int start_offset = taskId * task_stride;
    int global_stride = taskDim * task_stride;

    for (int task_offset = start_offset; task_offset < nr_elems;
         task_offset += global_stride) {
        int end_offset = task_offset + task_stride;
        end_offset = end_offset > nr_elems ? nr_elems : end_offset;
        int copy_elems = end_offset - task_offset;
        __memcpy(val, src + task_offset, copy_elems * sizeof(uint32_t),
                 GDRAM2NRAM);
        for (int i = 0; i < copy_elems; i++) {
            sum = sum + val[i] * (task_offset + i + 1);
        }
    }

    partial_sum_send[coreId] = sum;

    __sync_cluster();

    if (coreId == 0) {
        for (int i = 1; i < CORE_DIM; ++i) {
            partial_sum_send[0] += partial_sum_send[i];
        }
    }

    __sync_all();
    cnscclGather((void*)&partial_sum_send, (void*)&partial_sum_recv, 1,
                 cnscclInt, 0);

    __sync_all();

    if (clusterId == 0 && coreId == 0) {
        uint32_t res = 0;
        for (int i = 0; i < CLUSTER_DIM; ++i) {
            res += partial_sum_recv[i];
        }
        dst[0] = res;
    }
}
// Remove the kernel-local constants so they are not defined past this point.
#undef CLUSTER_DIM
#undef CORE_DIM
#undef STRIDE

// vim: ft=cpp syntax=cpp.doxygen