提交 5de406c0 编写于 作者: O Ofir Bitton 提交者: Oded Gabbay

habanalabs: sync stream collective support

Implement sync stream collective for GAUDI. Need to allocate additional
resources for that and add ctx_fini() to clean up those resources.
Signed-off-by: NOfir Bitton <obitton@habana.ai>
Reviewed-by: NOded Gabbay <ogabbay@kernel.org>
Signed-off-by: NOded Gabbay <ogabbay@kernel.org>
上级 0940caba
......@@ -142,7 +142,7 @@ static void hl_fence_init(struct hl_fence *fence)
init_completion(&fence->completion);
}
static void cs_get(struct hl_cs *cs)
void cs_get(struct hl_cs *cs)
{
kref_get(&cs->refcount);
}
......@@ -917,6 +917,9 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
job->job_cb_size = job->user_cb_size;
hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
/* increment refcount as for external queues we get completion */
cs_get(cs);
cs->jobs_in_queue_cnt[job->hw_queue_id]++;
list_add_tail(&job->cs_node, &cs->job_list);
......@@ -1070,11 +1073,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
cs, q_idx, collective_engine_id);
if (rc)
goto put_cs;
/* increment refcount as for external queues we get completion */
cs_get(cs);
goto free_cs_object;
rc = hl_hw_queue_schedule_cs(cs);
if (rc) {
......
......@@ -40,6 +40,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
if ((hdev->in_debug) && (hdev->compute_ctx == ctx))
hl_device_set_debug_mode(hdev, false);
hdev->asic_funcs->ctx_fini(ctx);
hl_cb_va_pool_fini(ctx);
hl_vm_ctx_fini(ctx);
hl_asid_free(hdev, ctx->asid);
......
......@@ -65,8 +65,8 @@
* HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream
* HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream
*/
#define HL_RSVD_SOBS 4
#define HL_RSVD_MONS 2
#define HL_RSVD_SOBS 2
#define HL_RSVD_MONS 1
/*
* HL_COLLECTIVE_RSVD_MSTR_MONS 'collective' reserved monitors per QMAN stream
......@@ -785,6 +785,7 @@ enum div_select_defs {
* @wreg: Write a register. Needed for simulator support.
* @halt_coresight: stop the ETF and ETR traces.
* @ctx_init: context dependent initialization.
* @ctx_fini: context dependent cleanup.
* @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz
* @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index.
* @read_device_fw_version: read the device's firmware versions that are
......@@ -891,6 +892,7 @@ struct hl_asic_funcs {
void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
void (*halt_coresight)(struct hl_device *hdev);
int (*ctx_init)(struct hl_ctx *ctx);
void (*ctx_fini)(struct hl_ctx *ctx);
int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx);
void (*read_device_fw_version)(struct hl_device *hdev,
......@@ -1992,6 +1994,7 @@ void hl_sob_reset_error(struct kref *ref);
int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask);
void hl_fence_put(struct hl_fence *fence);
void hl_fence_get(struct hl_fence *fence);
void cs_get(struct hl_cs *cs);
void goya_set_asic_funcs(struct hl_device *hdev);
void gaudi_set_asic_funcs(struct hl_device *hdev);
......
......@@ -23,6 +23,10 @@
NUMBER_OF_CPU_HW_QUEUES + \
NUMBER_OF_INT_HW_QUEUES)
/* 10 NIC QMANs, DMA5 QMAN, TPC7 QMAN */
#define NUMBER_OF_COLLECTIVE_QUEUES 12
#define NUMBER_OF_SOBS_IN_GRP 11
/*
* Number of MSI interrupts IDS:
* Each completion queue has 1 ID
......@@ -149,10 +153,14 @@
/* Virtual address space */
#define VA_HOST_SPACE_START 0x1000000000000ull /* 256TB */
#define VA_HOST_SPACE_END 0x3FF8000000000ull /* 1PB - 1TB */
#define VA_HOST_SPACE_END 0x3FF7FFFE00000ull /* 1PB - 1TB */
#define VA_HOST_SPACE_SIZE (VA_HOST_SPACE_END - \
VA_HOST_SPACE_START) /* 767TB */
#define VA_HOST_SPACE_INTERNAL_CB_START 0x3FF7FFFE00000ull /* 1PB - 1TB - 2MB */
#define VA_HOST_SPACE_INTERNAL_CB_END 0x3FF8000000000ull /* 1PB - 1TB */
#define HOST_SPACE_INTERNAL_CB_SZ SZ_2M
#define HW_CAP_PLL BIT(0)
#define HW_CAP_HBM BIT(1)
#define HW_CAP_MMU BIT(2)
......@@ -240,6 +248,34 @@ enum gaudi_nic_mask {
GAUDI_NIC_MASK_ALL = 0x3FF
};
/*
* struct gaudi_hw_sob_group - H/W SOB group info.
* @hdev: habanalabs device structure.
* @kref: refcount of this SOB group. group will reset once refcount is zero.
* @base_sob_id: base sob id of this SOB group.
*/
struct gaudi_hw_sob_group {
struct hl_device *hdev;
struct kref kref;
u32 base_sob_id;
};
#define NUM_SOB_GROUPS (HL_RSVD_SOBS * QMAN_STREAMS)
/**
* struct gaudi_collective_properties -
* holds all SOB groups and queues info reserved for the collective
* @hw_sob_group: H/W SOB groups.
* @next_sob_group_val: the next value to use for the currently used SOB group.
* @curr_sob_group_idx: the index of the currently used SOB group.
* @mstr_sob_mask: pre-defined masks for collective master monitors
*/
struct gaudi_collective_properties {
struct gaudi_hw_sob_group hw_sob_group[NUM_SOB_GROUPS];
u16 next_sob_group_val[QMAN_STREAMS];
u8 curr_sob_group_idx[QMAN_STREAMS];
u8 mstr_sob_mask[HL_COLLECTIVE_RSVD_MSTR_MONS];
};
/**
* struct gaudi_internal_qman_info - Internal QMAN information.
* @pq_kernel_addr: Kernel address of the PQ memory area in the host.
......@@ -285,6 +321,8 @@ struct gaudi_device {
struct gaudi_internal_qman_info internal_qmans[GAUDI_QUEUE_ID_SIZE];
struct gaudi_collective_properties collective_props;
u64 hbm_bar_cur_addr;
u64 max_freq_value;
......
......@@ -5343,6 +5343,11 @@ int goya_collective_wait_create_jobs(struct hl_device *hdev,
return -EINVAL;
}
static void goya_ctx_fini(struct hl_ctx *ctx)
{
}
static const struct hl_asic_funcs goya_funcs = {
.early_init = goya_early_init,
.early_fini = goya_early_fini,
......@@ -5404,6 +5409,7 @@ static const struct hl_asic_funcs goya_funcs = {
.wreg = hl_wreg,
.halt_coresight = goya_halt_coresight,
.ctx_init = goya_ctx_init,
.ctx_fini = goya_ctx_fini,
.get_clk_rate = goya_get_clk_rate,
.get_queue_id_for_cq = goya_get_queue_id_for_cq,
.read_device_fw_version = goya_read_device_fw_version,
......
......@@ -18,8 +18,18 @@
#define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START 0x8000 /* 32KB */
#define GAUDI_DRIVER_SRAM_RESERVED_SIZE_FROM_START 0x80 /* 128 bytes */
#define GAUDI_FIRST_AVAILABLE_W_S_SYNC_OBJECT 32
#define GAUDI_FIRST_AVAILABLE_W_S_MONITOR 16
/*
* 128 SOBs reserved for collective wait
* 16 SOBs reserved for sync stream
*/
#define GAUDI_FIRST_AVAILABLE_W_S_SYNC_OBJECT 144
/*
* 64 monitors reserved for collective wait
* 8 monitors reserved for sync stream
*/
#define GAUDI_FIRST_AVAILABLE_W_S_MONITOR 72
/*
* Goya queue Numbering
*
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册