Commit 58310b3f authored by David S. Miller

Merge branch 'mlx4-next'

Or Gerlitz says:

====================
mlx4: CQE/EQE stride support

This series from Ido Shamay is intended for archs having a
cache line larger than 64 bytes.

Since our CQE/EQEs are generally 64B in those systems, HW will write
twice to the same cache line consecutively, causing pipe locks due to
the hazard prevention mechanism. For elements in a cyclic buffer, writes
are consecutive, so entries smaller than a cache line should be
avoided, especially if they are written at a high rate.

Reduce consecutive writes to same cache line in CQs/EQs, by allowing the
driver to increase the distance between entries so that each will reside
in a different cache line.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
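
To make the cache-line argument above concrete, here is a minimal stand-alone sketch (not part of the patch). It assumes a 128B cache line and simply maps ring indices to cache-line numbers: with 64B entries, two consecutive indices share a line, so back-to-back HW writes hit the same line; with a cache-line-sized stride each index gets its own line.

/*
 * Illustrative sketch only: compare how 64B and cache-line-sized entries
 * map onto cache lines on an assumed 128B cache-line machine.
 */
#include <stdio.h>

static long line_of(long offset, long cache_line)
{
	return offset / cache_line;
}

int main(void)
{
	const long cache_line = 128;		/* assumed cache line size */
	const long sizes[] = { 64, 128 };	/* legacy vs. stride entry size */
	int s, i;

	for (s = 0; s < 2; s++) {
		printf("entry size %ldB:", sizes[s]);
		for (i = 0; i < 4; i++)
			printf(" idx %d -> line %ld", i,
			       line_of(i * sizes[s], cache_line));
		printf("\n");
	}
	return 0;
}
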
......@@ -2459,6 +2459,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
}
priv->rx_ring_num = prof->rx_ring_num;
priv->cqe_factor = (mdev->dev->caps.cqe_size == 64) ? 1 : 0;
priv->cqe_size = mdev->dev->caps.cqe_size;
priv->mac_index = -1;
priv->msg_enable = MLX4_EN_MSG_LEVEL;
spin_lock_init(&priv->stats_lock);
......
......@@ -671,7 +671,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
* descriptor offset can be deduced from the CQE index instead of
* reading 'cqe->index' */
index = cq->mcq.cons_index & ring->size_mask;
cqe = &cq->buf[(index << factor) + factor];
cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
/* Process all completed CQEs */
while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
......@@ -858,7 +858,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
++cq->mcq.cons_index;
index = (cq->mcq.cons_index) & ring->size_mask;
cqe = &cq->buf[(index << factor) + factor];
cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
if (++polled == budget)
goto out;
}
......
......@@ -382,7 +382,7 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
return true;
index = cons_index & size_mask;
cqe = &buf[(index << factor) + factor];
cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
ring_index = ring->cons & size_mask;
stamp_index = ring_index;
......@@ -430,7 +430,7 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
++cons_index;
index = cons_index & size_mask;
cqe = &buf[(index << factor) + factor];
cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
}
......
......@@ -101,21 +101,24 @@ static void eq_set_ci(struct mlx4_eq *eq, int req_not)
mb();
}
static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor)
static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor,
u8 eqe_size)
{
/* (entry & (eq->nent - 1)) gives us a cyclic array */
unsigned long offset = (entry & (eq->nent - 1)) * (MLX4_EQ_ENTRY_SIZE << eqe_factor);
/* CX3 is capable of extending the EQE from 32 to 64 bytes.
* When this feature is enabled, the first (in the lower addresses)
unsigned long offset = (entry & (eq->nent - 1)) * eqe_size;
/* CX3 is capable of extending the EQE from 32 to 64 bytes with
* strides of 64B,128B and 256B.
* When 64B EQE is used, the first (in the lower addresses)
* 32 bytes in the 64 byte EQE are reserved and the next 32 bytes
* contain the legacy EQE information.
* In all other cases, the first 32B contains the legacy EQE info.
*/
return eq->page_list[offset / PAGE_SIZE].buf + (offset + (eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0)) % PAGE_SIZE;
}
static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor)
static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor, u8 size)
{
struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor);
struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor, size);
return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe;
}
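
A worked example of the reworked get_eqe() arithmetic may help here. The sketch below is not driver code; it assumes a 4096-byte PAGE_SIZE and the 32-byte MLX4_EQ_ENTRY_SIZE, and shows where an entry lands for the legacy 64B EQE (eqe_factor set, data in the upper 32 bytes) versus a 128B stride (eqe_factor zero, data at the start of the entry).

/*
 * Minimal sketch of the page-list walk done by the reworked get_eqe():
 * the offset now scales by eqe_size directly, and the legacy-info skip
 * only applies to the 64B (eqe_factor) case.
 */
#include <stdio.h>

#define PAGE_SIZE          4096	/* assumed */
#define MLX4_EQ_ENTRY_SIZE 32

static void locate(unsigned int entry, unsigned int nent,
		   unsigned int eqe_size, unsigned int eqe_factor)
{
	unsigned long offset = (entry & (nent - 1)) * eqe_size;
	unsigned long skip = eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0;

	printf("entry %u: page %lu, byte %lu within the page\n",
	       entry, offset / PAGE_SIZE, (offset + skip) % PAGE_SIZE);
}

int main(void)
{
	locate(33, 64, 64, 1);	/* legacy 64B EQE: data in the upper 32B */
	locate(33, 64, 128, 0);	/* 128B stride: data at the entry start  */
	return 0;
}
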
......@@ -459,8 +462,9 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
enum slave_port_gen_event gen_event;
unsigned long flags;
struct mlx4_vport_state *s_info;
int eqe_size = dev->caps.eqe_size;
while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor))) {
while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor, eqe_size))) {
/*
* Make sure we read EQ entry contents after we've
* checked the ownership bit.
......@@ -894,8 +898,10 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent,
eq->dev = dev;
eq->nent = roundup_pow_of_two(max(nent, 2));
/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes */
npages = PAGE_ALIGN(eq->nent * (MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor)) / PAGE_SIZE;
/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with
* strides of 64B,128B and 256B.
*/
npages = PAGE_ALIGN(eq->nent * dev->caps.eqe_size) / PAGE_SIZE;
eq->page_list = kmalloc(npages * sizeof *eq->page_list,
GFP_KERNEL);
......@@ -997,8 +1003,10 @@ static void mlx4_free_eq(struct mlx4_dev *dev,
struct mlx4_cmd_mailbox *mailbox;
int err;
int i;
/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes */
int npages = PAGE_ALIGN((MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor) * eq->nent) / PAGE_SIZE;
/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with
* strides of 64B,128B and 256B
*/
int npages = PAGE_ALIGN(dev->caps.eqe_size * eq->nent) / PAGE_SIZE;
mailbox = mlx4_alloc_cmd_mailbox(dev);
if (IS_ERR(mailbox))
......
......@@ -137,7 +137,9 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
[8] = "Dynamic QP updates support",
[9] = "Device managed flow steering IPoIB support",
[10] = "TCP/IP offloads/flow-steering for VXLAN support",
[11] = "MAD DEMUX (Secure-Host) support"
[11] = "MAD DEMUX (Secure-Host) support",
[12] = "Large cache line (>64B) CQE stride support",
[13] = "Large cache line (>64B) EQE stride support"
};
int i;
......@@ -557,6 +559,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
#define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET 0x74
#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET 0x76
#define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET 0x77
#define QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE 0x7a
#define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80
#define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82
#define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84
......@@ -733,6 +736,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
dev_cap->max_rq_sg = field;
MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET);
dev_cap->max_rq_desc_sz = size;
MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE);
if (field & (1 << 6))
dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
if (field & (1 << 7))
dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
MLX4_GET(dev_cap->bmme_flags, outbox,
QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
......@@ -1376,6 +1384,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30)
#define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37)
#define INIT_HCA_EQE_CQE_OFFSETS (INIT_HCA_QPC_OFFSET + 0x38)
#define INIT_HCA_EQE_CQE_STRIDE_OFFSET (INIT_HCA_QPC_OFFSET + 0x3b)
#define INIT_HCA_ALTC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40)
#define INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50)
#define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60)
......@@ -1452,11 +1461,25 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) {
*(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30);
dev->caps.cqe_size = 64;
dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE;
dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
} else {
dev->caps.cqe_size = 32;
}
/* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */
if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) &&
(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) {
dev->caps.eqe_size = cache_line_size();
dev->caps.cqe_size = cache_line_size();
dev->caps.eqe_factor = 0;
MLX4_PUT(inbox, (u8)((ilog2(dev->caps.eqe_size) - 5) << 4 |
(ilog2(dev->caps.eqe_size) - 5)),
INIT_HCA_EQE_CQE_STRIDE_OFFSET);
/* User still need to know to support CQE > 32B */
dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
}
/* QPC/EEC/CQC/EQC/RDMARC attributes */
MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET);
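
The stride byte written at INIT_HCA_EQE_CQE_STRIDE_OFFSET packs log2(size) - 5 into each nibble (EQE in the high nibble), which QUERY_HCA later unpacks with the 0x3/0x30 masks shown below. A small sketch of that round trip, using __builtin_ctz in place of the kernel's ilog2 (equivalent for these power-of-two sizes); this is an illustration, not driver code:

/*
 * Encode/decode of the EQE/CQE stride byte: 64B -> 1, 128B -> 2, 256B -> 3
 * per nibble, EQE in the high nibble.
 */
#include <stdio.h>

static unsigned int encode_stride(unsigned int eqe_size, unsigned int cqe_size)
{
	return (__builtin_ctz(eqe_size) - 5) << 4 | (__builtin_ctz(cqe_size) - 5);
}

int main(void)
{
	unsigned int byte = encode_stride(128, 128);	/* 128B cache line */

	printf("stride byte 0x%02x -> cqe %u, eqe %u\n", byte,
	       1u << ((byte & 0x3) + 5), 1u << (((byte & 0x30) >> 4) + 5));
	return 0;
}
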
......@@ -1616,6 +1639,17 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
if (byte_field & 0x40) /* 64-bytes cqe enabled */
param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED;
/* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */
MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_STRIDE_OFFSET);
if (byte_field) {
param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED;
param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED;
param->cqe_size = 1 << ((byte_field &
MLX4_CQE_SIZE_MASK_STRIDE) + 5);
param->eqe_size = 1 << (((byte_field &
MLX4_EQE_SIZE_MASK_STRIDE) >> 4) + 5);
}
/* TPT attributes */
MLX4_GET(param->dmpt_base, outbox, INIT_HCA_DMPT_BASE_OFFSET);
......
......@@ -178,6 +178,8 @@ struct mlx4_init_hca_param {
u8 uar_page_sz; /* log pg sz in 4k chunks */
u8 steering_mode; /* for QUERY_HCA */
u64 dev_cap_enabled;
u16 cqe_size; /* For use only when CQE stride feature enabled */
u16 eqe_size; /* For use only when EQE stride feature enabled */
};
struct mlx4_init_ib_param {
......
......@@ -104,7 +104,8 @@ module_param(enable_64b_cqe_eqe, bool, 0444);
MODULE_PARM_DESC(enable_64b_cqe_eqe,
"Enable 64 byte CQEs/EQEs when the FW supports this (default: True)");
#define PF_CONTEXT_BEHAVIOUR_MASK MLX4_FUNC_CAP_64B_EQE_CQE
#define PF_CONTEXT_BEHAVIOUR_MASK (MLX4_FUNC_CAP_64B_EQE_CQE | \
MLX4_FUNC_CAP_EQE_CQE_STRIDE)
static char mlx4_version[] =
DRV_NAME ": Mellanox ConnectX core driver v"
......@@ -196,6 +197,40 @@ static void mlx4_set_port_mask(struct mlx4_dev *dev)
dev->caps.port_mask[i] = dev->caps.port_type[i];
}
static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev)
{
struct mlx4_caps *dev_cap = &dev->caps;
/* FW not supporting or cancelled by user */
if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) ||
!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE))
return;
/* Must have 64B CQE_EQE enabled by FW to use bigger stride
* When FW has NCSI it may decide not to report 64B CQE/EQEs
*/
if (!(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_EQE) ||
!(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_CQE)) {
dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
return;
}
if (cache_line_size() == 128 || cache_line_size() == 256) {
mlx4_dbg(dev, "Enabling CQE stride cacheLine supported\n");
/* Changing the real data inside CQE size to 32B */
dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE;
dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE;
if (mlx4_is_master(dev))
dev_cap->function_caps |= MLX4_FUNC_CAP_EQE_CQE_STRIDE;
} else {
mlx4_dbg(dev, "Disabling CQE stride cacheLine unsupported\n");
dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
}
}
static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
{
int err;
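
As a quick summary of the mlx4_enable_cqe_eqe_stride() helper added above, the sketch below (not driver code) reproduces its decision: strides survive only when the FW reports both stride flags, 64B CQE/EQE support is present, and the host cache line is 128B or 256B.

/* Behavioural sketch of the stride-enable decision. */
#include <stdbool.h>
#include <stdio.h>

static bool keep_stride(bool fw_stride, bool fw_64b, unsigned int cache_line)
{
	if (!fw_stride || !fw_64b)
		return false;
	return cache_line == 128 || cache_line == 256;
}

int main(void)
{
	printf("64B cache line:  %d\n", keep_stride(true, true, 64));	/* 0 */
	printf("128B cache line: %d\n", keep_stride(true, true, 128));	/* 1, e.g. ppc64 */
	return 0;
}
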
......@@ -390,6 +425,14 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE;
dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE;
}
if (dev_cap->flags2 &
(MLX4_DEV_CAP_FLAG2_CQE_STRIDE |
MLX4_DEV_CAP_FLAG2_EQE_STRIDE)) {
mlx4_warn(dev, "Disabling EQE/CQE stride per user request\n");
dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
}
}
if ((dev->caps.flags &
......@@ -397,6 +440,9 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
mlx4_is_master(dev))
dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE;
if (!mlx4_is_slave(dev))
mlx4_enable_cqe_eqe_stride(dev);
return 0;
}
......@@ -724,11 +770,22 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) {
dev->caps.cqe_size = 64;
dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE;
dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
} else {
dev->caps.cqe_size = 32;
}
if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) {
dev->caps.eqe_size = hca_param.eqe_size;
dev->caps.eqe_factor = 0;
}
if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) {
dev->caps.cqe_size = hca_param.cqe_size;
/* User still need to know when CQE > 32B */
dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
}
dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
mlx4_warn(dev, "Timestamping is not supported in slave mode\n");
......
......@@ -285,6 +285,9 @@ struct mlx4_icm_table {
#define MLX4_MPT_STATUS_SW 0xF0
#define MLX4_MPT_STATUS_HW 0x00
#define MLX4_CQE_SIZE_MASK_STRIDE 0x3
#define MLX4_EQE_SIZE_MASK_STRIDE 0x30
/*
* Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
*/
......
......@@ -542,6 +542,7 @@ struct mlx4_en_priv {
unsigned max_mtu;
int base_qpn;
int cqe_factor;
int cqe_size;
struct mlx4_en_rss_map rss_map;
__be32 ctrl_flags;
......@@ -612,6 +613,11 @@ struct mlx4_mac_entry {
struct rcu_head rcu;
};
static inline struct mlx4_cqe *mlx4_en_get_cqe(void *buf, int idx, int cqe_sz)
{
return buf + idx * cqe_sz;
}
#ifdef CONFIG_NET_RX_BUSY_POLL
static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq)
{
......
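
A quick sanity check on the mlx4_en_get_cqe() helper introduced above (sketch only, assuming the legacy 32-byte mlx4_cqe unit): for the 32B and 64B cases the new addressing matches the old "(index << factor) + factor" formula byte for byte, while 128B/256B strides, which the old formula cannot express, fall out naturally.

/* Compare old factor-based CQE indexing with the new size-based helper. */
#include <assert.h>
#include <stdio.h>

#define CQE_UNIT 32	/* sizeof(struct mlx4_cqe) in the legacy layout (assumed) */

static long old_offset(int index, int factor)
{
	return ((long)(index << factor) + factor) * CQE_UNIT;
}

static long new_offset(int index, int cqe_size, int factor)
{
	return (long)index * cqe_size + (long)factor * CQE_UNIT;
}

int main(void)
{
	int index;

	for (index = 0; index < 8; index++) {
		assert(old_offset(index, 0) == new_offset(index, 32, 0));
		assert(old_offset(index, 1) == new_offset(index, 64, 1));
	}
	printf("128B stride, index 3 -> byte offset %ld\n", new_offset(3, 128, 0));
	return 0;
}
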
......@@ -185,19 +185,24 @@ enum {
MLX4_DEV_CAP_FLAG2_DMFS_IPOIB = 1LL << 9,
MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS = 1LL << 10,
MLX4_DEV_CAP_FLAG2_MAD_DEMUX = 1LL << 11,
MLX4_DEV_CAP_FLAG2_CQE_STRIDE = 1LL << 12,
MLX4_DEV_CAP_FLAG2_EQE_STRIDE = 1LL << 13
};
enum {
MLX4_DEV_CAP_64B_EQE_ENABLED = 1LL << 0,
MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1
MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1,
MLX4_DEV_CAP_CQE_STRIDE_ENABLED = 1LL << 2,
MLX4_DEV_CAP_EQE_STRIDE_ENABLED = 1LL << 3
};
enum {
MLX4_USER_DEV_CAP_64B_CQE = 1L << 0
MLX4_USER_DEV_CAP_LARGE_CQE = 1L << 0
};
enum {
MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0
MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0,
MLX4_FUNC_CAP_EQE_CQE_STRIDE = 1L << 1
};
......