提交 8f5914bc 编写于 作者: J Jens Axboe

Merge branch 'nvme-5.4' of git://git.infradead.org/nvme into for-5.4/block

Pull NVMe changes from Sagi:

"The nvme updates include:
 - ana log parse fix from Anton
 - nvme quirks support for Apple devices from Ben
 - fix missing bio completion tracing for multipath stack devices from
   Hannes and Mikhail
 - IP TOS settings for nvme rdma and tcp transports from Israel
 - rq_dma_dir cleanups from Israel
 - tracing for Get LBA Status command from Minwoo
 - Some nvme-tcp cleanups from Minwoo, Potnuri and myself
 - Some consolidation between the fabrics transports for handling the CAP
   register
 - reset race with ns scanning fix for fabrics (move fabrics commands to
   a dedicated request queue with a different lifetime from the admin
   request queue)."

* 'nvme-5.4' of git://git.infradead.org/nvme: (30 commits)
  nvme-rdma: Use rq_dma_dir macro
  nvme-fc: Use rq_dma_dir macro
  nvme-pci: Tidy up nvme_unmap_data
  nvme: make fabrics command run on a separate request queue
  nvme-pci: Support shared tags across queues for Apple 2018 controllers
  nvme-pci: Add support for Apple 2018+ models
  nvme-pci: Add support for variable IO SQ element size
  nvme-pci: Pass the queue to SQ_SIZE/CQ_SIZE macros
  nvme: trace bio completion
  nvme-multipath: fix ana log nsid lookup when nsid is not found
  nvmet-tcp: Add TOS for tcp transport
  nvme-tcp: Add TOS for tcp transport
  nvme-tcp: Use struct nvme_ctrl directly
  nvme-rdma: Add TOS for rdma transport
  nvme-fabrics: Add type of service (TOS) configuration
  nvmet-tcp: fix possible memory leak
  nvmet-tcp: fix possible NULL deref
  nvmet: trace: parse Get LBA Status command in detail
  nvme: trace: parse Get LBA Status command in detail
  nvme: trace: support for Get LBA Status opcode parsed
  ...
......@@ -64,6 +64,7 @@ config NVME_TCP
depends on INET
depends on BLK_DEV_NVME
select NVME_FABRICS
select CRYPTO_CRC32C
help
This provides support for the NVMe over Fabrics protocol using
the TCP transport. This allows you to use remote block devices
......
......@@ -22,12 +22,12 @@
#include <linux/pm_qos.h>
#include <asm/unaligned.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
#include "nvme.h"
#include "fabrics.h"
#define CREATE_TRACE_POINTS
#include "trace.h"
#define NVME_MINORS (1U << MINORBITS)
unsigned int admin_timeout = 60;
......@@ -279,6 +279,8 @@ void nvme_complete_rq(struct request *req)
return;
}
}
nvme_trace_bio_complete(req, status);
blk_mq_end_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
......@@ -1950,7 +1952,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
* bits', but doing so may cause the device to complete commands to the
* admin queue ... and we don't know what memory that might be pointing at!
*/
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
{
int ret;
......@@ -1964,20 +1966,27 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
msleep(NVME_QUIRK_DELAY_AMOUNT);
return nvme_wait_ready(ctrl, cap, false);
return nvme_wait_ready(ctrl, ctrl->cap, false);
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
/*
* Default to a 4K page size, with the intention to update this
* path in the future to accommodate architectures with differing
* kernel and IO page sizes.
*/
unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
unsigned dev_page_min, page_shift = 12;
int ret;
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
if (ret) {
dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
return ret;
}
dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
if (page_shift < dev_page_min) {
dev_err(ctrl->device,
"Minimum device page size %u too large for host (%u)\n",
......@@ -1996,7 +2005,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
return nvme_wait_ready(ctrl, cap, true);
return nvme_wait_ready(ctrl, ctrl->cap, true);
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
......@@ -2562,7 +2571,6 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
struct nvme_id_ctrl *id;
u64 cap;
int ret, page_shift;
u32 max_hw_sectors;
bool prev_apst_enabled;
......@@ -2572,16 +2580,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
return ret;
}
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
if (ret) {
dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
return ret;
}
page_shift = NVME_CAP_MPSMIN(cap) + 12;
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
if (ctrl->vs >= NVME_VS(1, 1, 0))
ctrl->subsystem = NVME_CAP_NSSRC(cap);
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
ret = nvme_identify_ctrl(ctrl, &id);
if (ret) {
......
......@@ -150,7 +150,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
cmd.prop_get.fctype = nvme_fabrics_type_property_get;
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
......@@ -197,7 +197,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
cmd.prop_get.attrib = 1;
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
......@@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
cmd.prop_set.offset = cpu_to_le32(off);
cmd.prop_set.value = cpu_to_le64(val);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
NVME_QID_ANY, 0, 0, false);
if (unlikely(ret))
dev_err(ctrl->device,
......@@ -396,7 +396,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res,
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
data, sizeof(*data), 0, NVME_QID_ANY, 1,
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false);
if (ret) {
......@@ -611,6 +611,7 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_DATA_DIGEST, "data_digest" },
{ NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
{ NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
{ NVMF_OPT_TOS, "tos=%d" },
{ NVMF_OPT_ERR, NULL }
};
......@@ -632,6 +633,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->duplicate_connect = false;
opts->hdr_digest = false;
opts->data_digest = false;
opts->tos = -1; /* < 0 == use transport default */
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
......@@ -856,6 +858,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->nr_poll_queues = token;
break;
case NVMF_OPT_TOS:
if (match_int(args, &token)) {
ret = -EINVAL;
goto out;
}
if (token < 0) {
pr_err("Invalid type of service %d\n", token);
ret = -EINVAL;
goto out;
}
if (token > 255) {
pr_warn("Clamping type of service to 255\n");
token = 255;
}
opts->tos = token;
break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
......
......@@ -55,6 +55,7 @@ enum {
NVMF_OPT_DATA_DIGEST = 1 << 16,
NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
NVMF_OPT_TOS = 1 << 19,
};
/**
......@@ -87,6 +88,7 @@ enum {
* @data_digest: generate/verify data digest (TCP)
* @nr_write_queues: number of queues for write I/O
* @nr_poll_queues: number of queues for polling I/O
* @tos: type of service
*/
struct nvmf_ctrl_options {
unsigned mask;
......@@ -108,6 +110,7 @@ struct nvmf_ctrl_options {
bool data_digest;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
int tos;
};
/*
......
......@@ -2006,6 +2006,7 @@ nvme_fc_ctrl_free(struct kref *ref)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(&ctrl->admin_tag_set);
kfree(ctrl->queues);
......@@ -2107,7 +2108,6 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
struct nvme_fc_fcp_op *op)
{
struct nvmefc_fcp_req *freq = &op->fcp_req;
enum dma_data_direction dir;
int ret;
freq->sg_cnt = 0;
......@@ -2124,9 +2124,8 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl);
WARN_ON(op->nents > blk_rq_nr_phys_segments(rq));
dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
op->nents, dir);
op->nents, rq_dma_dir(rq));
if (unlikely(freq->sg_cnt <= 0)) {
sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
freq->sg_cnt = 0;
......@@ -2149,8 +2148,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
return;
fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents,
((rq_data_dir(rq) == WRITE) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE));
rq_dma_dir(rq));
nvme_cleanup_cmd(rq);
......@@ -2633,8 +2631,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (ret)
goto out_delete_hw_queue;
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
ret = nvmf_connect_admin_queue(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
......@@ -2648,23 +2644,15 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
* prior connection values
*/
ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
if (ret) {
dev_err(ctrl->ctrl.device,
"prop_get NVME_REG_CAP failed\n");
goto out_disconnect_admin_queue;
}
ctrl->ctrl.sqsize =
min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
ret = nvme_enable_ctrl(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
ret = nvme_init_identify(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
......@@ -3111,10 +3099,16 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto out_free_queues;
ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.fabrics_q)) {
ret = PTR_ERR(ctrl->ctrl.fabrics_q);
goto out_free_admin_tag_set;
}
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
ret = PTR_ERR(ctrl->ctrl.admin_q);
goto out_free_admin_tag_set;
goto out_cleanup_fabrics_q;
}
/*
......@@ -3186,6 +3180,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
out_cleanup_admin_q:
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_cleanup_fabrics_q:
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_admin_tag_set:
blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_free_queues:
......
......@@ -444,14 +444,16 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
unsigned nsid = le32_to_cpu(desc->nsids[n]);
if (ns->head->ns_id < nsid)
continue;
nvme_update_ns_ana_state(desc, ns);
if (ns->head->ns_id == nsid)
nvme_update_ns_ana_state(desc, ns);
if (++n == nr_nsids)
break;
}
up_write(&ctrl->namespaces_rwsem);
WARN_ON_ONCE(n < nr_nsids);
return 0;
}
......
......@@ -16,6 +16,8 @@
#include <linux/fault-inject.h>
#include <linux/rcupdate.h>
#include <trace/events/block.h>
extern unsigned int nvme_io_timeout;
#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
......@@ -92,6 +94,21 @@ enum nvme_quirks {
* Broken Write Zeroes.
*/
NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9),
/*
* Use only one interrupt vector for all queues
*/
NVME_QUIRK_SINGLE_VECTOR = (1 << 10),
/*
* Use non-standard 128 bytes SQEs.
*/
NVME_QUIRK_128_BYTES_SQES = (1 << 11),
/*
* Prevent tag overlap between queues
*/
NVME_QUIRK_SHARED_TAGS = (1 << 12),
};
/*
......@@ -164,6 +181,7 @@ struct nvme_ctrl {
const struct nvme_ctrl_ops *ops;
struct request_queue *admin_q;
struct request_queue *connect_q;
struct request_queue *fabrics_q;
struct device *dev;
int instance;
int numa_node;
......@@ -426,8 +444,8 @@ void nvme_complete_rq(struct request *req);
bool nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks);
......@@ -511,6 +529,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
kblockd_schedule_work(&head->requeue_work);
}
/*
 * Emit the block_bio_complete tracepoint for a completed request that was
 * issued through the multipath path.
 *
 * @req:    the request being completed; its queue's queuedata is read as the
 *          struct nvme_ns the request belongs to.
 * @status: block-layer completion status forwarded to the tracepoint.
 *
 * Nothing is traced unless REQ_NVME_MPATH is set in req->cmd_flags. When it
 * is set, the event is reported against the queue of the namespace head's
 * disk (ns->head->disk->queue) rather than req->q, and carries req->bio.
 */
static inline void nvme_trace_bio_complete(struct request *req,
blk_status_t status)
{
struct nvme_ns *ns = req->q->queuedata;
/* Only requests flagged as multipath are reported on the head disk queue. */
if (req->cmd_flags & REQ_NVME_MPATH)
trace_block_bio_complete(ns->head->disk->queue,
req->bio, status);
}
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
......@@ -554,6 +582,10 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
/*
 * Empty stub: takes the namespace and does nothing.
 * NOTE(review): presumably the variant used when NVMe multipath support is
 * compiled out -- confirm against the enclosing #ifdef, which is not visible
 * in this hunk.
 */
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
/*
 * Empty stub with the same signature as the tracing helper: accepts the
 * request and status and does nothing, so callers can invoke it
 * unconditionally.
 * NOTE(review): presumably selected when NVMe multipath support is compiled
 * out -- confirm against the enclosing #ifdef, which is not visible in this
 * hunk.
 */
static inline void nvme_trace_bio_complete(struct request *req,
blk_status_t status)
{
}
static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
struct nvme_id_ctrl *id)
{
......
......@@ -28,8 +28,8 @@
#include "trace.h"
#include "nvme.h"
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
......@@ -100,6 +100,7 @@ struct nvme_dev {
unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
int q_depth;
int io_sqes;
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;
......@@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
struct nvme_queue {
struct nvme_dev *dev;
spinlock_t sq_lock;
struct nvme_command *sq_cmds;
void *sq_cmds;
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
......@@ -178,6 +179,7 @@ struct nvme_queue {
u16 last_cq_head;
u16 qid;
u8 cq_phase;
u8 sqes;
unsigned long flags;
#define NVMEQ_ENABLED 0
#define NVMEQ_SQ_CMB 1
......@@ -488,7 +490,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
cmd, sizeof(*cmd));
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
nvme_write_sq_db(nvmeq, write_sq);
......@@ -534,14 +537,13 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
enum dma_data_direction dma_dir = rq_data_dir(req) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE;
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
int i;
if (iod->dma_len) {
dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir);
dma_unmap_page(dev->dev, dma_addr, iod->dma_len,
rq_dma_dir(req));
return;
}
......@@ -1344,16 +1346,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth),
dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
if (!nvmeq->sq_cmds)
return;
if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
nvmeq->sq_cmds, SQ_SIZE(nvmeq));
} else {
dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth),
dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
}
}
......@@ -1403,7 +1405,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
else
nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
nvme_disable_ctrl(&dev->ctrl);
nvme_poll_irqdisable(nvmeq, -1);
}
......@@ -1433,12 +1435,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
}
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
int qid, int depth)
int qid)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
if (nvmeq->sq_cmds) {
nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
nvmeq->sq_cmds);
......@@ -1447,11 +1449,11 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
return 0;
}
pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth));
pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
}
}
nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
&nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;
......@@ -1465,12 +1467,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
if (dev->ctrl.queue_count > qid)
return 0;
nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth),
nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
nvmeq->q_depth = depth;
nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
&nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
goto free_cqdma;
nvmeq->dev = dev;
......@@ -1479,15 +1483,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
nvmeq->qid = qid;
dev->ctrl.queue_count++;
return 0;
free_cqdma:
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
free_nvmeq:
return -ENOMEM;
}
......@@ -1515,7 +1518,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
wmb(); /* ensure the first interrupt sees the initialization */
......@@ -1679,7 +1682,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
result = nvme_disable_ctrl(&dev->ctrl);
if (result < 0)
return result;
......@@ -1695,7 +1698,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
result = nvme_enable_ctrl(&dev->ctrl);
if (result)
return result;
......@@ -2077,6 +2080,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
dev->io_queues[HCTX_TYPE_READ] = 0;
/*
* Some Apple controllers require all queues to use the
* first vector.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
irq_queues = 1;
return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
......@@ -2095,6 +2105,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
unsigned long size;
nr_io_queues = max_io_queues();
/*
* If tags are shared with admin queue (Apple bug), then
* make sure we only use one IO queue.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
nr_io_queues = 1;
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
......@@ -2265,6 +2283,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
/*
* Some Apple controllers require tags to be unique
* across admin and IO queue, so reserve the first 32
* tags of the IO queue.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
dev->tagset.reserved_tags = NVME_AQ_DEPTH;
ret = blk_mq_alloc_tag_set(&dev->tagset);
if (ret) {
dev_warn(dev->ctrl.device,
......@@ -2314,9 +2340,20 @@ static int nvme_pci_enable(struct nvme_dev *dev)
dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
dev->dbs = dev->bar + 4096;
/*
* Some Apple controllers require a non-standard SQE size.
* Interestingly they also seem to ignore the CC:IOSQES register
* so we don't bother updating it here.
*/
if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
dev->io_sqes = 7;
else
dev->io_sqes = NVME_NVM_IOSQES;
/*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
......@@ -2334,6 +2371,18 @@ static int nvme_pci_enable(struct nvme_dev *dev)
"set queue depth=%u\n", dev->q_depth);
}
/*
* Controllers with the shared tags quirk need the IO queue to be
* big enough so that we get 32 tags for the admin queue
*/
if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
(dev->q_depth < (NVME_AQ_DEPTH + 2))) {
dev->q_depth = NVME_AQ_DEPTH + 2;
dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
dev->q_depth);
}
nvme_map_cmb(dev);
pci_enable_pcie_error_reporting(pdev);
......@@ -3034,6 +3083,10 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
NVME_QUIRK_128_BYTES_SQES |
NVME_QUIRK_SHARED_TAGS },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);
......
......@@ -751,6 +751,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
......@@ -792,10 +793,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
goto out_free_async_qe;
}
ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.fabrics_q)) {
error = PTR_ERR(ctrl->ctrl.fabrics_q);
goto out_free_tagset;
}
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
goto out_free_tagset;
goto out_cleanup_fabrics_q;
}
}
......@@ -803,24 +810,15 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (error)
goto out_cleanup_queue;
error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP,
&ctrl->ctrl.cap);
if (error) {
dev_err(ctrl->ctrl.device,
"prop_get NVME_REG_CAP failed\n");
goto out_stop_queue;
}
ctrl->ctrl.sqsize =
min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
goto out_stop_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
error = nvme_init_identify(&ctrl->ctrl);
if (error)
goto out_stop_queue;
......@@ -832,6 +830,9 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_cleanup_fabrics_q:
if (new)
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
if (new)
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
......@@ -906,7 +907,8 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
nvme_cancel_request, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
}
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
if (remove)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}
......@@ -1057,6 +1059,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
......@@ -1143,9 +1146,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
req->mr = NULL;
}
ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
req->nents, rq_data_dir(rq) ==
WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
nvme_cleanup_cmd(rq);
sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
......@@ -1271,7 +1272,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
rq_dma_dir(rq));
if (unlikely(count <= 0)) {
ret = -EIO;
goto out_free_table;
......@@ -1300,9 +1301,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
return 0;
out_unmap_sg:
ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
req->nents, rq_data_dir(rq) ==
WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
out_free_table:
sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
return ret;
......@@ -1545,16 +1544,18 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
{
struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
int ret;
ret = nvme_rdma_create_queue_ib(queue);
if (ret)
return ret;
if (ctrl->opts->tos >= 0)
rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"rdma_resolve_route failed (%d).\n",
dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
queue->cm_error);
goto out_destroy_queue;
}
......@@ -1867,10 +1868,11 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
cancel_delayed_work_sync(&ctrl->reconnect_work);
nvme_rdma_teardown_io_queues(ctrl, shutdown);
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
nvme_disable_ctrl(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}
......@@ -2049,7 +2051,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES,
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
NVMF_OPT_TOS,
.create_ctrl = nvme_rdma_create_ctrl,
};
......
......@@ -13,6 +13,7 @@
#include <net/tcp.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>
#include <net/busy_poll.h>
#include "nvme.h"
#include "fabrics.h"
......@@ -72,6 +73,7 @@ struct nvme_tcp_queue {
int pdu_offset;
size_t data_remaining;
size_t ddgst_remaining;
unsigned int nr_cqe;
/* send state */
struct nvme_tcp_request *request;
......@@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
}
nvme_end_request(rq, cqe->status, cqe->result);
queue->nr_cqe++;
return 0;
}
......@@ -608,23 +611,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
switch (hdr->type) {
case nvme_tcp_c2h_data:
ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
break;
return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
case nvme_tcp_rsp:
nvme_tcp_init_recv_ctx(queue);
ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
break;
return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
case nvme_tcp_r2t:
nvme_tcp_init_recv_ctx(queue);
ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
break;
return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
default:
dev_err(queue->ctrl->ctrl.device,
"unsupported pdu type (%d)\n", hdr->type);
return -EINVAL;
}
return ret;
}
static inline void nvme_tcp_end_request(struct request *rq, u16 status)
......@@ -701,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
} else {
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS)
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
queue->nr_cqe++;
}
nvme_tcp_init_recv_ctx(queue);
}
}
......@@ -742,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
pdu->command_id);
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
queue->nr_cqe++;
}
nvme_tcp_init_recv_ctx(queue);
......@@ -1023,14 +1024,16 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
{
struct sock *sk = queue->sock->sk;
struct socket *sock = queue->sock;
struct sock *sk = sock->sk;
read_descriptor_t rd_desc;
int consumed;
rd_desc.arg.data = queue;
rd_desc.count = 1;
lock_sock(sk);
consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
queue->nr_cqe = 0;
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
release_sock(sk);
return consumed;
}
......@@ -1255,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->queue_size = queue_size;
if (qid > 0)
queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
queue->cmnd_capsule_len = nctrl->ioccsz * 16;
else
queue->cmnd_capsule_len = sizeof(struct nvme_command) +
NVME_TCP_ADMIN_CCSZ;
......@@ -1263,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
IPPROTO_TCP, &queue->sock);
if (ret) {
dev_err(ctrl->ctrl.device,
dev_err(nctrl->device,
"failed to create socket: %d\n", ret);
return ret;
}
......@@ -1273,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
(char *)&opt, sizeof(opt));
if (ret) {
dev_err(ctrl->ctrl.device,
dev_err(nctrl->device,
"failed to set TCP_SYNCNT sock opt %d\n", ret);
goto err_sock;
}
......@@ -1283,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
TCP_NODELAY, (char *)&opt, sizeof(opt));
if (ret) {
dev_err(ctrl->ctrl.device,
dev_err(nctrl->device,
"failed to set TCP_NODELAY sock opt %d\n", ret);
goto err_sock;
}
......@@ -1296,11 +1299,23 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
(char *)&sol, sizeof(sol));
if (ret) {
dev_err(ctrl->ctrl.device,
dev_err(nctrl->device,
"failed to set SO_LINGER sock opt %d\n", ret);
goto err_sock;
}
/* Set socket type of service */
if (nctrl->opts->tos >= 0) {
opt = nctrl->opts->tos;
ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
(char *)&opt, sizeof(opt));
if (ret) {
dev_err(nctrl->device,
"failed to set IP_TOS sock opt %d\n", ret);
goto err_sock;
}
}
queue->sock->sk->sk_allocation = GFP_ATOMIC;
if (!qid)
n = 0;
......@@ -1314,11 +1329,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->pdu_offset = 0;
sk_set_memalloc(queue->sock->sk);
if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
sizeof(ctrl->src_addr));
if (ret) {
dev_err(ctrl->ctrl.device,
dev_err(nctrl->device,
"failed to bind queue %d socket %d\n",
qid, ret);
goto err_sock;
......@@ -1330,7 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
if (queue->hdr_digest || queue->data_digest) {
ret = nvme_tcp_alloc_crypto(queue);
if (ret) {
dev_err(ctrl->ctrl.device,
dev_err(nctrl->device,
"failed to allocate queue %d crypto\n", qid);
goto err_sock;
}
......@@ -1344,13 +1359,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
goto err_crypto;
}
dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
dev_dbg(nctrl->device, "connecting queue %d\n",
nvme_tcp_queue_id(queue));
ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
sizeof(ctrl->addr), 0);
if (ret) {
dev_err(ctrl->ctrl.device,
dev_err(nctrl->device,
"failed to connect socket: %d\n", ret);
goto err_rcv_pdu;
}
......@@ -1371,6 +1386,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
queue->sock->sk->sk_state_change = nvme_tcp_state_change;
queue->sock->sk->sk_write_space = nvme_tcp_write_space;
queue->sock->sk->sk_ll_usec = 1;
write_unlock_bh(&queue->sock->sk->sk_callback_lock);
return 0;
......@@ -1469,7 +1485,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
set->nr_maps = 2 /* default + read */;
set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
}
ret = blk_mq_alloc_tag_set(set);
......@@ -1568,6 +1584,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
return nr_io_queues;
}
......@@ -1599,6 +1616,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
min(opts->nr_io_queues, nr_io_queues);
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
}
if (opts->nr_poll_queues && nr_io_queues) {
/* map dedicated poll queues only if we have queues left */
ctrl->io_queues[HCTX_TYPE_POLL] =
min(opts->nr_poll_queues, nr_io_queues);
}
}
static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
......@@ -1680,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
nvme_tcp_stop_queue(ctrl, 0);
if (remove) {
blk_cleanup_queue(ctrl->admin_q);
blk_cleanup_queue(ctrl->fabrics_q);
blk_mq_free_tag_set(ctrl->admin_tagset);
}
nvme_tcp_free_admin_queue(ctrl);
......@@ -1700,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
goto out_free_queue;
}
ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
if (IS_ERR(ctrl->fabrics_q)) {
error = PTR_ERR(ctrl->fabrics_q);
goto out_free_tagset;
}
ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
if (IS_ERR(ctrl->admin_q)) {
error = PTR_ERR(ctrl->admin_q);
goto out_free_tagset;
goto out_cleanup_fabrics_q;
}
}
......@@ -1711,19 +1741,12 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
if (error)
goto out_cleanup_queue;
error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
if (error) {
dev_err(ctrl->device,
"prop_get NVME_REG_CAP failed\n");
goto out_stop_queue;
}
ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
error = nvme_enable_ctrl(ctrl, ctrl->cap);
error = nvme_enable_ctrl(ctrl);
if (error)
goto out_stop_queue;
blk_mq_unquiesce_queue(ctrl->admin_q);
error = nvme_init_identify(ctrl);
if (error)
goto out_stop_queue;
......@@ -1735,6 +1758,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->admin_q);
out_cleanup_fabrics_q:
if (new)
blk_cleanup_queue(ctrl->fabrics_q);
out_free_tagset:
if (new)
blk_mq_free_tag_set(ctrl->admin_tagset);
......@@ -1753,7 +1779,8 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
nvme_cancel_request, ctrl);
blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
}
blk_mq_unquiesce_queue(ctrl->admin_q);
if (remove)
blk_mq_unquiesce_queue(ctrl->admin_q);
nvme_tcp_destroy_admin_queue(ctrl, remove);
}
......@@ -1880,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
/* unquiesce to fail fast pending requests */
nvme_start_queues(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, false);
blk_mq_unquiesce_queue(ctrl->admin_q);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
......@@ -1896,10 +1924,11 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
nvme_tcp_teardown_io_queues(ctrl, shutdown);
blk_mq_quiesce_queue(ctrl->admin_q);
if (shutdown)
nvme_shutdown_ctrl(ctrl);
else
nvme_disable_ctrl(ctrl, ctrl->cap);
nvme_disable_ctrl(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}
......@@ -2155,14 +2184,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
/* map dedicated poll queues only if we have queues left */
set->map[HCTX_TYPE_POLL].nr_queues =
ctrl->io_queues[HCTX_TYPE_POLL];
set->map[HCTX_TYPE_POLL].queue_offset =
ctrl->io_queues[HCTX_TYPE_DEFAULT] +
ctrl->io_queues[HCTX_TYPE_READ];
blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
}
dev_info(ctrl->ctrl.device,
"mapped %d/%d default/read queues.\n",
"mapped %d/%d/%d default/read/poll queues.\n",
ctrl->io_queues[HCTX_TYPE_DEFAULT],
ctrl->io_queues[HCTX_TYPE_READ]);
ctrl->io_queues[HCTX_TYPE_READ],
ctrl->io_queues[HCTX_TYPE_POLL]);
return 0;
}
/*
 * blk-mq ->poll callback for an NVMe/TCP queue.
 *
 * @hctx: hardware context whose driver_data is the struct nvme_tcp_queue
 *
 * If busy looping is allowed on the queue's socket and its receive queue
 * is currently empty, busy-loop on the socket first.  Then run one
 * receive pass over the socket.
 *
 * Returns queue->nr_cqe.  nvme_tcp_try_recv() zeroes that counter before
 * reading the socket and the receive path bumps it each time it ends a
 * request, so the value is the number of requests ended by this pass.
 * The return value of nvme_tcp_try_recv() itself is not examined here.
 */
static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_tcp_queue *tcp_queue = hctx->driver_data;
	struct sock *tcp_sk = tcp_queue->sock->sk;

	if (sk_can_busy_loop(tcp_sk)) {
		/* Only spin when there is nothing to consume yet. */
		if (skb_queue_empty(&tcp_sk->sk_receive_queue))
			sk_busy_loop(tcp_sk, true);
	}

	nvme_tcp_try_recv(tcp_queue);

	return tcp_queue->nr_cqe;
}
static struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.complete = nvme_complete_rq,
......@@ -2171,6 +2222,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
.init_hctx = nvme_tcp_init_hctx,
.timeout = nvme_tcp_timeout,
.map_queues = nvme_tcp_map_queues,
.poll = nvme_tcp_poll,
};
static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
......@@ -2224,7 +2276,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
INIT_LIST_HEAD(&ctrl->list);
ctrl->ctrl.opts = opts;
ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
opts->nr_poll_queues + 1;
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
......@@ -2318,7 +2371,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
NVMF_OPT_NR_WRITE_QUEUES,
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
NVMF_OPT_TOS,
.create_ctrl = nvme_tcp_create_ctrl,
};
......
......@@ -86,6 +86,22 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p,
return ret;
}
/*
 * Format the command-specific dwords of a Get LBA Status admin command
 * for the tracepoint output.
 *
 * @p:     trace sequence the text is appended to
 * @cdw10: command bytes beginning at CDW10; bytes 0..15 are read
 *
 * The fields are decoded little-endian from fixed byte offsets:
 *   slba  - 8 bytes at offset 0
 *   mndw  - 4 bytes at offset 8
 *   rl    - 2 bytes at offset 12
 *   atype - 1 byte  at offset 15
 *
 * Returns the buffer position of @p captured before anything was
 * appended, i.e. the start of the NUL-terminated text written here.
 */
static const char *nvme_trace_get_lba_status(struct trace_seq *p,
					     u8 *cdw10)
{
	/* Must be sampled first: it marks where this record's text begins. */
	const char *text_start = trace_seq_buffer_ptr(p);
	u64 start_lba;
	u32 max_dwords;
	u16 range_len;
	u8 action_type;

	start_lba = get_unaligned_le64(&cdw10[0]);
	max_dwords = get_unaligned_le32(&cdw10[8]);
	range_len = get_unaligned_le16(&cdw10[12]);
	action_type = cdw10[15];

	trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
			 start_lba, max_dwords, range_len, action_type);
	/* Terminate the string so the returned pointer is usable as-is. */
	trace_seq_putc(p, 0);

	return text_start;
}
static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
......@@ -141,6 +157,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
return nvme_trace_admin_identify(p, cdw10);
case nvme_admin_get_features:
return nvme_trace_admin_get_features(p, cdw10);
case nvme_admin_get_lba_status:
return nvme_trace_get_lba_status(p, cdw10);
default:
return nvme_trace_common(p, cdw10);
}
......
......@@ -81,9 +81,11 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
goto out;
host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]);
data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]);
data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part,
sectors[READ]), 1000);
host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]);
data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part,
sectors[WRITE]), 1000);
put_unaligned_le64(host_reads, &slog->host_reads[0]);
put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
......@@ -111,11 +113,11 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
if (!ns->bdev)
continue;
host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]);
data_units_read +=
part_stat_read(ns->bdev->bd_part, sectors[READ]);
data_units_read += DIV_ROUND_UP(
part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000);
host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]);
data_units_written +=
part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
data_units_written += DIV_ROUND_UP(
part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000);
}
rcu_read_unlock();
......
......@@ -253,6 +253,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
blk_cleanup_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(&ctrl->admin_tag_set);
}
......@@ -357,10 +358,16 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
goto out_free_sq;
ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.fabrics_q)) {
error = PTR_ERR(ctrl->ctrl.fabrics_q);
goto out_free_tagset;
}
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
goto out_free_tagset;
goto out_cleanup_fabrics_q;
}
error = nvmf_connect_admin_queue(&ctrl->ctrl);
......@@ -369,23 +376,15 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
if (error) {
dev_err(ctrl->ctrl.device,
"prop_get NVME_REG_CAP failed\n");
goto out_cleanup_queue;
}
ctrl->ctrl.sqsize =
min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
goto out_cleanup_queue;
ctrl->ctrl.max_hw_sectors =
(NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
error = nvme_init_identify(&ctrl->ctrl);
if (error)
goto out_cleanup_queue;
......@@ -394,6 +393,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
out_cleanup_queue:
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_cleanup_fabrics_q:
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_free_sq:
......@@ -411,14 +412,13 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
nvme_loop_destroy_io_queues(ctrl);
}
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
nvme_shutdown_ctrl(&ctrl->ctrl);
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_loop_destroy_admin_queue(ctrl);
}
......
......@@ -348,7 +348,8 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
return 0;
err:
sgl_free(cmd->req.sg);
if (cmd->req.sg_cnt)
sgl_free(cmd->req.sg);
return NVME_SC_INTERNAL;
}
......@@ -553,7 +554,8 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
if (queue->nvme_sq.sqhd_disabled) {
kfree(cmd->iov);
sgl_free(cmd->req.sg);
if (cmd->req.sg_cnt)
sgl_free(cmd->req.sg);
}
return 1;
......@@ -584,7 +586,8 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
return -EAGAIN;
kfree(cmd->iov);
sgl_free(cmd->req.sg);
if (cmd->req.sg_cnt)
sgl_free(cmd->req.sg);
cmd->queue->snd_cmd = NULL;
nvmet_tcp_put_cmd(cmd);
return 1;
......@@ -1306,7 +1309,9 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
{
nvmet_req_uninit(&cmd->req);
nvmet_tcp_unmap_pdu_iovec(cmd);
sgl_free(cmd->req.sg);
kfree(cmd->iov);
if (cmd->req.sg_cnt)
sgl_free(cmd->req.sg);
}
static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
......@@ -1410,6 +1415,7 @@ static void nvmet_tcp_state_change(struct sock *sk)
static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
{
struct socket *sock = queue->sock;
struct inet_sock *inet = inet_sk(sock->sk);
struct linger sol = { .l_onoff = 1, .l_linger = 0 };
int ret;
......@@ -1433,6 +1439,16 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
if (ret)
return ret;
/* Set socket type of service */
if (inet->rcv_tos > 0) {
int tos = inet->rcv_tos;
ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
(char *)&tos, sizeof(tos));
if (ret)
return ret;
}
write_lock_bh(&sock->sk->sk_callback_lock);
sock->sk->sk_user_data = queue;
queue->data_ready = sock->sk->sk_data_ready;
......
......@@ -33,6 +33,22 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p,
return ret;
}
/*
 * Target-side trace helper: print the command-specific dwords of a
 * Get LBA Status admin command.
 *
 * @p:     trace sequence the text is appended to
 * @cdw10: command bytes beginning at CDW10; bytes 0..15 are read
 *
 * Layout decoded here (all little-endian): slba is the 64-bit value at
 * offset 0, mndw the 32-bit value at offset 8, rl the 16-bit value at
 * offset 12 and atype the single byte at offset 15.
 *
 * Returns the position in @p's buffer taken before formatting, which is
 * the start of the NUL-terminated string produced by this call.
 */
static const char *nvmet_trace_get_lba_status(struct trace_seq *p,
					      u8 *cdw10)
{
	/* Capture the output start before the sequence grows. */
	const char *out = trace_seq_buffer_ptr(p);
	const u64 lba = get_unaligned_le64(cdw10);
	const u32 ndw = get_unaligned_le32(cdw10 + 8);
	const u16 len = get_unaligned_le16(cdw10 + 12);
	const u8 type = cdw10[15];

	trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
			 lba, ndw, len, type);

	/* Explicit terminator; the caller consumes the result as a C string. */
	trace_seq_putc(p, 0);

	return out;
}
static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
......@@ -80,6 +96,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p,
return nvmet_trace_admin_identify(p, cdw10);
case nvme_admin_get_features:
return nvmet_trace_admin_get_features(p, cdw10);
case nvme_admin_get_lba_status:
return nvmet_trace_get_lba_status(p, cdw10);
default:
return nvmet_trace_common(p, cdw10);
}
......
......@@ -140,6 +140,7 @@ enum {
* Submission and Completion Queue Entry Sizes for the NVM command set.
* (In bytes and specified as a power of two (2^n)).
*/
#define NVME_ADM_SQES 6
#define NVME_NVM_IOSQES 6
#define NVME_NVM_IOCQES 4
......@@ -814,6 +815,7 @@ enum nvme_admin_opcode {
nvme_admin_security_send = 0x81,
nvme_admin_security_recv = 0x82,
nvme_admin_sanitize_nvm = 0x84,
nvme_admin_get_lba_status = 0x86,
};
#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
......@@ -840,7 +842,8 @@ enum nvme_admin_opcode {
nvme_admin_opcode_name(nvme_admin_format_nvm), \
nvme_admin_opcode_name(nvme_admin_security_send), \
nvme_admin_opcode_name(nvme_admin_security_recv), \
nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
nvme_admin_opcode_name(nvme_admin_sanitize_nvm), \
nvme_admin_opcode_name(nvme_admin_get_lba_status))
enum {
NVME_QUEUE_PHYS_CONTIG = (1 << 0),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册