提交 8494bcf5 编写于 作者: L Linus Torvalds

Merge branch 'for-3.20/drivers' of git://git.kernel.dk/linux-block

Pull block driver changes from Jens Axboe:
 "This contains:

   - The 4k/partition fixes for brd from Boaz/Matthew.

   - A few xen front/back block fixes from David Vrabel and Roger Pau
     Monne.

   - Floppy changes from Takashi, cleaning the device file creation.

   - Switching libata to use the new blk-mq tagging policy, removing
     code (and a suboptimal implementation) from libata.  This will
     throw you a merge conflict, since a bug in the original libata
     tagging code was fixed since this code was branched.  Trivial.
     From Shaohua.

   - Conversion of loop to blk-mq, from Ming Lei.

   - Cleanup of the io_schedule() handling in bsg from Peter Zijlstra.
     He claims it improves on unreadable code, which will cost him a
     beer.

   - Maintainer update or NDB, now handled by Markus Pargmann.

   - NVMe:
        - Optimization from me that avoids a kmalloc/kfree per IO for
          smaller (<= 8KB) IO. This cuts about 1% of high IOPS CPU
          overhead.
        - Removal of (now) dead RCU code, a relic from before NVMe was
          converted to blk-mq"

* 'for-3.20/drivers' of git://git.kernel.dk/linux-block:
  xen-blkback: default to X86_32 ABI on x86
  xen-blkfront: fix accounting of reqs when migrating
  xen-blkback,xen-blkfront: add myself as maintainer
  block: Simplify bsg complete all
  floppy: Avoid manual call of device_create_file()
  NVMe: avoid kmalloc/kfree for smaller IO
  MAINTAINERS: Update NBD maintainer
  libata: make sata_sil24 use fifo tag allocator
  libata: move sas ata tag allocation to libata-scsi.c
  libata: use blk taging
  NVMe: within nvme_free_queues(), delete RCU sychro/deferred free
  null_blk: suppress invalid partition info
  brd: Request from fdisk 4k alignment
  brd: Fix all partitions BUGs
  axonram: Fix bug in direct_access
  loop: add blk-mq.h include
  block: loop: don't handle REQ_FUA explicitly
  block: loop: introduce lo_discard() and lo_req_flush()
  block: loop: say goodby to bio
  block: loop: improve performance via blk-mq
......@@ -6642,9 +6642,10 @@ F: include/uapi/linux/netrom.h
F: net/netrom/
NETWORK BLOCK DEVICE (NBD)
M: Paul Clements <Paul.Clements@steeleye.com>
M: Markus Pargmann <mpa@pengutronix.de>
S: Maintained
L: nbd-general@lists.sourceforge.net
T: git git://git.pengutronix.de/git/mpa/linux-nbd.git
F: Documentation/blockdev/nbd.txt
F: drivers/block/nbd.c
F: include/linux/nbd.h
......@@ -10690,6 +10691,7 @@ F: drivers/pci/*xen*
XEN BLOCK SUBSYSTEM
M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
M: Roger Pau Monné <roger.pau@citrix.com>
L: xen-devel@lists.xenproject.org (moderated for non-subscribers)
S: Supported
F: drivers/block/xen-blkback/*
......
......@@ -147,7 +147,7 @@ axon_ram_direct_access(struct block_device *device, sector_t sector,
loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
*kaddr = (void *)(bank->ph_addr + offset);
*pfn = virt_to_phys(kaddr) >> PAGE_SHIFT;
*pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
return bank->size - offset;
}
......
......@@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
}
static int bsg_io_schedule(struct bsg_device *bd)
{
DEFINE_WAIT(wait);
int ret = 0;
spin_lock_irq(&bd->lock);
BUG_ON(bd->done_cmds > bd->queued_cmds);
/*
* -ENOSPC or -ENODATA? I'm going for -ENODATA, meaning "I have no
* work to do", even though we return -ENOSPC after this same test
* during bsg_write() -- there, it means our buffer can't have more
* bsg_commands added to it, thus has no space left.
*/
if (bd->done_cmds == bd->queued_cmds) {
ret = -ENODATA;
goto unlock;
}
if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
ret = -EAGAIN;
goto unlock;
}
prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&bd->lock);
io_schedule();
finish_wait(&bd->wq_done, &wait);
return ret;
unlock:
spin_unlock_irq(&bd->lock);
return ret;
}
static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
struct sg_io_v4 *hdr, struct bsg_device *bd,
fmode_t has_write_perm)
......@@ -482,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
return ret;
}
static bool bsg_complete(struct bsg_device *bd)
{
bool ret = false;
bool spin;
do {
spin_lock_irq(&bd->lock);
BUG_ON(bd->done_cmds > bd->queued_cmds);
/*
* All commands consumed.
*/
if (bd->done_cmds == bd->queued_cmds)
ret = true;
spin = !test_bit(BSG_F_BLOCK, &bd->flags);
spin_unlock_irq(&bd->lock);
} while (!ret && spin);
return ret;
}
static int bsg_complete_all_commands(struct bsg_device *bd)
{
struct bsg_command *bc;
......@@ -492,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
/*
* wait for all commands to complete
*/
ret = 0;
do {
ret = bsg_io_schedule(bd);
/*
* look for -ENODATA specifically -- we'll sometimes get
* -ERESTARTSYS when we've taken a signal, but we can't
* return until we're done freeing the queue, so ignore
* it. The signal will get handled when we're done freeing
* the bsg_device.
*/
} while (ret != -ENODATA);
io_wait_event(bd->wq_done, bsg_complete(bd));
/*
* discard done commands
......
......@@ -1585,8 +1585,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev,
else
tag = 0;
if (test_and_set_bit(tag, &ap->qc_allocated))
BUG();
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
......@@ -4722,69 +4720,36 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
}
/**
* ata_qc_new - Request an available ATA command, for queueing
* @ap: target port
*
* Some ATA host controllers may implement a queue depth which is less
* than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond
* the hardware limitation.
* ata_qc_new_init - Request an available ATA command, and initialize it
* @dev: Device from whom we request an available command structure
*
* LOCKING:
* None.
*/
static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag)
{
struct ata_queued_cmd *qc = NULL;
unsigned int max_queue = ap->host->n_tags;
unsigned int i, tag;
struct ata_port *ap = dev->link->ap;
struct ata_queued_cmd *qc;
/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
return NULL;
for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) {
if (ap->flags & ATA_FLAG_LOWTAG)
tag = i;
else
tag = tag < max_queue ? tag : 0;
/* the last tag is reserved for internal command. */
if (tag == ATA_TAG_INTERNAL)
continue;
if (!test_and_set_bit(tag, &ap->qc_allocated)) {
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
ap->last_tag = tag;
break;
}
/* libsas case */
if (!ap->scsi_host) {
tag = ata_sas_allocate_tag(ap);
if (tag < 0)
return NULL;
}
return qc;
}
/**
* ata_qc_new_init - Request an available ATA command, and initialize it
* @dev: Device from whom we request an available command structure
*
* LOCKING:
* None.
*/
struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
struct ata_queued_cmd *qc;
qc = ata_qc_new(ap);
if (qc) {
qc->scsicmd = NULL;
qc->ap = ap;
qc->dev = dev;
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
qc->scsicmd = NULL;
qc->ap = ap;
qc->dev = dev;
ata_qc_reinit(qc);
}
ata_qc_reinit(qc);
return qc;
}
......@@ -4811,7 +4776,8 @@ void ata_qc_free(struct ata_queued_cmd *qc)
tag = qc->tag;
if (likely(ata_tag_valid(tag))) {
qc->tag = ATA_TAG_POISON;
clear_bit(tag, &ap->qc_allocated);
if (!ap->scsi_host)
ata_sas_free_tag(tag, ap);
}
}
......
......@@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
{
struct ata_queued_cmd *qc;
qc = ata_qc_new_init(dev);
qc = ata_qc_new_init(dev, cmd->request->tag);
if (qc) {
qc->scsicmd = cmd;
qc->scsidone = cmd->scsi_done;
......@@ -3668,6 +3668,9 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht)
*/
shost->max_host_blocked = 1;
if (scsi_init_shared_tag_map(shost, host->n_tags))
goto err_add;
rc = scsi_add_host_with_dma(ap->scsi_host,
&ap->tdev, ap->host->dev);
if (rc)
......@@ -4230,3 +4233,31 @@ int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap)
return rc;
}
EXPORT_SYMBOL_GPL(ata_sas_queuecmd);
int ata_sas_allocate_tag(struct ata_port *ap)
{
unsigned int max_queue = ap->host->n_tags;
unsigned int i, tag;
for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) {
if (ap->flags & ATA_FLAG_LOWTAG)
tag = 1;
else
tag = tag < max_queue ? tag : 0;
/* the last tag is reserved for internal command. */
if (tag == ATA_TAG_INTERNAL)
continue;
if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) {
ap->sas_last_tag = tag;
return tag;
}
}
return -1;
}
void ata_sas_free_tag(unsigned int tag, struct ata_port *ap)
{
clear_bit(tag, &ap->sas_tag_allocated);
}
......@@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev);
extern void ata_force_cbl(struct ata_port *ap);
extern u64 ata_tf_to_lba(const struct ata_taskfile *tf);
extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf);
extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev);
extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag);
extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
u64 block, u32 n_block, unsigned int tf_flags,
unsigned int tag);
......@@ -144,6 +144,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work);
extern int ata_bus_probe(struct ata_port *ap);
extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
unsigned int id, u64 lun);
int ata_sas_allocate_tag(struct ata_port *ap);
void ata_sas_free_tag(unsigned int tag, struct ata_port *ap);
/* libata-eh.c */
......
......@@ -388,6 +388,7 @@ static struct scsi_host_template sil24_sht = {
.can_queue = SIL24_MAX_CMDS,
.sg_tablesize = SIL24_MAX_SGE,
.dma_boundary = ATA_DMA_BOUNDARY,
.tag_alloc_policy = BLK_TAG_ALLOC_FIFO,
};
static struct ata_port_operations sil24_ops = {
......
......@@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = {
/*
* And now the modules code and kernel interface.
*/
static int rd_nr;
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
static int max_part;
static int part_shift;
static int part_show = 0;
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, int, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
static int max_part = 1;
module_param(max_part, int, S_IRUGO);
MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
module_param(part_show, int, S_IRUGO);
MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions");
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");
......@@ -487,25 +486,33 @@ static struct brd_device *brd_alloc(int i)
brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
if (!brd->brd_queue)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
/* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN
* (This is only a problem on very small devices <= 4M,
* otherwise fdisk will align on 1M. Regardless this call
* is harmless)
*/
blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
brd->brd_queue->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
disk = brd->brd_disk = alloc_disk(1 << part_shift);
disk = brd->brd_disk = alloc_disk(max_part);
if (!disk)
goto out_free_queue;
disk->major = RAMDISK_MAJOR;
disk->first_minor = i << part_shift;
disk->first_minor = i * max_part;
disk->fops = &brd_fops;
disk->private_data = brd;
disk->queue = brd->brd_queue;
if (!part_show)
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "ram%d", i);
set_capacity(disk, rd_size * 2);
......@@ -527,10 +534,11 @@ static void brd_free(struct brd_device *brd)
kfree(brd);
}
static struct brd_device *brd_init_one(int i)
static struct brd_device *brd_init_one(int i, bool *new)
{
struct brd_device *brd;
*new = false;
list_for_each_entry(brd, &brd_devices, brd_list) {
if (brd->brd_number == i)
goto out;
......@@ -541,6 +549,7 @@ static struct brd_device *brd_init_one(int i)
add_disk(brd->brd_disk);
list_add_tail(&brd->brd_list, &brd_devices);
}
*new = true;
out:
return brd;
}
......@@ -556,70 +565,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
struct brd_device *brd;
struct kobject *kobj;
bool new;
mutex_lock(&brd_devices_mutex);
brd = brd_init_one(MINOR(dev) >> part_shift);
brd = brd_init_one(MINOR(dev) / max_part, &new);
kobj = brd ? get_disk(brd->brd_disk) : NULL;
mutex_unlock(&brd_devices_mutex);
*part = 0;
if (new)
*part = 0;
return kobj;
}
static int __init brd_init(void)
{
int i, nr;
unsigned long range;
struct brd_device *brd, *next;
int i;
/*
* brd module now has a feature to instantiate underlying device
* structure on-demand, provided that there is an access dev node.
* However, this will not work well with user space tool that doesn't
* know about such "feature". In order to not break any existing
* tool, we do the following:
*
* (1) if rd_nr is specified, create that many upfront, and this
* also becomes a hard limit.
* (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
* (default 16) rd device on module load, user can further
* extend brd device by create dev node themselves and have
* kernel automatically instantiate actual device on-demand.
* (1) if rd_nr is specified, create that many upfront. else
* it defaults to CONFIG_BLK_DEV_RAM_COUNT
* (2) User can further extend brd devices by create dev node themselves
* and have kernel automatically instantiate actual device
* on-demand. Example:
* mknod /path/devnod_name b 1 X # 1 is the rd major
* fdisk -l /path/devnod_name
* If (X / max_part) was not already created it will be created
* dynamically.
*/
part_shift = 0;
if (max_part > 0) {
part_shift = fls(max_part);
/*
* Adjust max_part according to part_shift as it is exported
* to user space so that user can decide correct minor number
* if [s]he want to create more devices.
*
* Note that -1 is required because partition 0 is reserved
* for the whole disk.
*/
max_part = (1UL << part_shift) - 1;
}
if ((1UL << part_shift) > DISK_MAX_PARTS)
return -EINVAL;
if (rd_nr > 1UL << (MINORBITS - part_shift))
return -EINVAL;
if (rd_nr) {
nr = rd_nr;
range = rd_nr << part_shift;
} else {
nr = CONFIG_BLK_DEV_RAM_COUNT;
range = 1UL << MINORBITS;
}
if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
return -EIO;
for (i = 0; i < nr; i++) {
if (unlikely(!max_part))
max_part = 1;
for (i = 0; i < rd_nr; i++) {
brd = brd_alloc(i);
if (!brd)
goto out_free;
......@@ -631,10 +616,10 @@ static int __init brd_init(void)
list_for_each_entry(brd, &brd_devices, brd_list)
add_disk(brd->brd_disk);
blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
THIS_MODULE, brd_probe, NULL, NULL);
printk(KERN_INFO "brd: module loaded\n");
pr_info("brd: module loaded\n");
return 0;
out_free:
......@@ -644,21 +629,21 @@ static int __init brd_init(void)
}
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
pr_info("brd: module NOT loaded !!!\n");
return -ENOMEM;
}
static void __exit brd_exit(void)
{
unsigned long range;
struct brd_device *brd, *next;
range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;
list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
brd_del_one(brd);
blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
pr_info("brd: module unloaded\n");
}
module_init(brd_init);
......
......@@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev,
static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
static struct attribute *floppy_dev_attrs[] = {
&dev_attr_cmos.attr,
NULL
};
ATTRIBUTE_GROUPS(floppy_dev);
static void floppy_device_release(struct device *dev)
{
}
......@@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void)
floppy_device[drive].name = floppy_device_name;
floppy_device[drive].id = drive;
floppy_device[drive].dev.release = floppy_device_release;
floppy_device[drive].dev.groups = floppy_dev_groups;
err = platform_device_register(&floppy_device[drive]);
if (err)
goto out_remove_drives;
err = device_create_file(&floppy_device[drive].dev,
&dev_attr_cmos);
if (err)
goto out_unreg_platform_dev;
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
......@@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void)
return 0;
out_unreg_platform_dev:
platform_device_unregister(&floppy_device[drive]);
out_remove_drives:
while (drive--) {
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
}
......@@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void)
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
blk_cleanup_queue(disks[drive]->queue);
......
......@@ -85,6 +85,8 @@ static DEFINE_MUTEX(loop_index_mutex);
static int max_part;
static int part_shift;
static struct workqueue_struct *loop_wq;
/*
* Transfer functions
*/
......@@ -284,12 +286,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
return ret;
}
static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos)
{
int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t,
struct page *page);
struct bio_vec bvec;
struct bvec_iter iter;
struct req_iterator iter;
struct page *page = NULL;
int ret = 0;
......@@ -303,7 +305,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
do_lo_send = do_lo_send_direct_write;
}
bio_for_each_segment(bvec, bio, iter) {
rq_for_each_segment(bvec, rq, iter) {
ret = do_lo_send(lo, &bvec, pos, page);
if (ret < 0)
break;
......@@ -391,19 +393,22 @@ do_lo_receive(struct loop_device *lo,
}
static int
lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos)
{
struct bio_vec bvec;
struct bvec_iter iter;
struct req_iterator iter;
ssize_t s;
bio_for_each_segment(bvec, bio, iter) {
rq_for_each_segment(bvec, rq, iter) {
s = do_lo_receive(lo, &bvec, bsize, pos);
if (s < 0)
return s;
if (s != bvec.bv_len) {
zero_fill_bio(bio);
struct bio *bio;
__rq_for_each_bio(bio, rq)
zero_fill_bio(bio);
break;
}
pos += bvec.bv_len;
......@@ -411,106 +416,58 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
return 0;
}
static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
{
loff_t pos;
/*
* We use punch hole to reclaim the free space used by the
* image a.k.a. discard. However we do not support discard if
* encryption is enabled, because it may give an attacker
* useful information.
*/
struct file *file = lo->lo_backing_file;
int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
int ret;
pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset;
if (bio_rw(bio) == WRITE) {
struct file *file = lo->lo_backing_file;
if (bio->bi_rw & REQ_FLUSH) {
ret = vfs_fsync(file, 0);
if (unlikely(ret && ret != -EINVAL)) {
ret = -EIO;
goto out;
}
}
/*
* We use punch hole to reclaim the free space used by the
* image a.k.a. discard. However we do not support discard if
* encryption is enabled, because it may give an attacker
* useful information.
*/
if (bio->bi_rw & REQ_DISCARD) {
struct file *file = lo->lo_backing_file;
int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
if ((!file->f_op->fallocate) ||
lo->lo_encrypt_key_size) {
ret = -EOPNOTSUPP;
goto out;
}
ret = file->f_op->fallocate(file, mode, pos,
bio->bi_iter.bi_size);
if (unlikely(ret && ret != -EINVAL &&
ret != -EOPNOTSUPP))
ret = -EIO;
goto out;
}
ret = lo_send(lo, bio, pos);
if ((bio->bi_rw & REQ_FUA) && !ret) {
ret = vfs_fsync(file, 0);
if (unlikely(ret && ret != -EINVAL))
ret = -EIO;
}
} else
ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
ret = -EOPNOTSUPP;
goto out;
}
out:
ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
ret = -EIO;
out:
return ret;
}
/*
* Add bio to back of pending list
*/
static void loop_add_bio(struct loop_device *lo, struct bio *bio)
static int lo_req_flush(struct loop_device *lo, struct request *rq)
{
lo->lo_bio_count++;
bio_list_add(&lo->lo_bio_list, bio);
}
struct file *file = lo->lo_backing_file;
int ret = vfs_fsync(file, 0);
if (unlikely(ret && ret != -EINVAL))
ret = -EIO;
/*
* Grab first pending buffer
*/
static struct bio *loop_get_bio(struct loop_device *lo)
{
lo->lo_bio_count--;
return bio_list_pop(&lo->lo_bio_list);
return ret;
}
static void loop_make_request(struct request_queue *q, struct bio *old_bio)
static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
struct loop_device *lo = q->queuedata;
int rw = bio_rw(old_bio);
if (rw == READA)
rw = READ;
loff_t pos;
int ret;
BUG_ON(!lo || (rw != READ && rw != WRITE));
pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
spin_lock_irq(&lo->lo_lock);
if (lo->lo_state != Lo_bound)
goto out;
if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
goto out;
if (lo->lo_bio_count >= q->nr_congestion_on)
wait_event_lock_irq(lo->lo_req_wait,
lo->lo_bio_count < q->nr_congestion_off,
lo->lo_lock);
loop_add_bio(lo, old_bio);
wake_up(&lo->lo_event);
spin_unlock_irq(&lo->lo_lock);
return;
if (rq->cmd_flags & REQ_WRITE) {
if (rq->cmd_flags & REQ_FLUSH)
ret = lo_req_flush(lo, rq);
else if (rq->cmd_flags & REQ_DISCARD)
ret = lo_discard(lo, rq, pos);
else
ret = lo_send(lo, rq, pos);
} else
ret = lo_receive(lo, rq, lo->lo_blocksize, pos);
out:
spin_unlock_irq(&lo->lo_lock);
bio_io_error(old_bio);
return ret;
}
struct switch_request {
......@@ -518,57 +475,26 @@ struct switch_request {
struct completion wait;
};
static void do_loop_switch(struct loop_device *, struct switch_request *);
static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
{
if (unlikely(!bio->bi_bdev)) {
do_loop_switch(lo, bio->bi_private);
bio_put(bio);
} else {
int ret = do_bio_filebacked(lo, bio);
bio_endio(bio, ret);
}
}
/*
* worker thread that handles reads/writes to file backed loop devices,
* to avoid blocking in our make_request_fn. it also does loop decrypting
* on reads for block backed loop, as that is too heavy to do from
* b_end_io context where irqs may be disabled.
*
* Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before
* calling kthread_stop(). Therefore once kthread_should_stop() is
* true, make_request will not place any more requests. Therefore
* once kthread_should_stop() is true and lo_bio is NULL, we are
* done with the loop.
* Do the actual switch; called from the BIO completion routine
*/
static int loop_thread(void *data)
static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
{
struct loop_device *lo = data;
struct bio *bio;
set_user_nice(current, MIN_NICE);
while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
wait_event_interruptible(lo->lo_event,
!bio_list_empty(&lo->lo_bio_list) ||
kthread_should_stop());
if (bio_list_empty(&lo->lo_bio_list))
continue;
spin_lock_irq(&lo->lo_lock);
bio = loop_get_bio(lo);
if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
wake_up(&lo->lo_req_wait);
spin_unlock_irq(&lo->lo_lock);
struct file *file = p->file;
struct file *old_file = lo->lo_backing_file;
struct address_space *mapping;
BUG_ON(!bio);
loop_handle_bio(lo, bio);
}
/* if no new file, only flush of queued bios requested */
if (!file)
return;
return 0;
mapping = file->f_mapping;
mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
lo->lo_backing_file = file;
lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
}
/*
......@@ -579,15 +505,18 @@ static int loop_thread(void *data)
static int loop_switch(struct loop_device *lo, struct file *file)
{
struct switch_request w;
struct bio *bio = bio_alloc(GFP_KERNEL, 0);
if (!bio)
return -ENOMEM;
init_completion(&w.wait);
w.file = file;
bio->bi_private = &w;
bio->bi_bdev = NULL;
loop_make_request(lo->lo_queue, bio);
wait_for_completion(&w.wait);
/* freeze queue and wait for completion of scheduled requests */
blk_mq_freeze_queue(lo->lo_queue);
/* do the switch action */
do_loop_switch(lo, &w);
/* unfreeze */
blk_mq_unfreeze_queue(lo->lo_queue);
return 0;
}
......@@ -596,38 +525,9 @@ static int loop_switch(struct loop_device *lo, struct file *file)
*/
static int loop_flush(struct loop_device *lo)
{
/* loop not yet configured, no running thread, nothing to flush */
if (!lo->lo_thread)
return 0;
return loop_switch(lo, NULL);
}
/*
* Do the actual switch; called from the BIO completion routine
*/
static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
{
struct file *file = p->file;
struct file *old_file = lo->lo_backing_file;
struct address_space *mapping;
/* if no new file, only flush of queued bios requested */
if (!file)
goto out;
mapping = file->f_mapping;
mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
lo->lo_backing_file = file;
lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
out:
complete(&p->wait);
}
/*
* loop_change_fd switched the backing store of a loopback device to
* a new file. This is useful for operating system installers to free up
......@@ -889,12 +789,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
lo->transfer = transfer_none;
lo->ioctl = NULL;
lo->lo_sizelimit = 0;
lo->lo_bio_count = 0;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
bio_list_init(&lo->lo_bio_list);
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
blk_queue_flush(lo->lo_queue, REQ_FLUSH);
......@@ -906,14 +803,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
set_blocksize(bdev, lo_blocksize);
lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
lo->lo_number);
if (IS_ERR(lo->lo_thread)) {
error = PTR_ERR(lo->lo_thread);
goto out_clr;
}
lo->lo_state = Lo_bound;
wake_up_process(lo->lo_thread);
if (part_shift)
lo->lo_flags |= LO_FLAGS_PARTSCAN;
if (lo->lo_flags & LO_FLAGS_PARTSCAN)
......@@ -925,18 +815,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
bdgrab(bdev);
return 0;
out_clr:
loop_sysfs_exit(lo);
lo->lo_thread = NULL;
lo->lo_device = NULL;
lo->lo_backing_file = NULL;
lo->lo_flags = 0;
set_capacity(lo->lo_disk, 0);
invalidate_bdev(bdev);
bd_set_size(bdev, 0);
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
lo->lo_state = Lo_unbound;
out_putf:
fput(file);
out:
......@@ -1012,11 +890,6 @@ static int loop_clr_fd(struct loop_device *lo)
spin_lock_irq(&lo->lo_lock);
lo->lo_state = Lo_rundown;
spin_unlock_irq(&lo->lo_lock);
kthread_stop(lo->lo_thread);
spin_lock_irq(&lo->lo_lock);
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
......@@ -1028,7 +901,6 @@ static int loop_clr_fd(struct loop_device *lo)
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
lo->lo_encrypt_key_size = 0;
lo->lo_thread = NULL;
memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
......@@ -1601,6 +1473,105 @@ int loop_unregister_transfer(int number)
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);
static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
blk_mq_start_request(bd->rq);
if (cmd->rq->cmd_flags & REQ_WRITE) {
struct loop_device *lo = cmd->rq->q->queuedata;
bool need_sched = true;
spin_lock_irq(&lo->lo_lock);
if (lo->write_started)
need_sched = false;
else
lo->write_started = true;
list_add_tail(&cmd->list, &lo->write_cmd_head);
spin_unlock_irq(&lo->lo_lock);
if (need_sched)
queue_work(loop_wq, &lo->write_work);
} else {
queue_work(loop_wq, &cmd->read_work);
}
return BLK_MQ_RQ_QUEUE_OK;
}
static void loop_handle_cmd(struct loop_cmd *cmd)
{
const bool write = cmd->rq->cmd_flags & REQ_WRITE;
struct loop_device *lo = cmd->rq->q->queuedata;
int ret = -EIO;
if (lo->lo_state != Lo_bound)
goto failed;
if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
goto failed;
ret = do_req_filebacked(lo, cmd->rq);
failed:
if (ret)
cmd->rq->errors = -EIO;
blk_mq_complete_request(cmd->rq);
}
static void loop_queue_write_work(struct work_struct *work)
{
struct loop_device *lo =
container_of(work, struct loop_device, write_work);
LIST_HEAD(cmd_list);
spin_lock_irq(&lo->lo_lock);
repeat:
list_splice_init(&lo->write_cmd_head, &cmd_list);
spin_unlock_irq(&lo->lo_lock);
while (!list_empty(&cmd_list)) {
struct loop_cmd *cmd = list_first_entry(&cmd_list,
struct loop_cmd, list);
list_del_init(&cmd->list);
loop_handle_cmd(cmd);
}
spin_lock_irq(&lo->lo_lock);
if (!list_empty(&lo->write_cmd_head))
goto repeat;
lo->write_started = false;
spin_unlock_irq(&lo->lo_lock);
}
static void loop_queue_read_work(struct work_struct *work)
{
struct loop_cmd *cmd =
container_of(work, struct loop_cmd, read_work);
loop_handle_cmd(cmd);
}
static int loop_init_request(void *data, struct request *rq,
unsigned int hctx_idx, unsigned int request_idx,
unsigned int numa_node)
{
struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
cmd->rq = rq;
INIT_WORK(&cmd->read_work, loop_queue_read_work);
return 0;
}
static struct blk_mq_ops loop_mq_ops = {
.queue_rq = loop_queue_rq,
.map_queue = blk_mq_map_queue,
.init_request = loop_init_request,
};
static int loop_add(struct loop_device **l, int i)
{
struct loop_device *lo;
......@@ -1627,16 +1598,28 @@ static int loop_add(struct loop_device **l, int i)
i = err;
err = -ENOMEM;
lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
if (!lo->lo_queue)
lo->tag_set.ops = &loop_mq_ops;
lo->tag_set.nr_hw_queues = 1;
lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
lo->tag_set.driver_data = lo;
err = blk_mq_alloc_tag_set(&lo->tag_set);
if (err)
goto out_free_idr;
/*
* set queue make_request_fn
*/
blk_queue_make_request(lo->lo_queue, loop_make_request);
lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
if (IS_ERR_OR_NULL(lo->lo_queue)) {
err = PTR_ERR(lo->lo_queue);
goto out_cleanup_tags;
}
lo->lo_queue->queuedata = lo;
INIT_LIST_HEAD(&lo->write_cmd_head);
INIT_WORK(&lo->write_work, loop_queue_write_work);
disk = lo->lo_disk = alloc_disk(1 << part_shift);
if (!disk)
goto out_free_queue;
......@@ -1664,9 +1647,6 @@ static int loop_add(struct loop_device **l, int i)
disk->flags |= GENHD_FL_EXT_DEVT;
mutex_init(&lo->lo_ctl_mutex);
lo->lo_number = i;
lo->lo_thread = NULL;
init_waitqueue_head(&lo->lo_event);
init_waitqueue_head(&lo->lo_req_wait);
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i << part_shift;
......@@ -1680,6 +1660,8 @@ static int loop_add(struct loop_device **l, int i)
out_free_queue:
blk_cleanup_queue(lo->lo_queue);
out_cleanup_tags:
blk_mq_free_tag_set(&lo->tag_set);
out_free_idr:
idr_remove(&loop_index_idr, i);
out_free_dev:
......@@ -1692,6 +1674,7 @@ static void loop_remove(struct loop_device *lo)
{
del_gendisk(lo->lo_disk);
blk_cleanup_queue(lo->lo_queue);
blk_mq_free_tag_set(&lo->tag_set);
put_disk(lo->lo_disk);
kfree(lo);
}
......@@ -1875,6 +1858,13 @@ static int __init loop_init(void)
goto misc_out;
}
loop_wq = alloc_workqueue("kloopd",
WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
if (!loop_wq) {
err = -ENOMEM;
goto misc_out;
}
blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
THIS_MODULE, loop_probe, NULL, NULL);
......@@ -1912,6 +1902,8 @@ static void __exit loop_exit(void)
blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
unregister_blkdev(LOOP_MAJOR, "loop");
destroy_workqueue(loop_wq);
misc_deregister(&loop_misc);
}
......
......@@ -11,8 +11,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <uapi/linux/loop.h>
/* Possible states of device */
......@@ -52,19 +54,23 @@ struct loop_device {
gfp_t old_gfp_mask;
spinlock_t lo_lock;
struct bio_list lo_bio_list;
unsigned int lo_bio_count;
struct list_head write_cmd_head;
struct work_struct write_work;
bool write_started;
int lo_state;
struct mutex lo_ctl_mutex;
struct task_struct *lo_thread;
wait_queue_head_t lo_event;
/* wait queue for incoming requests */
wait_queue_head_t lo_req_wait;
struct request_queue *lo_queue;
struct blk_mq_tag_set tag_set;
struct gendisk *lo_disk;
};
struct loop_cmd {
struct work_struct read_work;
struct request *rq;
struct list_head list;
};
/* Support for loadable transfer modules */
struct loop_func_table {
int number; /* filter type */
......
......@@ -579,7 +579,7 @@ static int null_add_dev(void)
sector_div(size, bs);
set_capacity(disk, size);
disk->flags |= GENHD_FL_EXT_DEVT;
disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->major = null_major;
disk->first_minor = nullb->index;
disk->fops = &null_fops;
......
......@@ -144,8 +144,37 @@ struct nvme_cmd_info {
void *ctx;
int aborted;
struct nvme_queue *nvmeq;
struct nvme_iod iod[0];
};
/*
* Max size of iod being embedded in the request payload
*/
#define NVME_INT_PAGES 2
#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size)
/*
* Will slightly overestimate the number of pages needed. This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
*/
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}
static unsigned int nvme_cmd_size(struct nvme_dev *dev)
{
unsigned int ret = sizeof(struct nvme_cmd_info);
ret += sizeof(struct nvme_iod);
ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
return ret;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
......@@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
}
static void *iod_get_private(struct nvme_iod *iod)
{
return (void *) (iod->private & ~0x1UL);
}
/*
* If bit 0 is set, the iod is embedded in the request payload.
*/
static bool iod_should_kfree(struct nvme_iod *iod)
{
return (iod->private & 0x01) == 0;
}
/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
......@@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
return ((void *)iod) + iod->offset;
}
/*
* Will slightly overestimate the number of pages needed. This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
*/
static int nvme_npages(unsigned size, struct nvme_dev *dev)
static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
unsigned nseg, unsigned long private)
{
unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
iod->private = private;
iod->offset = offsetof(struct nvme_iod, sg[nseg]);
iod->npages = -1;
iod->length = nbytes;
iod->nents = 0;
}
static struct nvme_iod *
nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
unsigned long priv, gfp_t gfp)
{
struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
sizeof(__le64 *) * nvme_npages(nbytes, dev) +
sizeof(__le64 *) * nvme_npages(bytes, dev) +
sizeof(struct scatterlist) * nseg, gfp);
if (iod) {
iod->offset = offsetof(struct nvme_iod, sg[nseg]);
iod->npages = -1;
iod->length = nbytes;
iod->nents = 0;
iod->first_dma = 0ULL;
}
if (iod)
iod_init(iod, bytes, nseg, priv);
return iod;
}
static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
gfp_t gfp)
{
unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
sizeof(struct nvme_dsm_range);
unsigned long mask = 0;
struct nvme_iod *iod;
if (rq->nr_phys_segments <= NVME_INT_PAGES &&
size <= NVME_INT_BYTES(dev)) {
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
iod = cmd->iod;
mask = 0x01;
iod_init(iod, size, rq->nr_phys_segments,
(unsigned long) rq | 0x01);
return iod;
}
return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
(unsigned long) rq, gfp);
}
void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
const int last_prp = dev->page_size / 8 - 1;
......@@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
prp_dma = next_prp_dma;
}
kfree(iod);
if (iod_should_kfree(iod))
kfree(iod);
}
static int nvme_error_status(u16 status)
......@@ -424,7 +486,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct nvme_iod *iod = ctx;
struct request *req = iod->private;
struct request *req = iod_get_private(iod);
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
u16 status = le16_to_cpup(&cqe->status) >> 1;
......@@ -585,7 +647,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
struct nvme_ns *ns)
{
struct request *req = iod->private;
struct request *req = iod_get_private(iod);
struct nvme_command *cmnd;
u16 control = 0;
u32 dsmgmt = 0;
......@@ -626,17 +688,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *req = bd->rq;
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
struct nvme_iod *iod;
int psegs = req->nr_phys_segments;
enum dma_data_direction dma_dir;
unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
sizeof(struct nvme_dsm_range);
iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
if (!iod)
return BLK_MQ_RQ_QUEUE_BUSY;
iod->private = req;
if (req->cmd_flags & REQ_DISCARD) {
void *range;
/*
......@@ -651,10 +708,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto retry_cmd;
iod_list(iod)[0] = (__le64 *)range;
iod->npages = 0;
} else if (psegs) {
} else if (req->nr_phys_segments) {
dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
sg_init_table(iod->sg, psegs);
sg_init_table(iod->sg, req->nr_phys_segments);
iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
if (!iod->nents)
goto error_cmd;
......@@ -1137,21 +1194,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
LLIST_HEAD(q_list);
struct nvme_queue *nvmeq, *next;
struct llist_node *entry;
int i;
for (i = dev->queue_count - 1; i >= lowest; i--) {
struct nvme_queue *nvmeq = dev->queues[i];
llist_add(&nvmeq->node, &q_list);
dev->queue_count--;
dev->queues[i] = NULL;
}
synchronize_rcu();
entry = llist_del_all(&q_list);
llist_for_each_entry_safe(nvmeq, next, entry, node)
nvme_free_queue(nvmeq);
}
}
/**
......@@ -1408,7 +1458,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
dev->admin_tagset.driver_data = dev;
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
......@@ -1522,7 +1572,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
}
err = -ENOMEM;
iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
if (!iod)
goto put_pages;
......@@ -2148,7 +2198,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
dev->tagset.cmd_size = nvme_cmd_size(dev);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
......
......@@ -214,6 +214,15 @@ enum blkif_protocol {
BLKIF_PROTOCOL_X86_64 = 3,
};
/*
* Default protocol if the frontend doesn't specify one.
*/
#ifdef CONFIG_X86
# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
#else
# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
#endif
struct xen_vbd {
/* What the domain refers to this vbd as. */
blkif_vdev_t handle;
......
......@@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be)
return err;
}
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
"%63s", protocol, NULL);
if (err)
strcpy(protocol, "unspecified, assuming native");
strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
......
......@@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info)
merge_bio.tail = copy[i].request->biotail;
bio_list_merge(&bio_list, &merge_bio);
copy[i].request->bio = NULL;
blk_put_request(copy[i].request);
blk_end_request_all(copy[i].request, 0);
}
kfree(copy);
......@@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info)
req->bio = NULL;
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
pr_alert("diskcache flush request found!\n");
__blk_put_request(info->rq, req);
__blk_end_request_all(req, 0);
}
spin_unlock_irq(&info->io_lock);
......
......@@ -823,10 +823,10 @@ struct ata_port {
unsigned int cbl; /* cable type; ATA_CBL_xxx */
struct ata_queued_cmd qcmd[ATA_MAX_QUEUE];
unsigned long qc_allocated;
unsigned long sas_tag_allocated; /* for sas tag allocation only */
unsigned int qc_active;
int nr_active_links; /* #links with active qcs */
unsigned int last_tag; /* track next tag hw expects */
unsigned int sas_last_tag; /* track next tag hw expects */
struct ata_link link; /* host default link */
struct ata_link *slave_link; /* see ata_slave_link_init() */
......@@ -1352,6 +1352,7 @@ extern struct device_attribute *ata_common_sdev_attrs[];
.ioctl = ata_scsi_ioctl, \
.queuecommand = ata_scsi_queuecmd, \
.can_queue = ATA_DEF_QUEUE, \
.tag_alloc_policy = BLK_TAG_ALLOC_RR, \
.this_id = ATA_SHT_THIS_ID, \
.cmd_per_lun = ATA_SHT_CMD_PER_LUN, \
.emulated = ATA_SHT_EMULATED, \
......
......@@ -132,13 +132,12 @@ struct nvme_ns {
* allocated to store the PRP list.
*/
struct nvme_iod {
void *private; /* For the use of the submitter of the I/O */
unsigned long private; /* For the use of the submitter of the I/O */
int npages; /* In the PRP list. 0 means small pool in use */
int offset; /* Of PRP list */
int nents; /* Used in scatterlist */
int length; /* Of data, in bytes */
dma_addr_t first_dma;
struct list_head node;
struct scatterlist sg[0];
};
......
......@@ -267,6 +267,21 @@ do { \
__wait_event(wq, condition); \
} while (0)
#define __io_wait_event(wq, condition) \
(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
io_schedule())
/*
* io_wait_event() -- like wait_event() but with io_schedule()
*/
#define io_wait_event(wq, condition) \
do { \
might_sleep(); \
if (condition) \
break; \
__io_wait_event(wq, condition); \
} while (0)
#define __wait_event_freezable(wq, condition) \
___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
schedule(); try_to_freeze())
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册