提交 4a319a49 编写于 作者: L Linus Torvalds

Merge branch 'for-3.17/core' of git://git.kernel.dk/linux-block

Pull block core bits from Jens Axboe:
 "Small round this time, after the massive blk-mq dump for 3.16.  This
  pull request contains:

   - Fixes for max_sectors overflow in ioctls from Akinobu Mita.

   - Partition off-by-one bug fix in aix partitions from Dan Carpenter.

   - Various small partition cleanups from Fabian Frederick.

   - Fix for the block integrity code sometimes returning the wrong
     vector count from Gu Zheng.

   - Cleanup and re-org of the blk-mq queue enter/exit percpu counters
     from Tejun.  Dependent on the percpu pull for 3.17 (which was in
     the block tree too), that you have already pulled in.

   - A blkcg oops fix, also from Tejun"

* 'for-3.17/core' of git://git.kernel.dk/linux-block:
  partitions: aix.c: off by one bug
  blkcg: don't call into policy draining if root_blkg is already gone
  Revert "bio: modify __bio_add_page() to accept pages that don't start a new segment"
  bio: modify __bio_add_page() to accept pages that don't start a new segment
  block: fix SG_[GS]ET_RESERVED_SIZE ioctl when max_sectors is huge
  block: fix BLKSECTGET ioctl when max_sectors is greater than USHRT_MAX
  block/partitions/efi.c: kerneldoc fixing
  block/partitions/msdos.c: code clean-up
  block/partitions/amiga.c: replace nolevel printk by pr_err
  block/partitions/aix.c: replace count*size kzalloc by kcalloc
  bio-integrity: add "bip_max_vcnt" into struct bio_integrity_payload
  blk-mq: use percpu_ref for mq usage count
  blk-mq: collapse __blk_mq_drain_queue() into blk_mq_freeze_queue()
  blk-mq: decouble blk-mq freezing from generic bypassing
  block, blk-mq: draining can't be skipped even if bypass_depth was non-zero
  blk-mq: fix a memory ordering bug in blk_mq_queue_enter()
......@@ -70,8 +70,10 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
bs->bvec_integrity_pool);
if (!bip->bip_vec)
goto err;
bip->bip_max_vcnt = bvec_nr_vecs(idx);
} else {
bip->bip_vec = bip->bip_inline_vecs;
bip->bip_max_vcnt = inline_vecs;
}
bip->bip_slab = idx;
......@@ -114,14 +116,6 @@ void bio_integrity_free(struct bio *bio)
}
EXPORT_SYMBOL(bio_integrity_free);
static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
{
if (bip->bip_slab == BIO_POOL_NONE)
return BIP_INLINE_VECS;
return bvec_nr_vecs(bip->bip_slab);
}
/**
* bio_integrity_add_page - Attach integrity metadata
* @bio: bio to update
......@@ -137,7 +131,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
struct bio_integrity_payload *bip = bio->bi_integrity;
struct bio_vec *iv;
if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
if (bip->bip_vcnt >= bip->bip_max_vcnt) {
printk(KERN_ERR "%s: bip_vec full\n", __func__);
return 0;
}
......
......@@ -438,14 +438,17 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
*/
void blk_queue_bypass_start(struct request_queue *q)
{
bool drain;
spin_lock_irq(q->queue_lock);
drain = !q->bypass_depth++;
q->bypass_depth++;
queue_flag_set(QUEUE_FLAG_BYPASS, q);
spin_unlock_irq(q->queue_lock);
if (drain) {
/*
* Queues start drained. Skip actual draining till init is
* complete. This avoids lengthy delays during queue init which
* can happen many times during boot.
*/
if (blk_queue_init_done(q)) {
spin_lock_irq(q->queue_lock);
__blk_drain_queue(q, false);
spin_unlock_irq(q->queue_lock);
......@@ -511,7 +514,7 @@ void blk_cleanup_queue(struct request_queue *q)
* prevent that q->request_fn() gets invoked after draining finished.
*/
if (q->mq_ops) {
blk_mq_drain_queue(q);
blk_mq_freeze_queue(q);
spin_lock_irq(lock);
} else {
spin_lock_irq(lock);
......
......@@ -78,68 +78,47 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
static int blk_mq_queue_enter(struct request_queue *q)
{
int ret;
__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
smp_wmb();
/* we have problems freezing the queue if it's initializing */
if (!blk_queue_dying(q) &&
(!blk_queue_bypass(q) || !blk_queue_init_done(q)))
return 0;
__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
while (true) {
int ret;
spin_lock_irq(q->queue_lock);
ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
!blk_queue_bypass(q) || blk_queue_dying(q),
*q->queue_lock);
/* inc usage with lock hold to avoid freeze_queue runs here */
if (!ret && !blk_queue_dying(q))
__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
else if (blk_queue_dying(q))
ret = -ENODEV;
spin_unlock_irq(q->queue_lock);
if (percpu_ref_tryget_live(&q->mq_usage_counter))
return 0;
return ret;
ret = wait_event_interruptible(q->mq_freeze_wq,
!q->mq_freeze_depth || blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
if (ret)
return ret;
}
}
static void blk_mq_queue_exit(struct request_queue *q)
{
__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
percpu_ref_put(&q->mq_usage_counter);
}
void blk_mq_drain_queue(struct request_queue *q)
static void blk_mq_usage_counter_release(struct percpu_ref *ref)
{
while (true) {
s64 count;
spin_lock_irq(q->queue_lock);
count = percpu_counter_sum(&q->mq_usage_counter);
spin_unlock_irq(q->queue_lock);
struct request_queue *q =
container_of(ref, struct request_queue, mq_usage_counter);
if (count == 0)
break;
blk_mq_start_hw_queues(q);
msleep(10);
}
wake_up_all(&q->mq_freeze_wq);
}
/*
* Guarantee no request is in use, so we can change any data structure of
* the queue afterward.
*/
static void blk_mq_freeze_queue(struct request_queue *q)
void blk_mq_freeze_queue(struct request_queue *q)
{
bool drain;
spin_lock_irq(q->queue_lock);
drain = !q->bypass_depth++;
queue_flag_set(QUEUE_FLAG_BYPASS, q);
q->mq_freeze_depth++;
spin_unlock_irq(q->queue_lock);
if (drain)
blk_mq_drain_queue(q);
percpu_ref_kill(&q->mq_usage_counter);
blk_mq_run_queues(q, false);
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
}
static void blk_mq_unfreeze_queue(struct request_queue *q)
......@@ -147,14 +126,13 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
bool wake = false;
spin_lock_irq(q->queue_lock);
if (!--q->bypass_depth) {
queue_flag_clear(QUEUE_FLAG_BYPASS, q);
wake = true;
}
WARN_ON_ONCE(q->bypass_depth < 0);
wake = !--q->mq_freeze_depth;
WARN_ON_ONCE(q->mq_freeze_depth < 0);
spin_unlock_irq(q->queue_lock);
if (wake)
if (wake) {
percpu_ref_reinit(&q->mq_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
}
bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
......@@ -1798,7 +1776,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
if (!q)
goto err_hctxs;
if (percpu_counter_init(&q->mq_usage_counter, 0))
if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release))
goto err_map;
setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
......@@ -1891,7 +1869,7 @@ void blk_mq_free_queue(struct request_queue *q)
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
blk_mq_free_hw_queues(q, set);
percpu_counter_destroy(&q->mq_usage_counter);
percpu_ref_exit(&q->mq_usage_counter);
free_percpu(q->queue_ctx);
kfree(q->queue_hw_ctx);
......@@ -2050,8 +2028,7 @@ static int __init blk_mq_init(void)
{
blk_mq_cpu_init();
/* Must be called after percpu_counter_hotcpu_callback() */
hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
return 0;
}
......
......@@ -28,7 +28,7 @@ struct blk_mq_ctx {
void __blk_mq_complete_request(struct request *rq);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_init_flush(struct request_queue *q);
void blk_mq_drain_queue(struct request_queue *q);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
void blk_mq_clone_flush_request(struct request *flush_rq,
struct request *orig_rq);
......
......@@ -554,8 +554,8 @@ int blk_register_queue(struct gendisk *disk)
* Initialization must be complete by now. Finish the initial
* bypass from queue allocation.
*/
blk_queue_bypass_end(q);
queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
blk_queue_bypass_end(q);
ret = blk_trace_init_sysfs(dev);
if (ret)
......
......@@ -663,6 +663,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
fmode_t mode = file->f_mode;
struct backing_dev_info *bdi;
loff_t size;
unsigned int max_sectors;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
......@@ -719,8 +720,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKSSZGET: /* get block device hardware sector size */
return compat_put_int(arg, bdev_logical_block_size(bdev));
case BLKSECTGET:
return compat_put_ushort(arg,
queue_max_sectors(bdev_get_queue(bdev)));
max_sectors = min_t(unsigned int, USHRT_MAX,
queue_max_sectors(bdev_get_queue(bdev)));
return compat_put_ushort(arg, max_sectors);
case BLKROTATIONAL:
return compat_put_ushort(arg,
!blk_queue_nonrot(bdev_get_queue(bdev)));
......
......@@ -278,6 +278,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
struct backing_dev_info *bdi;
loff_t size;
int ret, n;
unsigned int max_sectors;
switch(cmd) {
case BLKFLSBUF:
......@@ -375,7 +376,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKDISCARDZEROES:
return put_uint(arg, bdev_discard_zeroes_data(bdev));
case BLKSECTGET:
return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
max_sectors = min_t(unsigned int, USHRT_MAX,
queue_max_sectors(bdev_get_queue(bdev)));
return put_ushort(arg, max_sectors);
case BLKROTATIONAL:
return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
case BLKRASET:
......
......@@ -215,7 +215,7 @@ int aix_partition(struct parsed_partitions *state)
numlvs = be16_to_cpu(p->numlvs);
put_dev_sector(sect);
}
lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL);
lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL);
if (!lvip)
return 0;
if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) {
......@@ -253,7 +253,7 @@ int aix_partition(struct parsed_partitions *state)
continue;
}
lv_ix = be16_to_cpu(p->lv_ix) - 1;
if (lv_ix > state->limit) {
if (lv_ix >= state->limit) {
cur_lv_ix = -1;
continue;
}
......
......@@ -7,6 +7,8 @@
* Re-organised Feb 1998 Russell King
*/
#define pr_fmt(fmt) fmt
#include <linux/types.h>
#include <linux/affs_hardblocks.h>
......@@ -40,7 +42,7 @@ int amiga_partition(struct parsed_partitions *state)
data = read_part_sector(state, blk, &sect);
if (!data) {
if (warn_no_part)
printk("Dev %s: unable to read RDB block %d\n",
pr_err("Dev %s: unable to read RDB block %d\n",
bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
......@@ -57,12 +59,12 @@ int amiga_partition(struct parsed_partitions *state)
*(__be32 *)(data+0xdc) = 0;
if (checksum_block((__be32 *)data,
be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
printk("Warning: Trashed word at 0xd0 in block %d "
"ignored in checksum calculation\n",blk);
pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
blk);
break;
}
printk("Dev %s: RDB in block %d has bad checksum\n",
pr_err("Dev %s: RDB in block %d has bad checksum\n",
bdevname(state->bdev, b), blk);
}
......@@ -83,7 +85,7 @@ int amiga_partition(struct parsed_partitions *state)
data = read_part_sector(state, blk, &sect);
if (!data) {
if (warn_no_part)
printk("Dev %s: unable to read partition block %d\n",
pr_err("Dev %s: unable to read partition block %d\n",
bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
......
......@@ -121,7 +121,7 @@ __setup("gpt", force_gpt_fn);
/**
* efi_crc32() - EFI version of crc32 function
* @buf: buffer to calculate crc32 of
* @len - length of buf
* @len: length of buf
*
* Description: Returns EFI-style CRC32 value for @buf
*
......@@ -240,10 +240,10 @@ static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors)
/**
* read_lba(): Read bytes from disk, starting at given LBA
* @state
* @lba
* @buffer
* @size_t
* @state: disk parsed partitions
* @lba: the Logical Block Address of the partition table
* @buffer: destination buffer
* @count: bytes to read
*
* Description: Reads @count bytes from @state->bdev into @buffer.
* Returns number of bytes read on success, 0 on error.
......@@ -277,8 +277,8 @@ static size_t read_lba(struct parsed_partitions *state,
/**
* alloc_read_gpt_entries(): reads partition entries from disk
* @state
* @gpt - GPT header
* @state: disk parsed partitions
* @gpt: GPT header
*
* Description: Returns ptes on success, NULL on error.
* Allocates space for PTEs based on information found in @gpt.
......@@ -312,8 +312,8 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
/**
* alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
* @state
* @lba is the Logical Block Address of the partition table
* @state: disk parsed partitions
* @lba: the Logical Block Address of the partition table
*
* Description: returns GPT header on success, NULL on error. Allocates
* and fills a GPT header starting at @lba from @state->bdev.
......@@ -340,10 +340,10 @@ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
/**
* is_gpt_valid() - tests one GPT header and PTEs for validity
* @state
* @lba is the logical block address of the GPT header to test
* @gpt is a GPT header ptr, filled on return.
* @ptes is a PTEs ptr, filled on return.
* @state: disk parsed partitions
* @lba: logical block address of the GPT header to test
* @gpt: GPT header ptr, filled on return.
* @ptes: PTEs ptr, filled on return.
*
* Description: returns 1 if valid, 0 on error.
* If valid, returns pointers to newly allocated GPT header and PTEs.
......@@ -461,8 +461,8 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
/**
* is_pte_valid() - tests one PTE for validity
* @pte is the pte to check
* @lastlba is last lba of the disk
* @pte: pte to check
* @lastlba: last lba of the disk
*
* Description: returns 1 if valid, 0 on error.
*/
......@@ -478,9 +478,10 @@ is_pte_valid(const gpt_entry *pte, const u64 lastlba)
/**
* compare_gpts() - Search disk for valid GPT headers and PTEs
* @pgpt is the primary GPT header
* @agpt is the alternate GPT header
* @lastlba is the last LBA number
* @pgpt: primary GPT header
* @agpt: alternate GPT header
* @lastlba: last LBA number
*
* Description: Returns nothing. Sanity checks pgpt and agpt fields
* and prints warnings on discrepancies.
*
......@@ -572,9 +573,10 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
/**
* find_valid_gpt() - Search disk for valid GPT headers and PTEs
* @state
* @gpt is a GPT header ptr, filled on return.
* @ptes is a PTEs ptr, filled on return.
* @state: disk parsed partitions
* @gpt: GPT header ptr, filled on return.
* @ptes: PTEs ptr, filled on return.
*
* Description: Returns 1 if valid, 0 on error.
* If valid, returns pointers to newly allocated GPT header and PTEs.
* Validity depends on PMBR being valid (or being overridden by the
......@@ -663,7 +665,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
/**
* efi_partition(struct parsed_partitions *state)
* @state
* @state: disk parsed partitions
*
* Description: called from check.c, if the disk contains GPT
* partitions, sets up partition entries in the kernel.
......
......@@ -159,8 +159,9 @@ static void parse_extended(struct parsed_partitions *state,
/*
* First process the data partition(s)
*/
for (i=0; i<4; i++, p++) {
for (i = 0; i < 4; i++, p++) {
sector_t offs, size, next;
if (!nr_sects(p) || is_extended_partition(p))
continue;
......@@ -194,7 +195,7 @@ static void parse_extended(struct parsed_partitions *state,
* It should be a link to the next logical partition.
*/
p -= 4;
for (i=0; i<4; i++, p++)
for (i = 0; i < 4; i++, p++)
if (nr_sects(p) && is_extended_partition(p))
break;
if (i == 4)
......@@ -243,8 +244,8 @@ static void parse_solaris_x86(struct parsed_partitions *state,
return;
}
/* Ensure we can handle previous case of VTOC with 8 entries gracefully */
max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
for (i=0; i<max_nparts && state->next<state->limit; i++) {
max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
for (i = 0; i < max_nparts && state->next < state->limit; i++) {
struct solaris_x86_slice *s = &v->v_slice[i];
char tmp[3 + 10 + 1 + 1];
......@@ -409,7 +410,7 @@ static void parse_minix(struct parsed_partitions *state,
/* The first sector of a Minix partition can have either
* a secondary MBR describing its subpartitions, or
* the normal boot sector. */
if (msdos_magic_present (data + 510) &&
if (msdos_magic_present(data + 510) &&
SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
......@@ -527,6 +528,7 @@ int msdos_partition(struct parsed_partitions *state)
for (slot = 1 ; slot <= 4 ; slot++, p++) {
sector_t start = start_sect(p)*sector_size;
sector_t size = nr_sects(p)*sector_size;
if (!size)
continue;
if (is_extended_partition(p)) {
......@@ -537,6 +539,7 @@ int msdos_partition(struct parsed_partitions *state)
* sector, although it may not be enough/proper.
*/
sector_t n = 2;
n = min(size, max(sector_size, n));
put_partition(state, slot, start, n);
......
......@@ -82,9 +82,18 @@ static int sg_set_timeout(struct request_queue *q, int __user *p)
return err;
}
/*
 * Return the value of queue_max_sectors(@q) converted to bytes (shifted
 * left by 9, i.e. multiplied by 512).
 *
 * The sector count is clamped to INT_MAX >> 9 before the shift, so the
 * resulting byte count never exceeds INT_MAX and fits the int return type.
 */
static int max_sectors_bytes(struct request_queue *q)
{
unsigned int max_sectors = queue_max_sectors(q);
/* Clamp before shifting: an unclamped count << 9 could exceed INT_MAX. */
max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
return max_sectors << 9;
}
static int sg_get_reserved_size(struct request_queue *q, int __user *p)
{
unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9);
int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
return put_user(val, p);
}
......@@ -98,10 +107,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p)
if (size < 0)
return -EINVAL;
if (size > (queue_max_sectors(q) << 9))
size = queue_max_sectors(q) << 9;
q->sg_reserved_size = size;
q->sg_reserved_size = min(size, max_sectors_bytes(q));
return 0;
}
......
......@@ -308,6 +308,7 @@ struct bio_integrity_payload {
unsigned short bip_slab; /* slab the bip came from */
unsigned short bip_vcnt; /* # of integrity bio_vecs */
unsigned short bip_max_vcnt; /* integrity bio_vec slots */
unsigned bip_owns_buf:1; /* should free bip_buf */
struct work_struct bip_work; /* I/O completion */
......
......@@ -21,6 +21,7 @@
#include <linux/bsg.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
#include <asm/scatterlist.h>
......@@ -470,6 +471,7 @@ struct request_queue {
struct mutex sysfs_lock;
int bypass_depth;
int mq_freeze_depth;
#if defined(CONFIG_BLK_DEV_BSG)
bsg_job_fn *bsg_job_fn;
......@@ -483,7 +485,7 @@ struct request_queue {
#endif
struct rcu_head rcu_head;
wait_queue_head_t mq_freeze_wq;
struct percpu_counter mq_usage_counter;
struct percpu_ref mq_usage_counter;
struct list_head all_q_node;
struct blk_mq_tag_set *tag_set;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册