Commit 64120354 authored by Linus Torvalds

Merge branch 'for-4.5/drivers' of git://git.kernel.dk/linux-block

Pull block driver updates from Jens Axboe:
 "This is the block driver pull request for 4.5, with the exception of
  NVMe, which is in a separate branch and will be posted after this one.

  This pull request contains:

   - A set of bcache stability fixes, which have been acked by Kent.
     These have been used and tested for more than a year by the
     community, so it's about time that they got in.

   - A set of drbd updates from the drbd team (Andreas, Lars, Philipp)
     and Markus Elfring, Oleg Drokin.

   - A set of fixes for xen blkback/front from the usual suspects, (Bob,
     Konrad) as well as community based fixes from Kiri, Julien, and
     Peng.

   - A 2038 time fix for sx8 from Shraddha, with a fix from me.

   - A small mtip32xx cleanup from Zhu Yanjun.

   - A null_blk division fix from Arnd"

* 'for-4.5/drivers' of git://git.kernel.dk/linux-block: (71 commits)
  null_blk: use sector_div instead of do_div
  mtip32xx: restrict variables visible in current code module
  xen/blkfront: Fix crash if backend doesn't follow the right states.
  xen/blkback: Fix two memory leaks.
  xen/blkback: make st_ statistics per ring
  xen/blkfront: Handle non-indirect grant with 64KB pages
  xen-blkfront: Introduce blkif_ring_get_request
  xen-blkback: clear PF_NOFREEZE for xen_blkif_schedule()
  xen/blkback: Free resources if connect_ring failed.
  xen/blocks: Return -EXX instead of -1
  xen/blkback: make pool of persistent grants and free pages per-queue
  xen/blkback: get the number of hardware queues/rings from blkfront
  xen/blkback: pseudo support for multi hardware queues/rings
  xen/blkback: separate ring information out of struct xen_blkif
  xen/blkfront: correct setting for xen_blkif_max_ring_order
  xen/blkfront: make persistent grants pool per-queue
  xen/blkfront: Remove duplicate setting of ->xbdev.
  xen/blkfront: Cleanup of comments, fix unaligned variables, and syntax errors.
  xen/blkfront: negotiate number of queues/rings to be used with backend
  xen/blkfront: split per device io_lock
  ...
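As an aside on the null_blk entry above: do_div() assumes a 64-bit dividend, while sector_t may be only 32 bits wide on 32-bit kernels without large block device support, which is why sector_div() is the right helper for sector_t arithmetic. A minimal sketch with hypothetical variable names (not code taken from null_blk):

    sector_t sector = start_sector;   /* 32 or 64 bits, depending on config */
    u32 chunk_sectors = 8;
    unsigned int rem;

    rem = sector_div(sector, chunk_sectors); /* divides in place, returns the remainder */
    /* open-coding do_div(sector, chunk_sectors) would be wrong (or warn)
     * whenever sector_t is a 32-bit type */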
@@ -3665,13 +3665,12 @@ F: drivers/scsi/dpt*
F: drivers/scsi/dpt/
DRBD DRIVER
-P: Philipp Reisner
-P: Lars Ellenberg
-M: drbd-dev@lists.linbit.com
-L: drbd-user@lists.linbit.com
+M: Philipp Reisner <philipp.reisner@linbit.com>
+M: Lars Ellenberg <lars.ellenberg@linbit.com>
+L: drbd-dev@lists.linbit.com
W: http://www.drbd.org
-T: git git://git.drbd.org/linux-2.6-drbd.git drbd
-T: git git://git.drbd.org/drbd-8.3.git
+T: git git://git.linbit.com/linux-drbd.git
+T: git git://git.linbit.com/drbd-8.4.git
S: Supported
F: drivers/block/drbd/
F: lib/lru_cache.c
......
@@ -288,7 +288,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
return need_transaction;
}
-static int al_write_transaction(struct drbd_device *device);
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
* are still coupled, or assume too much about their relation.
* Code below will not work if this is violated.
* Will be cleaned up with some followup patch.
*/
# error FIXME
#endif
static unsigned int al_extent_to_bm_page(unsigned int al_enr)
{
return al_enr >>
/* bit to page */
((PAGE_SHIFT + 3) -
/* al extent number to bit */
(AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}
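/* Worked example (illustration only, not part of the patch): with the usual
 * constants, 4 KiB pages (PAGE_SHIFT = 12), 4 KiB bitmap granularity
 * (BM_BLOCK_SHIFT = 12) and 4 MiB AL extents (AL_EXTENT_SHIFT = 22), one AL
 * extent covers 2^(22-12) = 1024 bitmap bits and one page holds
 * 2^(12+3) = 32768 bits, so the function shifts by 15 - 10 = 5:
 * 32 consecutive AL extents map to the same bitmap page. */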
static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
{
const unsigned int stripes = device->ldev->md.al_stripes;
const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
/* transaction number, modulo on-disk ring buffer wrap around */
unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
/* ... to aligned 4k on disk block */
t = ((t % stripes) * stripe_size_4kB) + t/stripes;
/* ... to 512 byte sector in activity log */
t *= 8;
/* ... plus offset to the on disk position */
return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
}
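/* Worked example (illustration only, not part of the patch): with
 * al_stripes = 4 and al_stripe_size_4k = 8, the AL ring buffer has
 * al_size_4k = 32 slots.  Transaction number 13 maps to slot 13;
 * 13 % 4 = 1 selects the stripe and 13 / 4 = 3 the position inside it,
 * giving 4k block 1 * 8 + 3 = 11, i.e. 512-byte sector 11 * 8 = 88
 * past md_offset + al_offset. */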
static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer)
{
struct lc_element *e;
sector_t sector;
int i, mx;
unsigned extent_nr;
unsigned crc = 0;
int err = 0;
memset(buffer, 0, sizeof(*buffer));
buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
buffer->tr_number = cpu_to_be32(device->al_tr_number);
i = 0;
/* Even though no one can start to change this list
* once we set the LC_LOCKED -- from drbd_al_begin_io(),
* lc_try_lock_for_transaction() --, someone may still
* be in the process of changing it. */
spin_lock_irq(&device->al_lock);
list_for_each_entry(e, &device->act_log->to_be_changed, list) {
if (i == AL_UPDATES_PER_TRANSACTION) {
i++;
break;
}
buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
if (e->lc_number != LC_FREE)
drbd_bm_mark_for_writeout(device,
al_extent_to_bm_page(e->lc_number));
i++;
}
spin_unlock_irq(&device->al_lock);
BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
buffer->n_updates = cpu_to_be16(i);
for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
buffer->update_slot_nr[i] = cpu_to_be16(-1);
buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
}
buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
device->act_log->nr_elements - device->al_tr_cycle);
for (i = 0; i < mx; i++) {
unsigned idx = device->al_tr_cycle + i;
extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
buffer->context[i] = cpu_to_be32(extent_nr);
}
for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
buffer->context[i] = cpu_to_be32(LC_FREE);
device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
if (device->al_tr_cycle >= device->act_log->nr_elements)
device->al_tr_cycle = 0;
sector = al_tr_number_to_on_disk_sector(device);
crc = crc32c(0, buffer, 4096);
buffer->crc32c = cpu_to_be32(crc);
if (drbd_bm_write_hinted(device))
err = -EIO;
else {
bool write_al_updates;
rcu_read_lock();
write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
rcu_read_unlock();
if (write_al_updates) {
if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
err = -EIO;
drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
} else {
device->al_tr_number++;
device->al_writ_cnt++;
}
}
}
return err;
}
static int al_write_transaction(struct drbd_device *device)
{
struct al_transaction_on_disk *buffer;
int err;
if (!get_ldev(device)) {
drbd_err(device, "disk is %s, cannot start al transaction\n",
drbd_disk_str(device->state.disk));
return -EIO;
}
/* The bitmap write may have failed, causing a state change. */
if (device->state.disk < D_INCONSISTENT) {
drbd_err(device,
"disk is %s, cannot write al transaction\n",
drbd_disk_str(device->state.disk));
put_ldev(device);
return -EIO;
}
/* protects md_io_buffer, al_tr_cycle, ... */
buffer = drbd_md_get_buffer(device, __func__);
if (!buffer) {
drbd_err(device, "disk failed while waiting for md_io buffer\n");
put_ldev(device);
return -ENODEV;
}
err = __al_write_transaction(device, buffer);
drbd_md_put_buffer(device);
put_ldev(device);
return err;
}
void drbd_al_begin_io_commit(struct drbd_device *device)
{
@@ -420,153 +575,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
wake_up(&device->al_wait);
}
#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
* are still coupled, or assume too much about their relation.
* Code below will not work if this is violated.
* Will be cleaned up with some followup patch.
*/
# error FIXME
#endif
static unsigned int al_extent_to_bm_page(unsigned int al_enr)
{
return al_enr >>
/* bit to page */
((PAGE_SHIFT + 3) -
/* al extent number to bit */
(AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}
static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
{
const unsigned int stripes = device->ldev->md.al_stripes;
const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
/* transaction number, modulo on-disk ring buffer wrap around */
unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
/* ... to aligned 4k on disk block */
t = ((t % stripes) * stripe_size_4kB) + t/stripes;
/* ... to 512 byte sector in activity log */
t *= 8;
/* ... plus offset to the on disk position */
return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
}
int al_write_transaction(struct drbd_device *device)
{
struct al_transaction_on_disk *buffer;
struct lc_element *e;
sector_t sector;
int i, mx;
unsigned extent_nr;
unsigned crc = 0;
int err = 0;
if (!get_ldev(device)) {
drbd_err(device, "disk is %s, cannot start al transaction\n",
drbd_disk_str(device->state.disk));
return -EIO;
}
/* The bitmap write may have failed, causing a state change. */
if (device->state.disk < D_INCONSISTENT) {
drbd_err(device,
"disk is %s, cannot write al transaction\n",
drbd_disk_str(device->state.disk));
put_ldev(device);
return -EIO;
}
/* protects md_io_buffer, al_tr_cycle, ... */
buffer = drbd_md_get_buffer(device, __func__);
if (!buffer) {
drbd_err(device, "disk failed while waiting for md_io buffer\n");
put_ldev(device);
return -ENODEV;
}
memset(buffer, 0, sizeof(*buffer));
buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
buffer->tr_number = cpu_to_be32(device->al_tr_number);
i = 0;
/* Even though no one can start to change this list
* once we set the LC_LOCKED -- from drbd_al_begin_io(),
* lc_try_lock_for_transaction() --, someone may still
* be in the process of changing it. */
spin_lock_irq(&device->al_lock);
list_for_each_entry(e, &device->act_log->to_be_changed, list) {
if (i == AL_UPDATES_PER_TRANSACTION) {
i++;
break;
}
buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
if (e->lc_number != LC_FREE)
drbd_bm_mark_for_writeout(device,
al_extent_to_bm_page(e->lc_number));
i++;
}
spin_unlock_irq(&device->al_lock);
BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
buffer->n_updates = cpu_to_be16(i);
for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
buffer->update_slot_nr[i] = cpu_to_be16(-1);
buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
}
buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
device->act_log->nr_elements - device->al_tr_cycle);
for (i = 0; i < mx; i++) {
unsigned idx = device->al_tr_cycle + i;
extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
buffer->context[i] = cpu_to_be32(extent_nr);
}
for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
buffer->context[i] = cpu_to_be32(LC_FREE);
device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
if (device->al_tr_cycle >= device->act_log->nr_elements)
device->al_tr_cycle = 0;
sector = al_tr_number_to_on_disk_sector(device);
crc = crc32c(0, buffer, 4096);
buffer->crc32c = cpu_to_be32(crc);
if (drbd_bm_write_hinted(device))
err = -EIO;
else {
bool write_al_updates;
rcu_read_lock();
write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
rcu_read_unlock();
if (write_al_updates) {
if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
err = -EIO;
drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
} else {
device->al_tr_number++;
device->al_writ_cnt++;
}
}
}
drbd_md_put_buffer(device);
put_ldev(device);
return err;
}
static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
{
int rv;
@@ -606,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device)
wake_up(&device->al_wait);
}
-int drbd_initialize_al(struct drbd_device *device, void *buffer)
+int drbd_al_initialize(struct drbd_device *device, void *buffer)
{
struct al_transaction_on_disk *al = buffer;
struct drbd_md *md = &device->ldev->md;
-sector_t al_base = md->md_offset + md->al_offset;
int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
int i;
-memset(al, 0, 4096);
-al->magic = cpu_to_be32(DRBD_AL_MAGIC);
-al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
-al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
-for (i = 0; i < al_size_4k; i++) {
-int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE);
+__al_write_transaction(device, al);
+/* There may or may not have been a pending transaction. */
+spin_lock_irq(&device->al_lock);
+lc_committed(device->act_log);
+spin_unlock_irq(&device->al_lock);
+/* The rest of the transactions will have an empty "updates" list, and
+ * are written out only to provide the context, and to initialize the
+ * on-disk ring buffer. */
+for (i = 1; i < al_size_4k; i++) {
+int err = __al_write_transaction(device, al);
if (err)
return err;
}
......
@@ -24,7 +24,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/drbd.h>
@@ -479,8 +479,14 @@ void drbd_bm_cleanup(struct drbd_device *device)
* this masks out the remaining bits.
* Returns the number of bits cleared.
*/
#ifndef BITS_PER_PAGE
#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
#else
# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
# error "ambiguous BITS_PER_PAGE"
# endif
#endif
#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
static int bm_clear_surplus(struct drbd_bitmap *b)
{
@@ -559,21 +565,19 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
unsigned long *p_addr;
unsigned long bits = 0;
unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
-int idx, i, last_word;
+int idx, last_word;
/* all but last page */
for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
p_addr = __bm_map_pidx(b, idx);
-for (i = 0; i < LWPP; i++)
-bits += hweight_long(p_addr[i]);
+bits += bitmap_weight(p_addr, BITS_PER_PAGE);
__bm_unmap(p_addr);
cond_resched();
}
/* last (or only) page */
last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
p_addr = __bm_map_pidx(b, idx);
-for (i = 0; i < last_word; i++)
-bits += hweight_long(p_addr[i]);
+bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
p_addr[last_word] &= cpu_to_lel(mask);
bits += hweight_long(p_addr[last_word]);
/* 32bit arch, may have an unused padding long */
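/* Equivalence note (illustration only, not part of the patch): for a region
 * that is a whole number of longs,
 *
 *	for (i = 0; i < nwords; i++)
 *		bits += hweight_long(p_addr[i]);
 *
 * counts the same bits as
 *
 *	bits += bitmap_weight(p_addr, nwords * BITS_PER_LONG);
 *
 * which is why the hand-rolled loops above could be replaced and why
 * <linux/bitmap.h> is now included in place of <linux/bitops.h>. */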
@@ -1419,6 +1423,9 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
int bits;
int changed = 0;
unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
/* I think it is more cache line friendly to hweight_long then set to ~0UL,
* than to first bitmap_weight() all words, then bitmap_fill() all words */
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
@@ -1628,8 +1635,7 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
int n = e-s;
p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
bm = p_addr + MLPP(s);
-while (n--)
-count += hweight_long(*bm++);
+count += bitmap_weight(bm, n * BITS_PER_LONG);
bm_unmap(p_addr);
} else {
drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
......
@@ -771,6 +771,13 @@ static int device_data_gen_id_show(struct seq_file *m, void *ignored)
return 0;
}
static int device_ed_gen_id_show(struct seq_file *m, void *ignored)
{
struct drbd_device *device = m->private;
seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid);
return 0;
}
#define drbd_debugfs_device_attr(name) \
static int device_ ## name ## _open(struct inode *inode, struct file *file) \
{ \
@@ -796,6 +803,7 @@ drbd_debugfs_device_attr(oldest_requests)
drbd_debugfs_device_attr(act_log_extents)
drbd_debugfs_device_attr(resync_extents)
drbd_debugfs_device_attr(data_gen_id)
drbd_debugfs_device_attr(ed_gen_id)
void drbd_debugfs_device_add(struct drbd_device *device)
{
@@ -839,6 +847,7 @@ void drbd_debugfs_device_add(struct drbd_device *device)
DCF(act_log_extents);
DCF(resync_extents);
DCF(data_gen_id);
DCF(ed_gen_id);
#undef DCF
return;
@@ -854,6 +863,7 @@ void drbd_debugfs_device_cleanup(struct drbd_device *device)
drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id);
drbd_debugfs_remove(&device->debugfs_vol);
}
......
...@@ -77,13 +77,6 @@ extern int fault_devs; ...@@ -77,13 +77,6 @@ extern int fault_devs;
extern char usermode_helper[]; extern char usermode_helper[];
/* I don't remember why XCPU ...
* This is used to wake the asender,
* and to interrupt sending the sending task
* on disconnect.
*/
#define DRBD_SIG SIGXCPU
/* This is used to stop/restart our threads. /* This is used to stop/restart our threads.
* Cannot use SIGTERM nor SIGKILL, since these * Cannot use SIGTERM nor SIGKILL, since these
* are sent out by init on runlevel changes * are sent out by init on runlevel changes
...@@ -292,6 +285,9 @@ struct drbd_device_work { ...@@ -292,6 +285,9 @@ struct drbd_device_work {
extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
extern void lock_all_resources(void);
extern void unlock_all_resources(void);
struct drbd_request { struct drbd_request {
struct drbd_work w; struct drbd_work w;
struct drbd_device *device; struct drbd_device *device;
...@@ -504,7 +500,6 @@ enum { ...@@ -504,7 +500,6 @@ enum {
MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
SUSPEND_IO, /* suspend application io */
BITMAP_IO, /* suspend application io; BITMAP_IO, /* suspend application io;
once no more io in flight, start bitmap io */ once no more io in flight, start bitmap io */
BITMAP_IO_QUEUED, /* Started bitmap IO */ BITMAP_IO_QUEUED, /* Started bitmap IO */
...@@ -632,12 +627,6 @@ struct bm_io_work { ...@@ -632,12 +627,6 @@ struct bm_io_work {
void (*done)(struct drbd_device *device, int rv); void (*done)(struct drbd_device *device, int rv);
}; };
enum write_ordering_e {
WO_none,
WO_drain_io,
WO_bdev_flush,
};
struct fifo_buffer { struct fifo_buffer {
unsigned int head_index; unsigned int head_index;
unsigned int size; unsigned int size;
...@@ -650,8 +639,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size); ...@@ -650,8 +639,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size);
enum { enum {
NET_CONGESTED, /* The data socket is congested */ NET_CONGESTED, /* The data socket is congested */
RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
-SEND_PING, /* whether asender should send a ping asap */
-SIGNAL_ASENDER, /* whether asender wants to be interrupted */
+SEND_PING,
GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
CONN_WD_ST_CHG_OKAY, CONN_WD_ST_CHG_OKAY,
...@@ -670,6 +658,8 @@ enum { ...@@ -670,6 +658,8 @@ enum {
DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ DEVICE_WORK_PENDING, /* tell worker that some device has pending work */
}; };
enum which_state { NOW, OLD = NOW, NEW };
struct drbd_resource { struct drbd_resource {
char *name; char *name;
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS
...@@ -755,7 +745,8 @@ struct drbd_connection { ...@@ -755,7 +745,8 @@ struct drbd_connection {
unsigned long last_reconnect_jif; unsigned long last_reconnect_jif;
struct drbd_thread receiver; struct drbd_thread receiver;
struct drbd_thread worker; struct drbd_thread worker;
-struct drbd_thread asender;
+struct drbd_thread ack_receiver;
struct workqueue_struct *ack_sender;
/* cached pointers, /* cached pointers,
* so we can look up the oldest pending requests more quickly. * so we can look up the oldest pending requests more quickly.
...@@ -774,6 +765,8 @@ struct drbd_connection { ...@@ -774,6 +765,8 @@ struct drbd_connection {
struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
struct { struct {
unsigned long last_sent_barrier_jif;
/* whether this sender thread /* whether this sender thread
* has processed a single write yet. */ * has processed a single write yet. */
bool seen_any_write_yet; bool seen_any_write_yet;
...@@ -788,6 +781,17 @@ struct drbd_connection { ...@@ -788,6 +781,17 @@ struct drbd_connection {
} send; } send;
}; };
static inline bool has_net_conf(struct drbd_connection *connection)
{
bool has_net_conf;
rcu_read_lock();
has_net_conf = rcu_dereference(connection->net_conf);
rcu_read_unlock();
return has_net_conf;
}
void __update_timing_details( void __update_timing_details(
struct drbd_thread_timing_details *tdp, struct drbd_thread_timing_details *tdp,
unsigned int *cb_nr, unsigned int *cb_nr,
...@@ -811,6 +815,7 @@ struct drbd_peer_device { ...@@ -811,6 +815,7 @@ struct drbd_peer_device {
struct list_head peer_devices; struct list_head peer_devices;
struct drbd_device *device; struct drbd_device *device;
struct drbd_connection *connection; struct drbd_connection *connection;
struct work_struct send_acks_work;
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_peer_dev; struct dentry *debugfs_peer_dev;
#endif #endif
...@@ -829,6 +834,7 @@ struct drbd_device { ...@@ -829,6 +834,7 @@ struct drbd_device {
struct dentry *debugfs_vol_act_log_extents; struct dentry *debugfs_vol_act_log_extents;
struct dentry *debugfs_vol_resync_extents; struct dentry *debugfs_vol_resync_extents;
struct dentry *debugfs_vol_data_gen_id; struct dentry *debugfs_vol_data_gen_id;
struct dentry *debugfs_vol_ed_gen_id;
#endif #endif
unsigned int vnr; /* volume number within the connection */ unsigned int vnr; /* volume number within the connection */
...@@ -873,6 +879,7 @@ struct drbd_device { ...@@ -873,6 +879,7 @@ struct drbd_device {
atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
atomic_t unacked_cnt; /* Need to send replies for */ atomic_t unacked_cnt; /* Need to send replies for */
atomic_t local_cnt; /* Waiting for local completion */ atomic_t local_cnt; /* Waiting for local completion */
atomic_t suspend_cnt;
/* Interval tree of pending local requests */ /* Interval tree of pending local requests */
struct rb_root read_requests; struct rb_root read_requests;
...@@ -1020,6 +1027,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev ...@@ -1020,6 +1027,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev
return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
} }
static inline struct drbd_peer_device *
conn_peer_device(struct drbd_connection *connection, int volume_number)
{
return idr_find(&connection->peer_devices, volume_number);
}
#define for_each_resource(resource, _resources) \ #define for_each_resource(resource, _resources) \
list_for_each_entry(resource, _resources, resources) list_for_each_entry(resource, _resources, resources)
...@@ -1113,7 +1126,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int ...@@ -1113,7 +1126,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
extern int drbd_send_bitmap(struct drbd_device *device); extern int drbd_send_bitmap(struct drbd_device *device);
extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
-extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
+extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
extern void drbd_device_cleanup(struct drbd_device *device); extern void drbd_device_cleanup(struct drbd_device *device);
void drbd_print_uuids(struct drbd_device *device, const char *text); void drbd_print_uuids(struct drbd_device *device, const char *text);
...@@ -1424,7 +1437,7 @@ extern struct bio_set *drbd_md_io_bio_set; ...@@ -1424,7 +1437,7 @@ extern struct bio_set *drbd_md_io_bio_set;
/* to allocate from that set */ /* to allocate from that set */
extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
-extern rwlock_t global_state_lock;
+extern struct mutex resources_mutex;
extern int conn_lowest_minor(struct drbd_connection *connection); extern int conn_lowest_minor(struct drbd_connection *connection);
extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
...@@ -1454,6 +1467,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); ...@@ -1454,6 +1467,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
/* drbd_nl.c */ /* drbd_nl.c */
extern struct mutex notification_mutex;
extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_suspend_io(struct drbd_device *device);
extern void drbd_resume_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device);
extern char *ppsize(char *buf, unsigned long long size); extern char *ppsize(char *buf, unsigned long long size);
...@@ -1536,7 +1552,9 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); ...@@ -1536,7 +1552,9 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
/* drbd_receiver.c */ /* drbd_receiver.c */
extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_receiver(struct drbd_thread *thi);
-extern int drbd_asender(struct drbd_thread *thi);
+extern int drbd_ack_receiver(struct drbd_thread *thi);
extern void drbd_send_ping_wf(struct work_struct *ws);
extern void drbd_send_acks_wf(struct work_struct *ws);
extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
bool throttle_if_app_is_waiting); bool throttle_if_app_is_waiting);
...@@ -1649,7 +1667,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s ...@@ -1649,7 +1667,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s
#define drbd_rs_failed_io(device, sector, size) \ #define drbd_rs_failed_io(device, sector, size) \
__drbd_change_sync(device, sector, size, RECORD_RS_FAILED) __drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
extern void drbd_al_shrink(struct drbd_device *device); extern void drbd_al_shrink(struct drbd_device *device);
-extern int drbd_initialize_al(struct drbd_device *, void *);
+extern int drbd_al_initialize(struct drbd_device *, void *);
/* drbd_nl.c */ /* drbd_nl.c */
/* state info broadcast */ /* state info broadcast */
...@@ -1668,6 +1686,29 @@ struct sib_info { ...@@ -1668,6 +1686,29 @@ struct sib_info {
}; };
void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
extern void notify_resource_state(struct sk_buff *,
unsigned int,
struct drbd_resource *,
struct resource_info *,
enum drbd_notification_type);
extern void notify_device_state(struct sk_buff *,
unsigned int,
struct drbd_device *,
struct device_info *,
enum drbd_notification_type);
extern void notify_connection_state(struct sk_buff *,
unsigned int,
struct drbd_connection *,
struct connection_info *,
enum drbd_notification_type);
extern void notify_peer_device_state(struct sk_buff *,
unsigned int,
struct drbd_peer_device *,
struct peer_device_info *,
enum drbd_notification_type);
extern void notify_helper(enum drbd_notification_type, struct drbd_device *,
struct drbd_connection *, const char *, int);
/* /*
* inline helper functions * inline helper functions
*************************/ *************************/
...@@ -1694,19 +1735,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r ...@@ -1694,19 +1735,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r
return 0; return 0;
} }
static inline enum drbd_state_rv
_drbd_set_state(struct drbd_device *device, union drbd_state ns,
enum chg_state_flags flags, struct completion *done)
{
enum drbd_state_rv rv;
read_lock(&global_state_lock);
rv = __drbd_set_state(device, ns, flags, done);
read_unlock(&global_state_lock);
return rv;
}
static inline union drbd_state drbd_read_state(struct drbd_device *device) static inline union drbd_state drbd_read_state(struct drbd_device *device)
{ {
struct drbd_resource *resource = device->resource; struct drbd_resource *resource = device->resource;
...@@ -1937,16 +1965,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit) ...@@ -1937,16 +1965,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit)
extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
-static inline void wake_asender(struct drbd_connection *connection)
-{
-if (test_bit(SIGNAL_ASENDER, &connection->flags))
-force_sig(DRBD_SIG, connection->asender.task);
-}
+/* To get the ack_receiver out of the blocking network stack,
+ * so it can change its sk_rcvtimeo from idle- to ping-timeout,
+ * and send a ping, we need to send a signal.
+ * Which signal we send is irrelevant. */
+static inline void wake_ack_receiver(struct drbd_connection *connection)
+{
+struct task_struct *task = connection->ack_receiver.task;
+if (task && get_t_state(&connection->ack_receiver) == RUNNING)
+force_sig(SIGXCPU, task);
+}
static inline void request_ping(struct drbd_connection *connection)
{
set_bit(SEND_PING, &connection->flags);
-wake_asender(connection);
+wake_ack_receiver(connection);
}
extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
@@ -2230,7 +2263,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device)
if (drbd_suspended(device))
return false;
-if (test_bit(SUSPEND_IO, &device->flags))
+if (atomic_read(&device->suspend_cnt))
return false;
/* to avoid potential deadlock or bitmap corruption,
......
...@@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 ...@@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0
*/ */
struct idr drbd_devices; struct idr drbd_devices;
struct list_head drbd_resources; struct list_head drbd_resources;
struct mutex resources_mutex;
struct kmem_cache *drbd_request_cache; struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache; /* peer requests */ struct kmem_cache *drbd_ee_cache; /* peer requests */
...@@ -1435,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str ...@@ -1435,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str
/* long elapsed = (long)(jiffies - device->last_received); */
drop_it = connection->meta.socket == sock
-|| !connection->asender.task
-|| get_t_state(&connection->asender) != RUNNING
+|| !connection->ack_receiver.task
+|| get_t_state(&connection->ack_receiver) != RUNNING
|| connection->cstate < C_WF_REPORT_PARAMS;
if (drop_it)
...@@ -1793,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock, ...@@ -1793,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock,
drbd_update_congested(connection); drbd_update_congested(connection);
} }
do { do {
/* STRANGE
* tcp_sendmsg does _not_ use its size parameter at all ?
*
* -EAGAIN on timeout, -EINTR on signal.
*/
/* THINK
* do we need to block DRBD_SIG if sock == &meta.socket ??
* otherwise wake_asender() might interrupt some send_*Ack !
*/
rv = kernel_sendmsg(sock, &msg, &iov, 1, size); rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
if (rv == -EAGAIN) { if (rv == -EAGAIN) {
if (we_should_drop_the_connection(connection, sock)) if (we_should_drop_the_connection(connection, sock))
...@@ -2000,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device) ...@@ -2000,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device)
drbd_bm_cleanup(device); drbd_bm_cleanup(device);
} }
-drbd_free_ldev(device->ldev);
+drbd_backing_dev_free(device, device->ldev);
device->ldev = NULL; device->ldev = NULL;
clear_bit(AL_SUSPENDED, &device->flags); clear_bit(AL_SUSPENDED, &device->flags);
...@@ -2179,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref) ...@@ -2179,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref)
if (device->this_bdev) if (device->this_bdev)
bdput(device->this_bdev); bdput(device->this_bdev);
-drbd_free_ldev(device->ldev);
+drbd_backing_dev_free(device, device->ldev);
device->ldev = NULL; device->ldev = NULL;
drbd_release_all_peer_reqs(device); drbd_release_all_peer_reqs(device);
...@@ -2563,7 +2555,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op ...@@ -2563,7 +2555,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
cpumask_copy(resource->cpu_mask, new_cpu_mask); cpumask_copy(resource->cpu_mask, new_cpu_mask);
for_each_connection_rcu(connection, resource) { for_each_connection_rcu(connection, resource) {
connection->receiver.reset_cpu_mask = 1; connection->receiver.reset_cpu_mask = 1;
-connection->asender.reset_cpu_mask = 1;
+connection->ack_receiver.reset_cpu_mask = 1;
connection->worker.reset_cpu_mask = 1; connection->worker.reset_cpu_mask = 1;
} }
} }
...@@ -2590,7 +2582,7 @@ struct drbd_resource *drbd_create_resource(const char *name) ...@@ -2590,7 +2582,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
kref_init(&resource->kref); kref_init(&resource->kref);
idr_init(&resource->devices); idr_init(&resource->devices);
INIT_LIST_HEAD(&resource->connections); INIT_LIST_HEAD(&resource->connections);
-resource->write_ordering = WO_bdev_flush;
+resource->write_ordering = WO_BDEV_FLUSH;
list_add_tail_rcu(&resource->resources, &drbd_resources); list_add_tail_rcu(&resource->resources, &drbd_resources);
mutex_init(&resource->conf_update); mutex_init(&resource->conf_update);
mutex_init(&resource->adm_mutex); mutex_init(&resource->adm_mutex);
...@@ -2652,8 +2644,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) ...@@ -2652,8 +2644,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
connection->receiver.connection = connection;
drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
connection->worker.connection = connection;
-drbd_thread_init(resource, &connection->asender, drbd_asender, "asender");
-connection->asender.connection = connection;
+drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv");
+connection->ack_receiver.connection = connection;
kref_init(&connection->kref);
...@@ -2702,8 +2694,8 @@ static int init_submitter(struct drbd_device *device) ...@@ -2702,8 +2694,8 @@ static int init_submitter(struct drbd_device *device)
{
/* opencoded create_singlethread_workqueue(),
 * to be able to say "drbd%d", ..., minor */
-device->submit.wq = alloc_workqueue("drbd%u_submit",
-WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor);
+device->submit.wq =
+alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor);
if (!device->submit.wq)
return -ENOMEM;
...@@ -2820,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig ...@@ -2820,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
goto out_idr_remove_from_resource; goto out_idr_remove_from_resource;
} }
kref_get(&connection->kref); kref_get(&connection->kref);
INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);
} }
if (init_submitter(device)) { if (init_submitter(device)) {
...@@ -2923,7 +2916,7 @@ static int __init drbd_init(void) ...@@ -2923,7 +2916,7 @@ static int __init drbd_init(void)
drbd_proc = NULL; /* play safe for drbd_cleanup */ drbd_proc = NULL; /* play safe for drbd_cleanup */
idr_init(&drbd_devices); idr_init(&drbd_devices);
-rwlock_init(&global_state_lock);
+mutex_init(&resources_mutex);
INIT_LIST_HEAD(&drbd_resources); INIT_LIST_HEAD(&drbd_resources);
err = drbd_genl_register(); err = drbd_genl_register();
...@@ -2971,18 +2964,6 @@ static int __init drbd_init(void) ...@@ -2971,18 +2964,6 @@ static int __init drbd_init(void)
return err; return err;
} }
void drbd_free_ldev(struct drbd_backing_dev *ldev)
{
if (ldev == NULL)
return;
blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
kfree(ldev->disk_conf);
kfree(ldev);
}
static void drbd_free_one_sock(struct drbd_socket *ds) static void drbd_free_one_sock(struct drbd_socket *ds)
{ {
struct socket *s; struct socket *s;
...@@ -3277,6 +3258,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) ...@@ -3277,6 +3258,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
* and read it. */ * and read it. */
bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
bdev->md.md_offset = drbd_md_ss(bdev); bdev->md.md_offset = drbd_md_ss(bdev);
/* Even for (flexible or indexed) external meta data,
* initially restrict us to the 4k superblock for now.
* Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
bdev->md.md_size_sect = 8;
if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
/* NOTE: can't do normal error processing here as this is /* NOTE: can't do normal error processing here as this is
...@@ -3578,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device, ...@@ -3578,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
spin_lock_irq(&device->resource->req_lock); spin_lock_irq(&device->resource->req_lock);
set_bit(BITMAP_IO, &device->flags); set_bit(BITMAP_IO, &device->flags);
-if (atomic_read(&device->ap_bio_cnt) == 0) {
+/* don't wait for pending application IO if the caller indicates that
+ * application IO does not conflict anyways. */
+if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) {
if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
drbd_queue_work(&first_peer_device(device)->connection->sender_work, drbd_queue_work(&first_peer_device(device)->connection->sender_work,
&device->bm_io_work.w); &device->bm_io_work.w);
...@@ -3746,6 +3733,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i) ...@@ -3746,6 +3733,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
return 0; return 0;
} }
void lock_all_resources(void)
{
struct drbd_resource *resource;
int __maybe_unused i = 0;
mutex_lock(&resources_mutex);
local_irq_disable();
for_each_resource(resource, &drbd_resources)
spin_lock_nested(&resource->req_lock, i++);
}
void unlock_all_resources(void)
{
struct drbd_resource *resource;
for_each_resource(resource, &drbd_resources)
spin_unlock(&resource->req_lock);
local_irq_enable();
mutex_unlock(&resources_mutex);
}
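/* Usage sketch (illustration only, not part of the patch): callers wrap work
 * that must see a consistent view across all resources, e.g.
 *
 *	lock_all_resources();
 *	... inspect or update state that spans several resources ...
 *	unlock_all_resources();
 *
 * The incrementing subclass handed to spin_lock_nested() only tells lockdep
 * that taking several req_locks of the same lock class here is intentional. */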
#ifdef CONFIG_DRBD_FAULT_INJECTION #ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly /* Fault insertion support including random number generator shamelessly
* stolen from kernel/rcutorture.c */ * stolen from kernel/rcutorture.c */
......
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include "drbd_int.h" #include "drbd_int.h"
#include "drbd_protocol.h" #include "drbd_protocol.h"
#include "drbd_req.h" #include "drbd_req.h"
#include "drbd_state_change.h"
#include <asm/unaligned.h> #include <asm/unaligned.h>
#include <linux/drbd_limits.h> #include <linux/drbd_limits.h>
#include <linux/kthread.h> #include <linux/kthread.h>
...@@ -75,11 +76,24 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); ...@@ -75,11 +76,24 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
/* .dumpit */ /* .dumpit */
int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb);
int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb);
int drbd_adm_dump_devices_done(struct netlink_callback *cb);
int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb);
int drbd_adm_dump_connections_done(struct netlink_callback *cb);
int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb);
int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb);
int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb);
#include <linux/drbd_genl_api.h> #include <linux/drbd_genl_api.h>
#include "drbd_nla.h" #include "drbd_nla.h"
#include <linux/genl_magic_func.h> #include <linux/genl_magic_func.h>
static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */
DEFINE_MUTEX(notification_mutex);
/* used blkdev_get_by_path, to claim our meta data device(s) */ /* used blkdev_get_by_path, to claim our meta data device(s) */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
...@@ -349,6 +363,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) ...@@ -349,6 +363,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
sib.sib_reason = SIB_HELPER_PRE; sib.sib_reason = SIB_HELPER_PRE;
sib.helper_name = cmd; sib.helper_name = cmd;
drbd_bcast_event(device, &sib); drbd_bcast_event(device, &sib);
notify_helper(NOTIFY_CALL, device, connection, cmd, 0);
ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
if (ret) if (ret)
drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
...@@ -361,6 +376,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) ...@@ -361,6 +376,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
sib.sib_reason = SIB_HELPER_POST; sib.sib_reason = SIB_HELPER_POST;
sib.helper_exit_code = ret; sib.helper_exit_code = ret;
drbd_bcast_event(device, &sib); drbd_bcast_event(device, &sib);
notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret);
if (current == connection->worker.task) if (current == connection->worker.task)
clear_bit(CALLBACK_PENDING, &connection->flags); clear_bit(CALLBACK_PENDING, &connection->flags);
...@@ -388,6 +404,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) ...@@ -388,6 +404,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
/* TODO: conn_bcast_event() ?? */ /* TODO: conn_bcast_event() ?? */
notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0);
ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
if (ret) if (ret)
...@@ -399,6 +416,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) ...@@ -399,6 +416,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
usermode_helper, cmd, resource_name, usermode_helper, cmd, resource_name,
(ret >> 8) & 0xff, ret); (ret >> 8) & 0xff, ret);
/* TODO: conn_bcast_event() ?? */ /* TODO: conn_bcast_event() ?? */
notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret);
if (ret < 0) /* Ignore any ERRNOs we got. */ if (ret < 0) /* Ignore any ERRNOs we got. */
ret = 0; ret = 0;
...@@ -847,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size) ...@@ -847,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size)
* and can be long lived. * and can be long lived.
* This changes an device->flag, is triggered by drbd internals, * This changes an device->flag, is triggered by drbd internals,
* and should be short-lived. */ * and should be short-lived. */
/* It needs to be a counter, since multiple threads might
independently suspend and resume IO. */
void drbd_suspend_io(struct drbd_device *device)
{
-set_bit(SUSPEND_IO, &device->flags);
+atomic_inc(&device->suspend_cnt);
if (drbd_suspended(device))
return;
wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
@@ -857,8 +877,8 @@ void drbd_suspend_io(struct drbd_device *device)
void drbd_resume_io(struct drbd_device *device)
{
-clear_bit(SUSPEND_IO, &device->flags);
+if (atomic_dec_and_test(&device->suspend_cnt))
wake_up(&device->misc_wait);
}
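/* Why a counter instead of the old SUSPEND_IO flag (illustration only):
 *
 *	thread A: drbd_suspend_io(device);   suspend_cnt 0 -> 1
 *	thread B: drbd_suspend_io(device);   suspend_cnt 1 -> 2
 *	thread A: drbd_resume_io(device);    suspend_cnt 2 -> 1, IO stays suspended
 *	thread B: drbd_resume_io(device);    suspend_cnt 1 -> 0, misc_wait is woken
 *
 * With a single flag, thread A's resume would already have re-enabled
 * application IO while thread B still relied on it being suspended. */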
/** /**
...@@ -871,27 +891,32 @@ void drbd_resume_io(struct drbd_device *device) ...@@ -871,27 +891,32 @@ void drbd_resume_io(struct drbd_device *device)
enum determine_dev_size enum determine_dev_size
drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
{ {
-sector_t prev_first_sect, prev_size; /* previous meta location */
-sector_t la_size_sect, u_size;
+struct md_offsets_and_sizes {
+u64 last_agreed_sect;
+u64 md_offset;
+s32 al_offset;
+s32 bm_offset;
+u32 md_size_sect;
+u32 al_stripes;
+u32 al_stripe_size_4k;
+} prev;
+sector_t u_size, size;
struct drbd_md *md = &device->ldev->md; struct drbd_md *md = &device->ldev->md;
u32 prev_al_stripe_size_4k;
u32 prev_al_stripes;
sector_t size;
char ppb[10]; char ppb[10];
void *buffer; void *buffer;
int md_moved, la_size_changed; int md_moved, la_size_changed;
enum determine_dev_size rv = DS_UNCHANGED; enum determine_dev_size rv = DS_UNCHANGED;
-/* race:
- * application request passes inc_ap_bio,
- * but then cannot get an AL-reference.
- * this function later may wait on ap_bio_cnt == 0. -> deadlock.
- *
- * to avoid that:
- * Suspend IO right here.
- * still lock the act_log to not trigger ASSERTs there.
- */
+/* We may change the on-disk offsets of our meta data below. Lock out
+ * anything that may cause meta data IO, to avoid acting on incomplete
+ * layout changes or scribbling over meta data that is in the process
+ * of being moved.
+ *
+ * Move is not exactly correct, btw, currently we have all our meta
+ * data in core memory, to "move" it we just write it all out, there
+ * are no reads. */
drbd_suspend_io(device); drbd_suspend_io(device);
buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
if (!buffer) { if (!buffer) {
...@@ -899,19 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct ...@@ -899,19 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
return DS_ERROR; return DS_ERROR;
} }
-/* no wait necessary anymore, actually we could assert that */
-wait_event(device->al_wait, lc_try_lock(device->act_log));
-prev_first_sect = drbd_md_first_sector(device->ldev);
-prev_size = device->ldev->md.md_size_sect;
-la_size_sect = device->ldev->md.la_size_sect;
+/* remember current offset and sizes */
+prev.last_agreed_sect = md->la_size_sect;
+prev.md_offset = md->md_offset;
+prev.al_offset = md->al_offset;
+prev.bm_offset = md->bm_offset;
+prev.md_size_sect = md->md_size_sect;
+prev.al_stripes = md->al_stripes;
+prev.al_stripe_size_4k = md->al_stripe_size_4k;
if (rs) { if (rs) {
/* rs is non NULL if we should change the AL layout only */ /* rs is non NULL if we should change the AL layout only */
prev_al_stripes = md->al_stripes;
prev_al_stripe_size_4k = md->al_stripe_size_4k;
md->al_stripes = rs->al_stripes; md->al_stripes = rs->al_stripes;
md->al_stripe_size_4k = rs->al_stripe_size / 4; md->al_stripe_size_4k = rs->al_stripe_size / 4;
md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
...@@ -924,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct ...@@ -924,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
rcu_read_unlock(); rcu_read_unlock();
size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
-if (size < la_size_sect) {
+if (size < prev.last_agreed_sect) {
if (rs && u_size == 0) { if (rs && u_size == 0) {
/* Remove "rs &&" later. This check should always be active, but /* Remove "rs &&" later. This check should always be active, but
right now the receiver expects the permissive behavior */ right now the receiver expects the permissive behavior */
...@@ -945,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct ...@@ -945,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
if (unlikely(err)) { if (unlikely(err)) {
/* currently there is only one error: ENOMEM! */ /* currently there is only one error: ENOMEM! */
-size = drbd_bm_capacity(device)>>1;
+size = drbd_bm_capacity(device);
if (size == 0) { if (size == 0) {
drbd_err(device, "OUT OF MEMORY! " drbd_err(device, "OUT OF MEMORY! "
"Could not allocate bitmap!\n"); "Could not allocate bitmap!\n");
} else { } else {
drbd_err(device, "BM resizing failed. " drbd_err(device, "BM resizing failed. "
"Leaving size unchanged at size = %lu KB\n", "Leaving size unchanged\n");
(unsigned long)size);
} }
rv = DS_ERROR; rv = DS_ERROR;
} }
/* racy, see comments above. */ /* racy, see comments above. */
drbd_set_my_capacity(device, size); drbd_set_my_capacity(device, size);
-device->ldev->md.la_size_sect = size;
+md->la_size_sect = size;
drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
(unsigned long long)size>>1); (unsigned long long)size>>1);
} }
if (rv <= DS_ERROR) if (rv <= DS_ERROR)
goto err_out; goto err_out;
-la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
-md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
-|| prev_size != device->ldev->md.md_size_sect;
+la_size_changed = (prev.last_agreed_sect != md->la_size_sect);
+md_moved = prev.md_offset != md->md_offset
+|| prev.md_size_sect != md->md_size_sect;
if (la_size_changed || md_moved || rs) { if (la_size_changed || md_moved || rs) {
u32 prev_flags; u32 prev_flags;
...@@ -977,20 +999,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct ...@@ -977,20 +999,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
* Clear the timer, to avoid scary "timer expired!" messages, * Clear the timer, to avoid scary "timer expired!" messages,
* "Superblock" is written out at least twice below, anyways. */ * "Superblock" is written out at least twice below, anyways. */
del_timer(&device->md_sync_timer); del_timer(&device->md_sync_timer);
drbd_al_shrink(device); /* All extents inactive. */
/* We won't change the "al-extents" setting, we just may need
* to move the on-disk location of the activity log ringbuffer.
* Lock for transaction is good enough, it may well be "dirty"
* or even "starving". */
wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log));
/* mark current on-disk bitmap and activity log as unreliable */
prev_flags = md->flags; prev_flags = md->flags;
md->flags &= ~MDF_PRIMARY_IND; md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;
drbd_md_write(device, buffer); drbd_md_write(device, buffer);
drbd_al_initialize(device, buffer);
drbd_info(device, "Writing the whole bitmap, %s\n", drbd_info(device, "Writing the whole bitmap, %s\n",
la_size_changed && md_moved ? "size changed and md moved" : la_size_changed && md_moved ? "size changed and md moved" :
la_size_changed ? "size changed" : "md moved"); la_size_changed ? "size changed" : "md moved");
/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
"size changed", BM_LOCKED_MASK); "size changed", BM_LOCKED_MASK);
drbd_initialize_al(device, buffer);
/* on-disk bitmap and activity log is authoritative again
* (unless there was an IO error meanwhile...) */
md->flags = prev_flags; md->flags = prev_flags;
drbd_md_write(device, buffer); drbd_md_write(device, buffer);
...@@ -999,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct ...@@ -999,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
md->al_stripes, md->al_stripe_size_4k * 4); md->al_stripes, md->al_stripe_size_4k * 4);
} }
if (size > la_size_sect) if (size > prev.last_agreed_sect)
rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO;
if (size < la_size_sect) if (size < prev.last_agreed_sect)
rv = DS_SHRUNK; rv = DS_SHRUNK;
if (0) { if (0) {
err_out: err_out:
if (rs) { /* restore previous offset and sizes */
md->al_stripes = prev_al_stripes; md->la_size_sect = prev.last_agreed_sect;
md->al_stripe_size_4k = prev_al_stripe_size_4k; md->md_offset = prev.md_offset;
md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; md->al_offset = prev.al_offset;
md->bm_offset = prev.bm_offset;
drbd_md_set_sector_offsets(device, device->ldev); md->md_size_sect = prev.md_size_sect;
} md->al_stripes = prev.al_stripes;
md->al_stripe_size_4k = prev.al_stripe_size_4k;
md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k;
} }
lc_unlock(device->act_log); lc_unlock(device->act_log);
wake_up(&device->al_wait); wake_up(&device->al_wait);
...@@ -1115,8 +1148,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) ...@@ -1115,8 +1148,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
lc_destroy(n); lc_destroy(n);
return -EBUSY; return -EBUSY;
} else { } else {
if (t) lc_destroy(t);
lc_destroy(t);
} }
drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elements */ drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elements */
return 0; return 0;
...@@ -1151,21 +1183,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi ...@@ -1151,21 +1183,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
if (b) { if (b) {
struct drbd_connection *connection = first_peer_device(device)->connection; struct drbd_connection *connection = first_peer_device(device)->connection;
blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
if (blk_queue_discard(b) && if (blk_queue_discard(b) &&
(connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
/* For now, don't allow more than one activity log extent worth of data /* We don't care, stacking below should fix it for the local device.
* to be discarded in one go. We may need to rework drbd_al_begin_io() * Whether or not it is a suitable granularity on the remote device
* to allow for even larger discard ranges */ * is not our problem, really. If you care, you need to
blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); * use devices with similar topology on all peers. */
q->limits.discard_granularity = 512;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
/* REALLY? Is stacking secdiscard "legal"? */
if (blk_queue_secdiscard(b))
queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
} else { } else {
blk_queue_max_discard_sectors(q, 0); blk_queue_max_discard_sectors(q, 0);
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); q->limits.discard_granularity = 0;
} }
blk_queue_stack_limits(q, b); blk_queue_stack_limits(q, b);
...@@ -1177,6 +1208,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi ...@@ -1177,6 +1208,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
} }
} }
/* To avoid confusion, if this queue does not support discard, clear
* max_discard_sectors, which is what lsblk -D reports to the user. */
if (!blk_queue_discard(q)) {
blk_queue_max_discard_sectors(q, 0);
q->limits.discard_granularity = 0;
}
} }
void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
...@@ -1241,8 +1278,8 @@ static void conn_reconfig_done(struct drbd_connection *connection) ...@@ -1241,8 +1278,8 @@ static void conn_reconfig_done(struct drbd_connection *connection)
connection->cstate == C_STANDALONE; connection->cstate == C_STANDALONE;
spin_unlock_irq(&connection->resource->req_lock); spin_unlock_irq(&connection->resource->req_lock);
if (stop_threads) { if (stop_threads) {
/* asender is implicitly stopped by receiver /* ack_receiver thread and ack_sender workqueue are implicitly
* in conn_disconnect() */ * stopped by receiver in conn_disconnect() */
drbd_thread_stop(&connection->receiver); drbd_thread_stop(&connection->receiver);
drbd_thread_stop(&connection->worker); drbd_thread_stop(&connection->worker);
} }
...@@ -1389,13 +1426,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) ...@@ -1389,13 +1426,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
goto fail_unlock; goto fail_unlock;
} }
write_lock_irq(&global_state_lock); lock_all_resources();
retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
if (retcode == NO_ERROR) { if (retcode == NO_ERROR) {
rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
drbd_resync_after_changed(device); drbd_resync_after_changed(device);
} }
write_unlock_irq(&global_state_lock); unlock_all_resources();
if (retcode != NO_ERROR) if (retcode != NO_ERROR)
goto fail_unlock; goto fail_unlock;
...@@ -1418,7 +1455,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) ...@@ -1418,7 +1455,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
set_bit(MD_NO_FUA, &device->flags); set_bit(MD_NO_FUA, &device->flags);
if (write_ordering_changed(old_disk_conf, new_disk_conf)) if (write_ordering_changed(old_disk_conf, new_disk_conf))
drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
drbd_md_sync(device); drbd_md_sync(device);
...@@ -1449,6 +1486,88 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) ...@@ -1449,6 +1486,88 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
return 0; return 0;
} }
static struct block_device *open_backing_dev(struct drbd_device *device,
const char *bdev_path, void *claim_ptr, bool do_bd_link)
{
struct block_device *bdev;
int err = 0;
bdev = blkdev_get_by_path(bdev_path,
FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr);
if (IS_ERR(bdev)) {
drbd_err(device, "open(\"%s\") failed with %ld\n",
bdev_path, PTR_ERR(bdev));
return bdev;
}
if (!do_bd_link)
return bdev;
err = bd_link_disk_holder(bdev, device->vdisk);
if (err) {
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n",
bdev_path, err);
bdev = ERR_PTR(err);
}
return bdev;
}
static int open_backing_devices(struct drbd_device *device,
struct disk_conf *new_disk_conf,
struct drbd_backing_dev *nbc)
{
struct block_device *bdev;
bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true);
if (IS_ERR(bdev))
return ERR_OPEN_DISK;
nbc->backing_bdev = bdev;
/*
* meta_dev_idx >= 0: external fixed size, possibly multiple
* drbd sharing one meta device. TODO in that case, paranoia
* check that [md_bdev, meta_dev_idx] is not yet used by some
* other drbd minor! (if you use drbd.conf + drbdadm, that
* should check it for you already; but if you don't, or
* someone fooled it, we need to double check here)
*/
bdev = open_backing_dev(device, new_disk_conf->meta_dev,
/* claim ptr: device, if claimed exclusively; shared drbd_m_holder,
* if potentially shared with other drbd minors */
(new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder,
/* avoid double bd_claim_by_disk() for the same (source,target) tuple,
* as would happen with internal metadata. */
(new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT &&
new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL));
if (IS_ERR(bdev))
return ERR_OPEN_MD_DISK;
nbc->md_bdev = bdev;
return NO_ERROR;
}
static void close_backing_dev(struct drbd_device *device, struct block_device *bdev,
bool do_bd_unlink)
{
if (!bdev)
return;
if (do_bd_unlink)
bd_unlink_disk_holder(bdev, device->vdisk);
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev)
{
if (ldev == NULL)
return;
close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev);
close_backing_dev(device, ldev->backing_bdev, true);
kfree(ldev->disk_conf);
kfree(ldev);
}
int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
{ {
struct drbd_config_context adm_ctx; struct drbd_config_context adm_ctx;
...@@ -1462,7 +1581,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) ...@@ -1462,7 +1581,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
sector_t min_md_device_sectors; sector_t min_md_device_sectors;
struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
struct disk_conf *new_disk_conf = NULL; struct disk_conf *new_disk_conf = NULL;
struct block_device *bdev;
struct lru_cache *resync_lru = NULL; struct lru_cache *resync_lru = NULL;
struct fifo_buffer *new_plan = NULL; struct fifo_buffer *new_plan = NULL;
union drbd_state ns, os; union drbd_state ns, os;
...@@ -1478,7 +1596,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) ...@@ -1478,7 +1596,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
device = adm_ctx.device; device = adm_ctx.device;
mutex_lock(&adm_ctx.resource->adm_mutex); mutex_lock(&adm_ctx.resource->adm_mutex);
peer_device = first_peer_device(device); peer_device = first_peer_device(device);
connection = peer_device ? peer_device->connection : NULL; connection = peer_device->connection;
conn_reconfig_start(connection); conn_reconfig_start(connection);
/* if you want to reconfigure, please tear down first */ /* if you want to reconfigure, please tear down first */
...@@ -1539,12 +1657,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) ...@@ -1539,12 +1657,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
goto fail; goto fail;
} }
write_lock_irq(&global_state_lock);
retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
write_unlock_irq(&global_state_lock);
if (retcode != NO_ERROR)
goto fail;
rcu_read_lock(); rcu_read_lock();
nc = rcu_dereference(connection->net_conf); nc = rcu_dereference(connection->net_conf);
if (nc) { if (nc) {
...@@ -1556,35 +1668,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) ...@@ -1556,35 +1668,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
} }
rcu_read_unlock(); rcu_read_unlock();
bdev = blkdev_get_by_path(new_disk_conf->backing_dev, retcode = open_backing_devices(device, new_disk_conf, nbc);
FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); if (retcode != NO_ERROR)
if (IS_ERR(bdev)) {
drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
PTR_ERR(bdev));
retcode = ERR_OPEN_DISK;
goto fail;
}
nbc->backing_bdev = bdev;
/*
* meta_dev_idx >= 0: external fixed size, possibly multiple
* drbd sharing one meta device. TODO in that case, paranoia
* check that [md_bdev, meta_dev_idx] is not yet used by some
* other drbd minor! (if you use drbd.conf + drbdadm, that
* should check it for you already; but if you don't, or
* someone fooled it, we need to double check here)
*/
bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL,
(new_disk_conf->meta_dev_idx < 0) ?
(void *)device : (void *)drbd_m_holder);
if (IS_ERR(bdev)) {
drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
PTR_ERR(bdev));
retcode = ERR_OPEN_MD_DISK;
goto fail; goto fail;
}
nbc->md_bdev = bdev;
if ((nbc->backing_bdev == nbc->md_bdev) != if ((nbc->backing_bdev == nbc->md_bdev) !=
(new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
...@@ -1707,6 +1793,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) ...@@ -1707,6 +1793,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
goto force_diskless_dec; goto force_diskless_dec;
} }
lock_all_resources();
retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
if (retcode != NO_ERROR) {
unlock_all_resources();
goto force_diskless_dec;
}
/* Reset the "barriers don't work" bits here, then force meta data to /* Reset the "barriers don't work" bits here, then force meta data to
* be written, to ensure we determine if barriers are supported. */ * be written, to ensure we determine if barriers are supported. */
if (new_disk_conf->md_flushes) if (new_disk_conf->md_flushes)
...@@ -1727,7 +1820,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) ...@@ -1727,7 +1820,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
new_disk_conf = NULL; new_disk_conf = NULL;
new_plan = NULL; new_plan = NULL;
drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); drbd_resync_after_changed(device);
drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH);
unlock_all_resources();
if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
set_bit(CRASHED_PRIMARY, &device->flags); set_bit(CRASHED_PRIMARY, &device->flags);
...@@ -1875,12 +1970,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) ...@@ -1875,12 +1970,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
fail: fail:
conn_reconfig_done(connection); conn_reconfig_done(connection);
if (nbc) { if (nbc) {
if (nbc->backing_bdev) close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev);
blkdev_put(nbc->backing_bdev, close_backing_dev(device, nbc->backing_bdev, true);
FMODE_READ | FMODE_WRITE | FMODE_EXCL);
if (nbc->md_bdev)
blkdev_put(nbc->md_bdev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL);
kfree(nbc); kfree(nbc);
} }
kfree(new_disk_conf); kfree(new_disk_conf);
...@@ -1895,6 +1986,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) ...@@ -1895,6 +1986,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
static int adm_detach(struct drbd_device *device, int force) static int adm_detach(struct drbd_device *device, int force)
{ {
enum drbd_state_rv retcode; enum drbd_state_rv retcode;
void *buffer;
int ret; int ret;
if (force) { if (force) {
...@@ -1905,13 +1997,16 @@ static int adm_detach(struct drbd_device *device, int force) ...@@ -1905,13 +1997,16 @@ static int adm_detach(struct drbd_device *device, int force)
} }
drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
retcode = drbd_request_state(device, NS(disk, D_FAILED)); if (buffer) {
drbd_md_put_buffer(device); retcode = drbd_request_state(device, NS(disk, D_FAILED));
drbd_md_put_buffer(device);
} else /* already <= D_FAILED */
retcode = SS_NOTHING_TO_DO;
/* D_FAILED will transition to DISKLESS. */ /* D_FAILED will transition to DISKLESS. */
drbd_resume_io(device);
ret = wait_event_interruptible(device->misc_wait, ret = wait_event_interruptible(device->misc_wait,
device->state.disk != D_FAILED); device->state.disk != D_FAILED);
drbd_resume_io(device);
if ((int)retcode == (int)SS_IS_DISKLESS) if ((int)retcode == (int)SS_IS_DISKLESS)
retcode = SS_NOTHING_TO_DO; retcode = SS_NOTHING_TO_DO;
if (ret) if (ret)
...@@ -2245,8 +2340,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) ...@@ -2245,8 +2340,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
return 0; return 0;
} }
static void connection_to_info(struct connection_info *info,
struct drbd_connection *connection)
{
info->conn_connection_state = connection->cstate;
info->conn_role = conn_highest_peer(connection);
}
static void peer_device_to_info(struct peer_device_info *info,
struct drbd_peer_device *peer_device)
{
struct drbd_device *device = peer_device->device;
info->peer_repl_state =
max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn);
info->peer_disk_state = device->state.pdsk;
info->peer_resync_susp_user = device->state.user_isp;
info->peer_resync_susp_peer = device->state.peer_isp;
info->peer_resync_susp_dependency = device->state.aftr_isp;
}
int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
{ {
struct connection_info connection_info;
enum drbd_notification_type flags;
unsigned int peer_devices = 0;
struct drbd_config_context adm_ctx; struct drbd_config_context adm_ctx;
struct drbd_peer_device *peer_device; struct drbd_peer_device *peer_device;
struct net_conf *old_net_conf, *new_net_conf = NULL; struct net_conf *old_net_conf, *new_net_conf = NULL;
...@@ -2347,6 +2465,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) ...@@ -2347,6 +2465,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
connection->peer_addr_len = nla_len(adm_ctx.peer_addr); connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
idr_for_each_entry(&connection->peer_devices, peer_device, i) {
peer_devices++;
}
connection_to_info(&connection_info, connection);
flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
mutex_lock(&notification_mutex);
notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags);
idr_for_each_entry(&connection->peer_devices, peer_device, i) {
struct peer_device_info peer_device_info;
peer_device_to_info(&peer_device_info, peer_device);
flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags);
}
mutex_unlock(&notification_mutex);
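A note on the flag arithmetic just above: the connection event and all but the last peer-device event carry NOTIFY_CONTINUES, so a listener can tell that the burst of CREATE notifications describes one logical change; the post-decrement makes exactly the final event drop the flag. Below is a minimal, self-contained sketch of that pattern (the flag values and the three-volume count are made-up stand-ins, not the kernel definitions):

#include <stdio.h>

#define NOTIFY_CREATE    0x1      /* stand-in values, not the DRBD enums */
#define NOTIFY_CONTINUES 0x8000

int main(void)
{
	unsigned int peer_devices = 3;  /* pretend this connection has 3 volumes */
	unsigned int n = peer_devices, flags, i;

	/* one event for the connection itself ... */
	flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
	printf("connection  flags=0x%x\n", NOTIFY_CREATE | flags);

	/* ... then one per peer device; only the last event in the burst
	 * has NOTIFY_CONTINUES cleared, marking the notification complete */
	for (i = 0; i < n; i++) {
		flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
		printf("peer device %u  flags=0x%x\n", i, NOTIFY_CREATE | flags);
	}
	return 0;
}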
mutex_unlock(&adm_ctx.resource->conf_update); mutex_unlock(&adm_ctx.resource->conf_update);
rcu_read_lock(); rcu_read_lock();
...@@ -2428,6 +2562,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection ...@@ -2428,6 +2562,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection
drbd_err(connection, drbd_err(connection,
"unexpected rv2=%d in conn_try_disconnect()\n", "unexpected rv2=%d in conn_try_disconnect()\n",
rv2); rv2);
/* Unlike in DRBD 9, the state engine has generated
* NOTIFY_DESTROY events before clearing connection->net_conf. */
} }
return rv; return rv;
} }
...@@ -2585,6 +2721,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) ...@@ -2585,6 +2721,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
mutex_unlock(&device->resource->conf_update); mutex_unlock(&device->resource->conf_update);
synchronize_rcu(); synchronize_rcu();
kfree(old_disk_conf); kfree(old_disk_conf);
new_disk_conf = NULL;
} }
ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
...@@ -2618,6 +2755,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) ...@@ -2618,6 +2755,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
fail_ldev: fail_ldev:
put_ldev(device); put_ldev(device);
kfree(new_disk_conf);
goto fail; goto fail;
} }
...@@ -2855,7 +2993,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) ...@@ -2855,7 +2993,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
mutex_lock(&adm_ctx.resource->adm_mutex); mutex_lock(&adm_ctx.resource->adm_mutex);
device = adm_ctx.device; device = adm_ctx.device;
if (test_bit(NEW_CUR_UUID, &device->flags)) { if (test_bit(NEW_CUR_UUID, &device->flags)) {
drbd_uuid_new_current(device); if (get_ldev_if_state(device, D_ATTACHING)) {
drbd_uuid_new_current(device);
put_ldev(device);
} else {
/* This is effectively a multi-stage "forced down".
* The NEW_CUR_UUID bit is supposedly only set, if we
* lost the replication connection, and are configured
* to freeze IO and wait for some fence-peer handler.
* So we still don't have a replication connection.
* And now we don't have a local disk either. After
* resume, we will fail all pending and new IO, because
* we don't have any data anymore. Which means we will
* eventually be able to terminate all users of this
* device, and then take it down. By bumping the
* "effective" data uuid, we make sure that you really
* need to tear down before you reconfigure, we will
* then refuse to re-connect or re-attach (because no
* matching real data uuid exists).
*/
u64 val;
get_random_bytes(&val, sizeof(u64));
drbd_set_ed_uuid(device, val);
drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n");
}
clear_bit(NEW_CUR_UUID, &device->flags); clear_bit(NEW_CUR_UUID, &device->flags);
} }
drbd_suspend_io(device); drbd_suspend_io(device);
...@@ -2909,6 +3070,486 @@ static int nla_put_drbd_cfg_context(struct sk_buff *skb, ...@@ -2909,6 +3070,486 @@ static int nla_put_drbd_cfg_context(struct sk_buff *skb,
return -EMSGSIZE; return -EMSGSIZE;
} }
/*
* The generic netlink dump callbacks are called outside the genl_lock(), so
* they cannot use the simple attribute parsing code which uses global
* attribute tables.
*/
static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr)
{
const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
struct nlattr *nla;
nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen),
DRBD_NLA_CFG_CONTEXT);
if (!nla)
return NULL;
return drbd_nla_find_nested(maxtype, nla, __nla_type(attr));
}
static void resource_to_info(struct resource_info *, struct drbd_resource *);
int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb)
{
struct drbd_genlmsghdr *dh;
struct drbd_resource *resource;
struct resource_info resource_info;
struct resource_statistics resource_statistics;
int err;
rcu_read_lock();
if (cb->args[0]) {
for_each_resource_rcu(resource, &drbd_resources)
if (resource == (struct drbd_resource *)cb->args[0])
goto found_resource;
err = 0; /* resource was probably deleted */
goto out;
}
resource = list_entry(&drbd_resources,
struct drbd_resource, resources);
found_resource:
list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) {
goto put_result;
}
err = 0;
goto out;
put_result:
dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, &drbd_genl_family,
NLM_F_MULTI, DRBD_ADM_GET_RESOURCES);
err = -ENOMEM;
if (!dh)
goto out;
dh->minor = -1U;
dh->ret_code = NO_ERROR;
err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL);
if (err)
goto out;
err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
resource_to_info(&resource_info, resource);
err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
resource_statistics.res_stat_write_ordering = resource->write_ordering;
err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
cb->args[0] = (long)resource;
genlmsg_end(skb, dh);
err = 0;
out:
rcu_read_unlock();
if (err)
return err;
return skb->len;
}
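For readers less familiar with generic netlink dumps, the callback above emits at most one resource per invocation and parks a cursor in cb->args[0]; the netlink core keeps re-invoking it until no more data is added. Below is a minimal userspace sketch of that one-item-per-call shape, with a plain linked list standing in for drbd_resources and a long array standing in for struct netlink_callback; the names are illustrative only, and the real code additionally re-validates the cursor against the live RCU-protected list because the remembered resource may have been deleted between calls.

#include <stdio.h>

struct item { const char *name; struct item *next; };

struct dump_ctx { long args[8]; };        /* stand-in for struct netlink_callback */

/* returns 1 while an entry was emitted, 0 once the dump is finished */
static int dump_one(struct dump_ctx *cb, struct item *head)
{
	struct item *last = (struct item *)cb->args[0];
	struct item *it = last ? last->next : head;

	if (!it)
		return 0;                        /* no more items */
	printf("dumped %s\n", it->name);         /* ~ genlmsg_put() + *_to_skb() */
	cb->args[0] = (long)it;                  /* remember where to resume */
	return 1;
}

int main(void)
{
	struct item r2 = { "r2", NULL }, r1 = { "r1", &r2 }, r0 = { "r0", &r1 };
	struct dump_ctx cb = { { 0 } };

	while (dump_one(&cb, &r0))
		;                                /* the netlink core loops like this */
	return 0;
}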
static void device_to_statistics(struct device_statistics *s,
struct drbd_device *device)
{
memset(s, 0, sizeof(*s));
s->dev_upper_blocked = !may_inc_ap_bio(device);
if (get_ldev(device)) {
struct drbd_md *md = &device->ldev->md;
u64 *history_uuids = (u64 *)s->history_uuids;
struct request_queue *q;
int n;
spin_lock_irq(&md->uuid_lock);
s->dev_current_uuid = md->uuid[UI_CURRENT];
BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1);
for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++)
history_uuids[n] = md->uuid[UI_HISTORY_START + n];
for (; n < HISTORY_UUIDS; n++)
history_uuids[n] = 0;
s->history_uuids_len = HISTORY_UUIDS;
spin_unlock_irq(&md->uuid_lock);
s->dev_disk_flags = md->flags;
q = bdev_get_queue(device->ldev->backing_bdev);
s->dev_lower_blocked =
bdi_congested(&q->backing_dev_info,
(1 << WB_async_congested) |
(1 << WB_sync_congested));
put_ldev(device);
}
s->dev_size = drbd_get_capacity(device->this_bdev);
s->dev_read = device->read_cnt;
s->dev_write = device->writ_cnt;
s->dev_al_writes = device->al_writ_cnt;
s->dev_bm_writes = device->bm_writ_cnt;
s->dev_upper_pending = atomic_read(&device->ap_bio_cnt);
s->dev_lower_pending = atomic_read(&device->local_cnt);
s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags);
s->dev_exposed_data_uuid = device->ed_uuid;
}
static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr)
{
if (cb->args[0]) {
struct drbd_resource *resource =
(struct drbd_resource *)cb->args[0];
kref_put(&resource->kref, drbd_destroy_resource);
}
return 0;
}
int drbd_adm_dump_devices_done(struct netlink_callback *cb)
{
	return put_resource_in_arg0(cb, 7);
}
static void device_to_info(struct device_info *, struct drbd_device *);
int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb)
{
struct nlattr *resource_filter;
struct drbd_resource *resource;
struct drbd_device *uninitialized_var(device);
int minor, err, retcode;
struct drbd_genlmsghdr *dh;
struct device_info device_info;
struct device_statistics device_statistics;
struct idr *idr_to_search;
resource = (struct drbd_resource *)cb->args[0];
if (!cb->args[0] && !cb->args[1]) {
resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
if (resource_filter) {
retcode = ERR_RES_NOT_KNOWN;
resource = drbd_find_resource(nla_data(resource_filter));
if (!resource)
goto put_result;
cb->args[0] = (long)resource;
}
}
rcu_read_lock();
minor = cb->args[1];
idr_to_search = resource ? &resource->devices : &drbd_devices;
device = idr_get_next(idr_to_search, &minor);
if (!device) {
err = 0;
goto out;
}
idr_for_each_entry_continue(idr_to_search, device, minor) {
retcode = NO_ERROR;
goto put_result; /* only one iteration */
}
err = 0;
goto out; /* no more devices */
put_result:
dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, &drbd_genl_family,
NLM_F_MULTI, DRBD_ADM_GET_DEVICES);
err = -ENOMEM;
if (!dh)
goto out;
dh->ret_code = retcode;
dh->minor = -1U;
if (retcode == NO_ERROR) {
dh->minor = device->minor;
err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device);
if (err)
goto out;
if (get_ldev(device)) {
struct disk_conf *disk_conf =
rcu_dereference(device->ldev->disk_conf);
err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN));
put_ldev(device);
if (err)
goto out;
}
device_to_info(&device_info, device);
err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
device_to_statistics(&device_statistics, device);
err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
cb->args[1] = minor + 1;
}
genlmsg_end(skb, dh);
err = 0;
out:
rcu_read_unlock();
if (err)
return err;
return skb->len;
}
int drbd_adm_dump_connections_done(struct netlink_callback *cb)
{
return put_resource_in_arg0(cb, 6);
}
enum { SINGLE_RESOURCE, ITERATE_RESOURCES };
int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb)
{
struct nlattr *resource_filter;
struct drbd_resource *resource = NULL, *next_resource;
struct drbd_connection *uninitialized_var(connection);
int err = 0, retcode;
struct drbd_genlmsghdr *dh;
struct connection_info connection_info;
struct connection_statistics connection_statistics;
rcu_read_lock();
resource = (struct drbd_resource *)cb->args[0];
if (!cb->args[0]) {
resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
if (resource_filter) {
retcode = ERR_RES_NOT_KNOWN;
resource = drbd_find_resource(nla_data(resource_filter));
if (!resource)
goto put_result;
cb->args[0] = (long)resource;
cb->args[1] = SINGLE_RESOURCE;
}
}
if (!resource) {
if (list_empty(&drbd_resources))
goto out;
resource = list_first_entry(&drbd_resources, struct drbd_resource, resources);
kref_get(&resource->kref);
cb->args[0] = (long)resource;
cb->args[1] = ITERATE_RESOURCES;
}
next_resource:
rcu_read_unlock();
mutex_lock(&resource->conf_update);
rcu_read_lock();
if (cb->args[2]) {
for_each_connection_rcu(connection, resource)
if (connection == (struct drbd_connection *)cb->args[2])
goto found_connection;
/* connection was probably deleted */
goto no_more_connections;
}
connection = list_entry(&resource->connections, struct drbd_connection, connections);
found_connection:
list_for_each_entry_continue_rcu(connection, &resource->connections, connections) {
if (!has_net_conf(connection))
continue;
retcode = NO_ERROR;
goto put_result; /* only one iteration */
}
no_more_connections:
if (cb->args[1] == ITERATE_RESOURCES) {
for_each_resource_rcu(next_resource, &drbd_resources) {
if (next_resource == resource)
goto found_resource;
}
/* resource was probably deleted */
}
goto out;
found_resource:
list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) {
mutex_unlock(&resource->conf_update);
kref_put(&resource->kref, drbd_destroy_resource);
resource = next_resource;
kref_get(&resource->kref);
cb->args[0] = (long)resource;
cb->args[2] = 0;
goto next_resource;
}
goto out; /* no more resources */
put_result:
dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, &drbd_genl_family,
NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS);
err = -ENOMEM;
if (!dh)
goto out;
dh->ret_code = retcode;
dh->minor = -1U;
if (retcode == NO_ERROR) {
struct net_conf *net_conf;
err = nla_put_drbd_cfg_context(skb, resource, connection, NULL);
if (err)
goto out;
net_conf = rcu_dereference(connection->net_conf);
if (net_conf) {
err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
}
connection_to_info(&connection_info, connection);
err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
cb->args[2] = (long)connection;
}
genlmsg_end(skb, dh);
err = 0;
out:
rcu_read_unlock();
if (resource)
mutex_unlock(&resource->conf_update);
if (err)
return err;
return skb->len;
}
enum mdf_peer_flag {
MDF_PEER_CONNECTED = 1 << 0,
MDF_PEER_OUTDATED = 1 << 1,
MDF_PEER_FENCING = 1 << 2,
MDF_PEER_FULL_SYNC = 1 << 3,
};
static void peer_device_to_statistics(struct peer_device_statistics *s,
struct drbd_peer_device *peer_device)
{
struct drbd_device *device = peer_device->device;
memset(s, 0, sizeof(*s));
s->peer_dev_received = device->recv_cnt;
s->peer_dev_sent = device->send_cnt;
s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) +
atomic_read(&device->rs_pending_cnt);
s->peer_dev_unacked = atomic_read(&device->unacked_cnt);
s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9);
s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9);
if (get_ldev(device)) {
struct drbd_md *md = &device->ldev->md;
spin_lock_irq(&md->uuid_lock);
s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP];
spin_unlock_irq(&md->uuid_lock);
s->peer_dev_flags =
(drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ?
MDF_PEER_CONNECTED : 0) +
(drbd_md_test_flag(device->ldev, MDF_CONSISTENT) &&
!drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ?
MDF_PEER_OUTDATED : 0) +
/* FIXME: MDF_PEER_FENCING? */
(drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ?
MDF_PEER_FULL_SYNC : 0);
put_ldev(device);
}
}
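The << (BM_BLOCK_SHIFT - 9) above converts a count of bitmap bits into 512-byte sectors: each bitmap bit covers one block of 2^BM_BLOCK_SHIFT bytes and a sector is 2^9 bytes. Assuming the usual 4 KiB bitmap granularity (BM_BLOCK_SHIFT = 12), a bitmap weight of 100 out-of-sync bits becomes 100 << 3 = 800 sectors, i.e. 400 KiB reported in peer_dev_out_of_sync.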
int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb)
{
return put_resource_in_arg0(cb, 9);
}
int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb)
{
struct nlattr *resource_filter;
struct drbd_resource *resource;
struct drbd_device *uninitialized_var(device);
struct drbd_peer_device *peer_device = NULL;
int minor, err, retcode;
struct drbd_genlmsghdr *dh;
struct idr *idr_to_search;
resource = (struct drbd_resource *)cb->args[0];
if (!cb->args[0] && !cb->args[1]) {
resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
if (resource_filter) {
retcode = ERR_RES_NOT_KNOWN;
resource = drbd_find_resource(nla_data(resource_filter));
if (!resource)
goto put_result;
}
cb->args[0] = (long)resource;
}
rcu_read_lock();
minor = cb->args[1];
idr_to_search = resource ? &resource->devices : &drbd_devices;
device = idr_find(idr_to_search, minor);
if (!device) {
next_device:
minor++;
cb->args[2] = 0;
device = idr_get_next(idr_to_search, &minor);
if (!device) {
err = 0;
goto out;
}
}
if (cb->args[2]) {
for_each_peer_device(peer_device, device)
if (peer_device == (struct drbd_peer_device *)cb->args[2])
goto found_peer_device;
/* peer device was probably deleted */
goto next_device;
}
/* Make peer_device point to the list head (not the first entry). */
peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices);
found_peer_device:
list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) {
if (!has_net_conf(peer_device->connection))
continue;
retcode = NO_ERROR;
goto put_result; /* only one iteration */
}
goto next_device;
put_result:
dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, &drbd_genl_family,
NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES);
err = -ENOMEM;
if (!dh)
goto out;
dh->ret_code = retcode;
dh->minor = -1U;
if (retcode == NO_ERROR) {
struct peer_device_info peer_device_info;
struct peer_device_statistics peer_device_statistics;
dh->minor = minor;
err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device);
if (err)
goto out;
peer_device_to_info(&peer_device_info, peer_device);
err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
peer_device_to_statistics(&peer_device_statistics, peer_device);
err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
if (err)
goto out;
cb->args[1] = minor;
cb->args[2] = (long)peer_device;
}
genlmsg_end(skb, dh);
err = 0;
out:
rcu_read_unlock();
if (err)
return err;
return skb->len;
}
/* /*
* Return the connection of @resource if @resource has exactly one connection. * Return the connection of @resource if @resource has exactly one connection.
*/ */
...@@ -3414,8 +4055,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx) ...@@ -3414,8 +4055,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx)
return NO_ERROR; return NO_ERROR;
} }
static void resource_to_info(struct resource_info *info,
struct drbd_resource *resource)
{
info->res_role = conn_highest_role(first_connection(resource));
info->res_susp = resource->susp;
info->res_susp_nod = resource->susp_nod;
info->res_susp_fen = resource->susp_fen;
}
int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
{ {
struct drbd_connection *connection;
struct drbd_config_context adm_ctx; struct drbd_config_context adm_ctx;
enum drbd_ret_code retcode; enum drbd_ret_code retcode;
struct res_opts res_opts; struct res_opts res_opts;
...@@ -3449,13 +4100,33 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) ...@@ -3449,13 +4100,33 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
} }
/* not yet safe for genl_family.parallel_ops */ /* not yet safe for genl_family.parallel_ops */
if (!conn_create(adm_ctx.resource_name, &res_opts)) mutex_lock(&resources_mutex);
connection = conn_create(adm_ctx.resource_name, &res_opts);
mutex_unlock(&resources_mutex);
if (connection) {
struct resource_info resource_info;
mutex_lock(&notification_mutex);
resource_to_info(&resource_info, connection->resource);
notify_resource_state(NULL, 0, connection->resource,
&resource_info, NOTIFY_CREATE);
mutex_unlock(&notification_mutex);
} else
retcode = ERR_NOMEM; retcode = ERR_NOMEM;
out: out:
drbd_adm_finish(&adm_ctx, info, retcode); drbd_adm_finish(&adm_ctx, info, retcode);
return 0; return 0;
} }
static void device_to_info(struct device_info *info,
struct drbd_device *device)
{
info->dev_disk_state = device->state.disk;
}
int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
{ {
struct drbd_config_context adm_ctx; struct drbd_config_context adm_ctx;
...@@ -3490,6 +4161,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) ...@@ -3490,6 +4161,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
mutex_lock(&adm_ctx.resource->adm_mutex); mutex_lock(&adm_ctx.resource->adm_mutex);
retcode = drbd_create_device(&adm_ctx, dh->minor); retcode = drbd_create_device(&adm_ctx, dh->minor);
if (retcode == NO_ERROR) {
struct drbd_device *device;
struct drbd_peer_device *peer_device;
struct device_info info;
unsigned int peer_devices = 0;
enum drbd_notification_type flags;
device = minor_to_device(dh->minor);
for_each_peer_device(peer_device, device) {
if (!has_net_conf(peer_device->connection))
continue;
peer_devices++;
}
device_to_info(&info, device);
mutex_lock(&notification_mutex);
flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags);
for_each_peer_device(peer_device, device) {
struct peer_device_info peer_device_info;
if (!has_net_conf(peer_device->connection))
continue;
peer_device_to_info(&peer_device_info, peer_device);
flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
notify_peer_device_state(NULL, 0, peer_device, &peer_device_info,
NOTIFY_CREATE | flags);
}
mutex_unlock(&notification_mutex);
}
mutex_unlock(&adm_ctx.resource->adm_mutex); mutex_unlock(&adm_ctx.resource->adm_mutex);
out: out:
drbd_adm_finish(&adm_ctx, info, retcode); drbd_adm_finish(&adm_ctx, info, retcode);
...@@ -3498,13 +4199,35 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) ...@@ -3498,13 +4199,35 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
static enum drbd_ret_code adm_del_minor(struct drbd_device *device) static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
{ {
struct drbd_peer_device *peer_device;
if (device->state.disk == D_DISKLESS && if (device->state.disk == D_DISKLESS &&
/* no need to be device->state.conn == C_STANDALONE && /* no need to be device->state.conn == C_STANDALONE &&
* we may want to delete a minor from a live replication group. * we may want to delete a minor from a live replication group.
*/ */
device->state.role == R_SECONDARY) { device->state.role == R_SECONDARY) {
struct drbd_connection *connection =
first_connection(device->resource);
_drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
CS_VERBOSE + CS_WAIT_COMPLETE); CS_VERBOSE + CS_WAIT_COMPLETE);
/* If the state engine hasn't stopped the sender thread yet, we
* need to flush the sender work queue before generating the
* DESTROY events here. */
if (get_t_state(&connection->worker) == RUNNING)
drbd_flush_workqueue(&connection->sender_work);
mutex_lock(&notification_mutex);
for_each_peer_device(peer_device, device) {
if (!has_net_conf(peer_device->connection))
continue;
notify_peer_device_state(NULL, 0, peer_device, NULL,
NOTIFY_DESTROY | NOTIFY_CONTINUES);
}
notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY);
mutex_unlock(&notification_mutex);
drbd_delete_device(device); drbd_delete_device(device);
return NO_ERROR; return NO_ERROR;
} else } else
...@@ -3541,7 +4264,16 @@ static int adm_del_resource(struct drbd_resource *resource) ...@@ -3541,7 +4264,16 @@ static int adm_del_resource(struct drbd_resource *resource)
if (!idr_is_empty(&resource->devices)) if (!idr_is_empty(&resource->devices))
return ERR_RES_IN_USE; return ERR_RES_IN_USE;
/* The state engine has stopped the sender thread, so we don't
* need to flush the sender work queue before generating the
* DESTROY event here. */
mutex_lock(&notification_mutex);
notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY);
mutex_unlock(&notification_mutex);
mutex_lock(&resources_mutex);
list_del_rcu(&resource->resources); list_del_rcu(&resource->resources);
mutex_unlock(&resources_mutex);
/* Make sure all threads have actually stopped: state handling only /* Make sure all threads have actually stopped: state handling only
* does drbd_thread_stop_nowait(). */ * does drbd_thread_stop_nowait(). */
list_for_each_entry(connection, &resource->connections, connections) list_for_each_entry(connection, &resource->connections, connections)
...@@ -3637,7 +4369,6 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) ...@@ -3637,7 +4369,6 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
{ {
static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
struct sk_buff *msg; struct sk_buff *msg;
struct drbd_genlmsghdr *d_out; struct drbd_genlmsghdr *d_out;
unsigned seq; unsigned seq;
...@@ -3658,7 +4389,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) ...@@ -3658,7 +4389,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
if (nla_put_status_info(msg, device, sib)) if (nla_put_status_info(msg, device, sib))
goto nla_put_failure; goto nla_put_failure;
genlmsg_end(msg, d_out); genlmsg_end(msg, d_out);
err = drbd_genl_multicast_events(msg, 0); err = drbd_genl_multicast_events(msg, GFP_NOWAIT);
/* msg has been consumed or freed in netlink_broadcast() */ /* msg has been consumed or freed in netlink_broadcast() */
if (err && err != -ESRCH) if (err && err != -ESRCH)
goto failed; goto failed;
...@@ -3672,3 +4403,405 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) ...@@ -3672,3 +4403,405 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
"Event seq:%u sib_reason:%u\n", "Event seq:%u sib_reason:%u\n",
err, seq, sib->sib_reason); err, seq, sib->sib_reason);
} }
static int nla_put_notification_header(struct sk_buff *msg,
enum drbd_notification_type type)
{
struct drbd_notification_header nh = {
.nh_type = type,
};
return drbd_notification_header_to_skb(msg, &nh, true);
}
void notify_resource_state(struct sk_buff *skb,
unsigned int seq,
struct drbd_resource *resource,
struct resource_info *resource_info,
enum drbd_notification_type type)
{
struct resource_statistics resource_statistics;
struct drbd_genlmsghdr *dh;
bool multicast = false;
int err;
if (!skb) {
seq = atomic_inc_return(&notify_genl_seq);
skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
err = -ENOMEM;
if (!skb)
goto failed;
multicast = true;
}
err = -EMSGSIZE;
dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE);
if (!dh)
goto nla_put_failure;
dh->minor = -1U;
dh->ret_code = NO_ERROR;
if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) ||
nla_put_notification_header(skb, type) ||
((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
resource_info_to_skb(skb, resource_info, true)))
goto nla_put_failure;
resource_statistics.res_stat_write_ordering = resource->write_ordering;
err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
if (err)
goto nla_put_failure;
genlmsg_end(skb, dh);
if (multicast) {
err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
/* skb has been consumed or freed in netlink_broadcast() */
if (err && err != -ESRCH)
goto failed;
}
return;
nla_put_failure:
nlmsg_free(skb);
failed:
drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
err, seq);
}
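The same shape repeats in the device, connection and peer-device variants below: called with skb == NULL, the helper allocates a message, takes a fresh sequence number and multicasts a live event; called with the skb of an in-progress dump (as get_initial_state() does further down, passing the sequence number saved in cb->args[2]), it only appends an entry to that reply. drbd_adm_new_resource() above is an example of the first case, broadcasting its NOTIFY_CREATE event with skb == NULL.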
void notify_device_state(struct sk_buff *skb,
unsigned int seq,
struct drbd_device *device,
struct device_info *device_info,
enum drbd_notification_type type)
{
struct device_statistics device_statistics;
struct drbd_genlmsghdr *dh;
bool multicast = false;
int err;
if (!skb) {
seq = atomic_inc_return(&notify_genl_seq);
skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
err = -ENOMEM;
if (!skb)
goto failed;
multicast = true;
}
err = -EMSGSIZE;
dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE);
if (!dh)
goto nla_put_failure;
dh->minor = device->minor;
dh->ret_code = NO_ERROR;
if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) ||
nla_put_notification_header(skb, type) ||
((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
device_info_to_skb(skb, device_info, true)))
goto nla_put_failure;
device_to_statistics(&device_statistics, device);
device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
genlmsg_end(skb, dh);
if (multicast) {
err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
/* skb has been consumed or freed in netlink_broadcast() */
if (err && err != -ESRCH)
goto failed;
}
return;
nla_put_failure:
nlmsg_free(skb);
failed:
drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n",
err, seq);
}
void notify_connection_state(struct sk_buff *skb,
unsigned int seq,
struct drbd_connection *connection,
struct connection_info *connection_info,
enum drbd_notification_type type)
{
struct connection_statistics connection_statistics;
struct drbd_genlmsghdr *dh;
bool multicast = false;
int err;
if (!skb) {
seq = atomic_inc_return(&notify_genl_seq);
skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
err = -ENOMEM;
if (!skb)
goto failed;
multicast = true;
}
err = -EMSGSIZE;
dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE);
if (!dh)
goto nla_put_failure;
dh->minor = -1U;
dh->ret_code = NO_ERROR;
if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) ||
nla_put_notification_header(skb, type) ||
((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
connection_info_to_skb(skb, connection_info, true)))
goto nla_put_failure;
connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
genlmsg_end(skb, dh);
if (multicast) {
err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
/* skb has been consumed or freed in netlink_broadcast() */
if (err && err != -ESRCH)
goto failed;
}
return;
nla_put_failure:
nlmsg_free(skb);
failed:
drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n",
err, seq);
}
void notify_peer_device_state(struct sk_buff *skb,
unsigned int seq,
struct drbd_peer_device *peer_device,
struct peer_device_info *peer_device_info,
enum drbd_notification_type type)
{
struct peer_device_statistics peer_device_statistics;
struct drbd_resource *resource = peer_device->device->resource;
struct drbd_genlmsghdr *dh;
bool multicast = false;
int err;
if (!skb) {
seq = atomic_inc_return(&notify_genl_seq);
skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
err = -ENOMEM;
if (!skb)
goto failed;
multicast = true;
}
err = -EMSGSIZE;
dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE);
if (!dh)
goto nla_put_failure;
dh->minor = -1U;
dh->ret_code = NO_ERROR;
if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) ||
nla_put_notification_header(skb, type) ||
((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
peer_device_info_to_skb(skb, peer_device_info, true)))
goto nla_put_failure;
peer_device_to_statistics(&peer_device_statistics, peer_device);
peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
genlmsg_end(skb, dh);
if (multicast) {
err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
/* skb has been consumed or freed in netlink_broadcast() */
if (err && err != -ESRCH)
goto failed;
}
return;
nla_put_failure:
nlmsg_free(skb);
failed:
drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n",
err, seq);
}
void notify_helper(enum drbd_notification_type type,
struct drbd_device *device, struct drbd_connection *connection,
const char *name, int status)
{
struct drbd_resource *resource = device ? device->resource : connection->resource;
struct drbd_helper_info helper_info;
unsigned int seq = atomic_inc_return(&notify_genl_seq);
struct sk_buff *skb = NULL;
struct drbd_genlmsghdr *dh;
int err;
strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name));
helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name));
helper_info.helper_status = status;
skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
err = -ENOMEM;
if (!skb)
goto fail;
err = -EMSGSIZE;
dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER);
if (!dh)
goto fail;
dh->minor = device ? device->minor : -1;
dh->ret_code = NO_ERROR;
mutex_lock(&notification_mutex);
if (nla_put_drbd_cfg_context(skb, resource, connection, device) ||
nla_put_notification_header(skb, type) ||
drbd_helper_info_to_skb(skb, &helper_info, true))
goto unlock_fail;
genlmsg_end(skb, dh);
err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
skb = NULL;
/* skb has been consumed or freed in netlink_broadcast() */
if (err && err != -ESRCH)
goto unlock_fail;
mutex_unlock(&notification_mutex);
return;
unlock_fail:
mutex_unlock(&notification_mutex);
fail:
nlmsg_free(skb);
drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
err, seq);
}
static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq)
{
struct drbd_genlmsghdr *dh;
int err;
err = -EMSGSIZE;
dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE);
if (!dh)
goto nla_put_failure;
dh->minor = -1U;
dh->ret_code = NO_ERROR;
if (nla_put_notification_header(skb, NOTIFY_EXISTS))
goto nla_put_failure;
genlmsg_end(skb, dh);
return;
nla_put_failure:
nlmsg_free(skb);
pr_err("Error %d sending event. Event seq:%u\n", err, seq);
}
static void free_state_changes(struct list_head *list)
{
while (!list_empty(list)) {
struct drbd_state_change *state_change =
list_first_entry(list, struct drbd_state_change, list);
list_del(&state_change->list);
forget_state_change(state_change);
}
}
static unsigned int notifications_for_state_change(struct drbd_state_change *state_change)
{
return 1 +
state_change->n_connections +
state_change->n_devices +
state_change->n_devices * state_change->n_connections;
}
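To make the bookkeeping concrete: for a resource with 2 connections and 3 devices, notifications_for_state_change() returns 1 + 2 + 3 + 2*3 = 12, i.e. that state change contributes one resource, two connection, three device and six peer-device NOTIFY_EXISTS entries to the dump before the iterator moves on to the next remembered state change.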
static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
{
struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0];
unsigned int seq = cb->args[2];
unsigned int n;
enum drbd_notification_type flags = 0;
/* There is no need for taking notification_mutex here: it doesn't
matter if the initial state events mix with later state change
events; we can always tell the events apart by the NOTIFY_EXISTS
flag. */
cb->args[5]--;
if (cb->args[5] == 1) {
notify_initial_state_done(skb, seq);
goto out;
}
n = cb->args[4]++;
if (cb->args[4] < cb->args[3])
flags |= NOTIFY_CONTINUES;
if (n < 1) {
notify_resource_state_change(skb, seq, state_change->resource,
NOTIFY_EXISTS | flags);
goto next;
}
n--;
if (n < state_change->n_connections) {
notify_connection_state_change(skb, seq, &state_change->connections[n],
NOTIFY_EXISTS | flags);
goto next;
}
n -= state_change->n_connections;
if (n < state_change->n_devices) {
notify_device_state_change(skb, seq, &state_change->devices[n],
NOTIFY_EXISTS | flags);
goto next;
}
n -= state_change->n_devices;
if (n < state_change->n_devices * state_change->n_connections) {
notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n],
NOTIFY_EXISTS | flags);
goto next;
}
next:
if (cb->args[4] == cb->args[3]) {
struct drbd_state_change *next_state_change =
list_entry(state_change->list.next,
struct drbd_state_change, list);
cb->args[0] = (long)next_state_change;
cb->args[3] = notifications_for_state_change(next_state_change);
cb->args[4] = 0;
}
out:
return skb->len;
}
int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
{
struct drbd_resource *resource;
LIST_HEAD(head);
if (cb->args[5] >= 1) {
if (cb->args[5] > 1)
return get_initial_state(skb, cb);
if (cb->args[0]) {
struct drbd_state_change *state_change =
(struct drbd_state_change *)cb->args[0];
/* connect list to head */
list_add(&head, &state_change->list);
free_state_changes(&head);
}
return 0;
}
cb->args[5] = 2; /* number of iterations */
mutex_lock(&resources_mutex);
for_each_resource(resource, &drbd_resources) {
struct drbd_state_change *state_change;
state_change = remember_old_state(resource, GFP_KERNEL);
if (!state_change) {
if (!list_empty(&head))
free_state_changes(&head);
mutex_unlock(&resources_mutex);
return -ENOMEM;
}
copy_old_to_new_state_change(state_change);
list_add_tail(&state_change->list, &head);
cb->args[5] += notifications_for_state_change(state_change);
}
mutex_unlock(&resources_mutex);
if (!list_empty(&head)) {
struct drbd_state_change *state_change =
list_entry(head.next, struct drbd_state_change, list);
cb->args[0] = (long)state_change;
cb->args[3] = notifications_for_state_change(state_change);
list_del(&head); /* detach list from head */
}
cb->args[2] = cb->nlh->nlmsg_seq;
return get_initial_state(skb, cb);
}
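As a worked example of the counters above: with a single resource holding one connection and one device, notifications_for_state_change() is 1 + 1 + 1 + 1 = 4, so cb->args[5] starts at 2 + 4 = 6. The first four get_initial_state() calls decrement it to 5, 4, 3 and 2 while emitting the resource, connection, device and peer-device NOTIFY_EXISTS events; the fifth call decrements it to 1 and sends DRBD_INITIAL_STATE_DONE; the next invocation of drbd_adm_get_initial_state() then sees args[5] == 1, frees the remembered state changes and terminates the dump.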
...@@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v) ...@@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
char wp; char wp;
static char write_ordering_chars[] = { static char write_ordering_chars[] = {
[WO_none] = 'n', [WO_NONE] = 'n',
[WO_drain_io] = 'd', [WO_DRAIN_IO] = 'd',
[WO_bdev_flush] = 'f', [WO_BDEV_FLUSH] = 'f',
}; };
seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
......
...@@ -23,7 +23,7 @@ enum drbd_packet { ...@@ -23,7 +23,7 @@ enum drbd_packet {
P_AUTH_RESPONSE = 0x11, P_AUTH_RESPONSE = 0x11,
P_STATE_CHG_REQ = 0x12, P_STATE_CHG_REQ = 0x12,
/* asender (meta socket */ /* (meta socket) */
P_PING = 0x13, P_PING = 0x13,
P_PING_ACK = 0x14, P_PING_ACK = 0x14,
P_RECV_ACK = 0x15, /* Used in protocol B */ P_RECV_ACK = 0x15, /* Used in protocol B */
......
...@@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device, ...@@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
} }
} }
static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
{ {
LIST_HEAD(reclaimed); LIST_HEAD(reclaimed);
struct drbd_peer_request *peer_req, *t; struct drbd_peer_request *peer_req, *t;
...@@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) ...@@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
spin_lock_irq(&device->resource->req_lock); spin_lock_irq(&device->resource->req_lock);
reclaim_finished_net_peer_reqs(device, &reclaimed); reclaim_finished_net_peer_reqs(device, &reclaimed);
spin_unlock_irq(&device->resource->req_lock); spin_unlock_irq(&device->resource->req_lock);
list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
drbd_free_net_peer_req(device, peer_req); drbd_free_net_peer_req(device, peer_req);
} }
static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
{
struct drbd_peer_device *peer_device;
int vnr;
rcu_read_lock();
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
struct drbd_device *device = peer_device->device;
if (!atomic_read(&device->pp_in_use_by_net))
continue;
kref_get(&device->kref);
rcu_read_unlock();
drbd_reclaim_net_peer_reqs(device);
kref_put(&device->kref, drbd_destroy_device);
rcu_read_lock();
}
rcu_read_unlock();
}
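
conn_reclaim_net_peer_reqs() above shows a recurring kernel pattern: the IDR walk runs under rcu_read_lock(), where sleeping is forbidden, so the loop takes a kref on the device, drops the RCU read lock, does the potentially blocking reclaim, puts the reference and re-enters the read-side critical section. A hedged userspace analogue with a plain mutex and manual refcount; the table, device struct and function names are made up for illustration and only mirror the shape of the kernel code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_DEVICES 4

struct device {
	int id;
	int refcount;			/* protected by table_lock */
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct device *table[MAX_DEVICES];	/* plays the role of the IDR */

static void device_put(struct device *dev)
{
	pthread_mutex_lock(&table_lock);
	if (--dev->refcount == 0) {		/* like kref_put() calling the release fn */
		table[dev->id] = NULL;
		free(dev);
	}
	pthread_mutex_unlock(&table_lock);
}

static void slow_reclaim(struct device *dev)
{
	/* stands in for drbd_reclaim_net_peer_reqs(): may block,
	 * so it must not run while holding table_lock */
	printf("reclaiming for device %d\n", dev->id);
}

static void reclaim_all(void)
{
	for (int i = 0; i < MAX_DEVICES; i++) {
		struct device *dev;

		pthread_mutex_lock(&table_lock);	/* like rcu_read_lock() */
		dev = table[i];
		if (!dev) {
			pthread_mutex_unlock(&table_lock);
			continue;
		}
		dev->refcount++;			/* like kref_get() */
		pthread_mutex_unlock(&table_lock);	/* like rcu_read_unlock() */

		slow_reclaim(dev);			/* safe: we hold a reference */
		device_put(dev);
	}
}

int main(void)
{
	for (int i = 0; i < MAX_DEVICES; i += 2) {
		struct device *dev = calloc(1, sizeof(*dev));
		dev->id = i;
		dev->refcount = 1;
		table[i] = dev;
	}
	reclaim_all();
	return 0;
}
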
/** /**
* drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
* @device: DRBD device. * @device: DRBD device.
...@@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int ...@@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
if (atomic_read(&device->pp_in_use) < mxb) if (atomic_read(&device->pp_in_use) < mxb)
page = __drbd_alloc_pages(device, number); page = __drbd_alloc_pages(device, number);
/* Try to keep the fast path fast, but occasionally we need
* to reclaim the pages we lent to the network stack. */
if (page && atomic_read(&device->pp_in_use_by_net) > 512)
drbd_reclaim_net_peer_reqs(device);
while (page == NULL) { while (page == NULL) {
prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
drbd_kick_lo_and_reclaim_net(device); drbd_reclaim_net_peer_reqs(device);
if (atomic_read(&device->pp_in_use) < mxb) { if (atomic_read(&device->pp_in_use) < mxb) {
page = __drbd_alloc_pages(device, number); page = __drbd_alloc_pages(device, number);
...@@ -1099,7 +1123,15 @@ static int conn_connect(struct drbd_connection *connection) ...@@ -1099,7 +1123,15 @@ static int conn_connect(struct drbd_connection *connection)
return 0; return 0;
} }
drbd_thread_start(&connection->asender); drbd_thread_start(&connection->ack_receiver);
/* opencoded create_singlethread_workqueue(),
* to be able to use format string arguments */
connection->ack_sender =
alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
if (!connection->ack_sender) {
drbd_err(connection, "Failed to create workqueue ack_sender\n");
return 0;
}
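
The new per-connection ack_sender is created with alloc_ordered_workqueue() rather than create_singlethread_workqueue(), as the comment says, because only the former takes printf-style arguments for the queue name (here the resource name), while keeping single-threaded ordering and WQ_MEM_RECLAIM forward-progress guarantees. A hypothetical, minimal module-style sketch of that call pattern; the work function, the "r0" resource name and the module boilerplate are placeholders, not DRBD code.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *ack_wq;
static struct work_struct demo_work;

static void demo_work_fn(struct work_struct *w)
{
	pr_info("demo: ack work ran\n");
}

static int __init demo_init(void)
{
	const char *res_name = "r0";	/* stands in for connection->resource->name */

	/* Ordered (single-threaded) workqueue whose name is built from a
	 * format string, which create_singlethread_workqueue() cannot do.
	 * WQ_MEM_RECLAIM matters for a block driver's ack path. */
	ack_wq = alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, res_name);
	if (!ack_wq)
		return -ENOMEM;

	INIT_WORK(&demo_work, demo_work_fn);
	queue_work(ack_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	destroy_workqueue(ack_wq);	/* drains pending work before tearing down */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
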
mutex_lock(&connection->resource->conf_update); mutex_lock(&connection->resource->conf_update);
/* The discard_my_data flag is a single-shot modifier to the next /* The discard_my_data flag is a single-shot modifier to the next
...@@ -1178,7 +1210,7 @@ static void drbd_flush(struct drbd_connection *connection) ...@@ -1178,7 +1210,7 @@ static void drbd_flush(struct drbd_connection *connection)
struct drbd_peer_device *peer_device; struct drbd_peer_device *peer_device;
int vnr; int vnr;
if (connection->resource->write_ordering >= WO_bdev_flush) { if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
rcu_read_lock(); rcu_read_lock();
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
struct drbd_device *device = peer_device->device; struct drbd_device *device = peer_device->device;
...@@ -1203,7 +1235,7 @@ static void drbd_flush(struct drbd_connection *connection) ...@@ -1203,7 +1235,7 @@ static void drbd_flush(struct drbd_connection *connection)
/* would rather check on EOPNOTSUPP, but that is not reliable. /* would rather check on EOPNOTSUPP, but that is not reliable.
* don't try again for ANY return value != 0 * don't try again for ANY return value != 0
* if (rv == -EOPNOTSUPP) */ * if (rv == -EOPNOTSUPP) */
drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
} }
put_ldev(device); put_ldev(device);
kref_put(&device->kref, drbd_destroy_device); kref_put(&device->kref, drbd_destroy_device);
...@@ -1299,10 +1331,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) ...@@ -1299,10 +1331,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
dc = rcu_dereference(bdev->disk_conf); dc = rcu_dereference(bdev->disk_conf);
if (wo == WO_bdev_flush && !dc->disk_flushes) if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
wo = WO_drain_io; wo = WO_DRAIN_IO;
if (wo == WO_drain_io && !dc->disk_drain) if (wo == WO_DRAIN_IO && !dc->disk_drain)
wo = WO_none; wo = WO_NONE;
return wo; return wo;
} }
...@@ -1319,13 +1351,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin ...@@ -1319,13 +1351,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
enum write_ordering_e pwo; enum write_ordering_e pwo;
int vnr; int vnr;
static char *write_ordering_str[] = { static char *write_ordering_str[] = {
[WO_none] = "none", [WO_NONE] = "none",
[WO_drain_io] = "drain", [WO_DRAIN_IO] = "drain",
[WO_bdev_flush] = "flush", [WO_BDEV_FLUSH] = "flush",
}; };
pwo = resource->write_ordering; pwo = resource->write_ordering;
if (wo != WO_bdev_flush) if (wo != WO_BDEV_FLUSH)
wo = min(pwo, wo); wo = min(pwo, wo);
rcu_read_lock(); rcu_read_lock();
idr_for_each_entry(&resource->devices, device, vnr) { idr_for_each_entry(&resource->devices, device, vnr) {
...@@ -1343,7 +1375,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin ...@@ -1343,7 +1375,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
rcu_read_unlock(); rcu_read_unlock();
resource->write_ordering = wo; resource->write_ordering = wo;
if (pwo != resource->write_ordering || wo == WO_bdev_flush) if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
} }
...@@ -1380,7 +1412,7 @@ int drbd_submit_peer_request(struct drbd_device *device, ...@@ -1380,7 +1412,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
/* wait for all pending IO completions, before we start /* wait for all pending IO completions, before we start
* zeroing things out. */ * zeroing things out. */
conn_wait_active_ee_empty(first_peer_device(device)->connection); conn_wait_active_ee_empty(peer_req->peer_device->connection);
/* add it to the active list now, /* add it to the active list now,
* so we can find it to present it in debugfs */ * so we can find it to present it in debugfs */
peer_req->submit_jif = jiffies; peer_req->submit_jif = jiffies;
...@@ -1508,12 +1540,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection) ...@@ -1508,12 +1540,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection)
rcu_read_unlock(); rcu_read_unlock();
} }
static struct drbd_peer_device *
conn_peer_device(struct drbd_connection *connection, int volume_number)
{
return idr_find(&connection->peer_devices, volume_number);
}
static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
{ {
int rv; int rv;
...@@ -1533,7 +1559,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf ...@@ -1533,7 +1559,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
* Therefore we must send the barrier_ack after the barrier request was * Therefore we must send the barrier_ack after the barrier request was
* completed. */ * completed. */
switch (connection->resource->write_ordering) { switch (connection->resource->write_ordering) {
case WO_none: case WO_NONE:
if (rv == FE_RECYCLED) if (rv == FE_RECYCLED)
return 0; return 0;
...@@ -1546,8 +1572,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf ...@@ -1546,8 +1572,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
/* Fall through */ /* Fall through */
case WO_bdev_flush: case WO_BDEV_FLUSH:
case WO_drain_io: case WO_DRAIN_IO:
conn_wait_active_ee_empty(connection); conn_wait_active_ee_empty(connection);
drbd_flush(connection); drbd_flush(connection);
...@@ -1752,7 +1778,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req ...@@ -1752,7 +1778,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req
} }
/* /*
* e_end_resync_block() is called in asender context via * e_end_resync_block() is called in ack_sender context via
* drbd_finish_peer_reqs(). * drbd_finish_peer_reqs().
*/ */
static int e_end_resync_block(struct drbd_work *w, int unused) static int e_end_resync_block(struct drbd_work *w, int unused)
...@@ -1926,7 +1952,7 @@ static void restart_conflicting_writes(struct drbd_device *device, ...@@ -1926,7 +1952,7 @@ static void restart_conflicting_writes(struct drbd_device *device,
} }
/* /*
* e_end_block() is called in asender context via drbd_finish_peer_reqs(). * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
*/ */
static int e_end_block(struct drbd_work *w, int cancel) static int e_end_block(struct drbd_work *w, int cancel)
{ {
...@@ -1966,7 +1992,7 @@ static int e_end_block(struct drbd_work *w, int cancel) ...@@ -1966,7 +1992,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
} else } else
D_ASSERT(device, drbd_interval_empty(&peer_req->i)); D_ASSERT(device, drbd_interval_empty(&peer_req->i));
drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
return err; return err;
} }
...@@ -2098,7 +2124,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co ...@@ -2098,7 +2124,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
} }
rcu_read_lock(); rcu_read_lock();
tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
rcu_read_unlock(); rcu_read_unlock();
if (!tp) if (!tp)
...@@ -2217,7 +2243,7 @@ static int handle_write_conflicts(struct drbd_device *device, ...@@ -2217,7 +2243,7 @@ static int handle_write_conflicts(struct drbd_device *device,
peer_req->w.cb = superseded ? e_send_superseded : peer_req->w.cb = superseded ? e_send_superseded :
e_send_retry_write; e_send_retry_write;
list_add_tail(&peer_req->w.list, &device->done_ee); list_add_tail(&peer_req->w.list, &device->done_ee);
wake_asender(connection); queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
err = -ENOENT; err = -ENOENT;
goto out; goto out;
...@@ -2364,7 +2390,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * ...@@ -2364,7 +2390,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
if (dp_flags & DP_SEND_RECEIVE_ACK) { if (dp_flags & DP_SEND_RECEIVE_ACK) {
/* I really don't like it that the receiver thread /* I really don't like it that the receiver thread
* sends on the msock, but anyways */ * sends on the msock, but anyways */
drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
} }
if (tp) { if (tp) {
...@@ -4056,7 +4082,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info ...@@ -4056,7 +4082,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
os = ns = drbd_read_state(device); os = ns = drbd_read_state(device);
spin_unlock_irq(&device->resource->req_lock); spin_unlock_irq(&device->resource->req_lock);
/* If some other part of the code (asender thread, timeout) /* If some other part of the code (ack_receiver thread, timeout)
* already decided to close the connection again, * already decided to close the connection again,
* we must not "re-establish" it here. */ * we must not "re-establish" it here. */
if (os.conn <= C_TEAR_DOWN) if (os.conn <= C_TEAR_DOWN)
...@@ -4661,8 +4687,12 @@ static void conn_disconnect(struct drbd_connection *connection) ...@@ -4661,8 +4687,12 @@ static void conn_disconnect(struct drbd_connection *connection)
*/ */
conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
/* asender does not clean up anything. it must not interfere, either */ /* ack_receiver does not clean up anything. it must not interfere, either */
drbd_thread_stop(&connection->asender); drbd_thread_stop(&connection->ack_receiver);
if (connection->ack_sender) {
destroy_workqueue(connection->ack_sender);
connection->ack_sender = NULL;
}
drbd_free_sock(connection); drbd_free_sock(connection);
rcu_read_lock(); rcu_read_lock();
...@@ -5431,49 +5461,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi) ...@@ -5431,49 +5461,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
return 0; return 0;
} }
static int connection_finish_peer_reqs(struct drbd_connection *connection) struct meta_sock_cmd {
size_t pkt_size;
int (*fn)(struct drbd_connection *connection, struct packet_info *);
};
static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
{ {
struct drbd_peer_device *peer_device; long t;
int vnr, not_empty = 0; struct net_conf *nc;
do { rcu_read_lock();
clear_bit(SIGNAL_ASENDER, &connection->flags); nc = rcu_dereference(connection->net_conf);
flush_signals(current); t = ping_timeout ? nc->ping_timeo : nc->ping_int;
rcu_read_unlock();
rcu_read_lock(); t *= HZ;
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { if (ping_timeout)
struct drbd_device *device = peer_device->device; t /= 10;
kref_get(&device->kref);
rcu_read_unlock();
if (drbd_finish_peer_reqs(device)) {
kref_put(&device->kref, drbd_destroy_device);
return 1;
}
kref_put(&device->kref, drbd_destroy_device);
rcu_read_lock();
}
set_bit(SIGNAL_ASENDER, &connection->flags);
spin_lock_irq(&connection->resource->req_lock); connection->meta.socket->sk->sk_rcvtimeo = t;
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { }
struct drbd_device *device = peer_device->device;
not_empty = !list_empty(&device->done_ee);
if (not_empty)
break;
}
spin_unlock_irq(&connection->resource->req_lock);
rcu_read_unlock();
} while (not_empty);
return 0; static void set_ping_timeout(struct drbd_connection *connection)
{
set_rcvtimeo(connection, 1);
} }
struct asender_cmd { static void set_idle_timeout(struct drbd_connection *connection)
size_t pkt_size; {
int (*fn)(struct drbd_connection *connection, struct packet_info *); set_rcvtimeo(connection, 0);
}; }
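
set_rcvtimeo() above folds the two old timeout assignments into one helper: ping_timeo is configured in units of 0.1 seconds and ping_int in whole seconds, so the ping timeout becomes ping_timeo * HZ / 10 jiffies while the idle timeout is ping_int * HZ. A small userspace sketch of that arithmetic, assuming HZ = 250 and the commonly documented defaults ping_timeo = 5 and ping_int = 10 (example values, not read from the source):

#include <stdio.h>

#define HZ 250				/* assumed tick rate for this example */

/* Mirrors set_rcvtimeo(): ping_timeo is in 0.1 s units, ping_int in seconds. */
static long rcvtimeo_jiffies(int ping_timeout, long ping_timeo, long ping_int)
{
	long t = ping_timeout ? ping_timeo : ping_int;

	t *= HZ;
	if (ping_timeout)
		t /= 10;
	return t;
}

int main(void)
{
	long ping_timeo = 5;		/* 0.5 s, assumed default */
	long ping_int = 10;		/* 10 s, assumed default */

	printf("ping timeout: %ld jiffies (%.1f s)\n",
	       rcvtimeo_jiffies(1, ping_timeo, ping_int), (double)ping_timeo / 10.0);
	printf("idle timeout: %ld jiffies (%ld s)\n",
	       rcvtimeo_jiffies(0, ping_timeo, ping_int), ping_int);
	return 0;
}

With these numbers the socket waits 125 jiffies for a PingAck and 2500 jiffies when idle, which is exactly what the old inline assignments computed.
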
static struct asender_cmd asender_tbl[] = { static struct meta_sock_cmd ack_receiver_tbl[] = {
[P_PING] = { 0, got_Ping }, [P_PING] = { 0, got_Ping },
[P_PING_ACK] = { 0, got_PingAck }, [P_PING_ACK] = { 0, got_PingAck },
[P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
...@@ -5493,64 +5513,40 @@ static struct asender_cmd asender_tbl[] = { ...@@ -5493,64 +5513,40 @@ static struct asender_cmd asender_tbl[] = {
[P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
}; };
int drbd_asender(struct drbd_thread *thi) int drbd_ack_receiver(struct drbd_thread *thi)
{ {
struct drbd_connection *connection = thi->connection; struct drbd_connection *connection = thi->connection;
struct asender_cmd *cmd = NULL; struct meta_sock_cmd *cmd = NULL;
struct packet_info pi; struct packet_info pi;
unsigned long pre_recv_jif;
int rv; int rv;
void *buf = connection->meta.rbuf; void *buf = connection->meta.rbuf;
int received = 0; int received = 0;
unsigned int header_size = drbd_header_size(connection); unsigned int header_size = drbd_header_size(connection);
int expect = header_size; int expect = header_size;
bool ping_timeout_active = false; bool ping_timeout_active = false;
struct net_conf *nc;
int ping_timeo, tcp_cork, ping_int;
struct sched_param param = { .sched_priority = 2 }; struct sched_param param = { .sched_priority = 2 };
rv = sched_setscheduler(current, SCHED_RR, &param); rv = sched_setscheduler(current, SCHED_RR, &param);
if (rv < 0) if (rv < 0)
drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
while (get_t_state(thi) == RUNNING) { while (get_t_state(thi) == RUNNING) {
drbd_thread_current_set_cpu(thi); drbd_thread_current_set_cpu(thi);
rcu_read_lock(); conn_reclaim_net_peer_reqs(connection);
nc = rcu_dereference(connection->net_conf);
ping_timeo = nc->ping_timeo;
tcp_cork = nc->tcp_cork;
ping_int = nc->ping_int;
rcu_read_unlock();
if (test_and_clear_bit(SEND_PING, &connection->flags)) { if (test_and_clear_bit(SEND_PING, &connection->flags)) {
if (drbd_send_ping(connection)) { if (drbd_send_ping(connection)) {
drbd_err(connection, "drbd_send_ping has failed\n"); drbd_err(connection, "drbd_send_ping has failed\n");
goto reconnect; goto reconnect;
} }
connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; set_ping_timeout(connection);
ping_timeout_active = true; ping_timeout_active = true;
} }
/* TODO: conditionally cork; it may hurt latency if we cork without pre_recv_jif = jiffies;
much to send */
if (tcp_cork)
drbd_tcp_cork(connection->meta.socket);
if (connection_finish_peer_reqs(connection)) {
drbd_err(connection, "connection_finish_peer_reqs() failed\n");
goto reconnect;
}
/* but unconditionally uncork unless disabled */
if (tcp_cork)
drbd_tcp_uncork(connection->meta.socket);
/* short circuit, recv_msg would return EINTR anyways. */
if (signal_pending(current))
continue;
rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
clear_bit(SIGNAL_ASENDER, &connection->flags);
flush_signals(current);
/* Note: /* Note:
* -EINTR (on meta) we got a signal * -EINTR (on meta) we got a signal
...@@ -5562,7 +5558,6 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -5562,7 +5558,6 @@ int drbd_asender(struct drbd_thread *thi)
* rv < expected: "woken" by signal during receive * rv < expected: "woken" by signal during receive
* rv == 0 : "connection shut down by peer" * rv == 0 : "connection shut down by peer"
*/ */
received_more:
if (likely(rv > 0)) { if (likely(rv > 0)) {
received += rv; received += rv;
buf += rv; buf += rv;
...@@ -5584,8 +5579,7 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -5584,8 +5579,7 @@ int drbd_asender(struct drbd_thread *thi)
} else if (rv == -EAGAIN) { } else if (rv == -EAGAIN) {
/* If the data socket received something meanwhile, /* If the data socket received something meanwhile,
* that is good enough: peer is still alive. */ * that is good enough: peer is still alive. */
if (time_after(connection->last_received, if (time_after(connection->last_received, pre_recv_jif))
jiffies - connection->meta.socket->sk->sk_rcvtimeo))
continue; continue;
if (ping_timeout_active) { if (ping_timeout_active) {
drbd_err(connection, "PingAck did not arrive in time.\n"); drbd_err(connection, "PingAck did not arrive in time.\n");
...@@ -5594,6 +5588,10 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -5594,6 +5588,10 @@ int drbd_asender(struct drbd_thread *thi)
set_bit(SEND_PING, &connection->flags); set_bit(SEND_PING, &connection->flags);
continue; continue;
} else if (rv == -EINTR) { } else if (rv == -EINTR) {
/* maybe drbd_thread_stop(): the while condition will notice.
* maybe woken for send_ping: we'll send a ping above,
* and change the rcvtimeo */
flush_signals(current);
continue; continue;
} else { } else {
drbd_err(connection, "sock_recvmsg returned %d\n", rv); drbd_err(connection, "sock_recvmsg returned %d\n", rv);
...@@ -5603,8 +5601,8 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -5603,8 +5601,8 @@ int drbd_asender(struct drbd_thread *thi)
if (received == expect && cmd == NULL) { if (received == expect && cmd == NULL) {
if (decode_header(connection, connection->meta.rbuf, &pi)) if (decode_header(connection, connection->meta.rbuf, &pi))
goto reconnect; goto reconnect;
cmd = &asender_tbl[pi.cmd]; cmd = &ack_receiver_tbl[pi.cmd];
if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
cmdname(pi.cmd), pi.cmd); cmdname(pi.cmd), pi.cmd);
goto disconnect; goto disconnect;
...@@ -5627,9 +5625,8 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -5627,9 +5625,8 @@ int drbd_asender(struct drbd_thread *thi)
connection->last_received = jiffies; connection->last_received = jiffies;
if (cmd == &asender_tbl[P_PING_ACK]) { if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
/* restore idle timeout */ set_idle_timeout(connection);
connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
ping_timeout_active = false; ping_timeout_active = false;
} }
...@@ -5638,11 +5635,6 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -5638,11 +5635,6 @@ int drbd_asender(struct drbd_thread *thi)
expect = header_size; expect = header_size;
cmd = NULL; cmd = NULL;
} }
if (test_bit(SEND_PING, &connection->flags))
continue;
rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT);
if (rv > 0)
goto received_more;
} }
if (0) { if (0) {
...@@ -5654,9 +5646,41 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -5654,9 +5646,41 @@ int drbd_asender(struct drbd_thread *thi)
disconnect: disconnect:
conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
} }
clear_bit(SIGNAL_ASENDER, &connection->flags);
drbd_info(connection, "asender terminated\n"); drbd_info(connection, "ack_receiver terminated\n");
return 0; return 0;
} }
void drbd_send_acks_wf(struct work_struct *ws)
{
struct drbd_peer_device *peer_device =
container_of(ws, struct drbd_peer_device, send_acks_work);
struct drbd_connection *connection = peer_device->connection;
struct drbd_device *device = peer_device->device;
struct net_conf *nc;
int tcp_cork, err;
rcu_read_lock();
nc = rcu_dereference(connection->net_conf);
tcp_cork = nc->tcp_cork;
rcu_read_unlock();
if (tcp_cork)
drbd_tcp_cork(connection->meta.socket);
err = drbd_finish_peer_reqs(device);
kref_put(&device->kref, drbd_destroy_device);
/* get is in drbd_endio_write_sec_final(). That is necessary to keep the
struct work_struct send_acks_work alive, which is in the peer_device object */
if (err) {
conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
return;
}
if (tcp_cork)
drbd_tcp_uncork(connection->meta.socket);
return;
}
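
drbd_send_acks_wf() brackets the batched ack sends with drbd_tcp_cork()/drbd_tcp_uncork(), thin wrappers around the TCP_CORK socket option: while corked, the kernel merges small writes into full segments, and clearing the option pushes whatever is pending. A hedged userspace sketch of the same cork / write / uncork sequence; it is a bare function, not a complete program, and assumes fd is an already connected TCP socket with error handling trimmed.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

/* Send several small "acks" as (at most) one TCP segment. The 16-byte
 * payload is purely illustrative. */
static void send_corked_acks(int fd, int nr_acks)
{
	char ack[16] = { 0 };
	int on = 1, off = 0;

	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));   /* cork */
	for (int i = 0; i < nr_acks; i++)
		(void)write(fd, ack, sizeof(ack));	/* queued, not yet pushed */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off)); /* uncork: flush */
}
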
...@@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, ...@@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
kref_get(&req->kref); /* wait for the DONE */ kref_get(&req->kref); /* wait for the DONE */
if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
/* potentially already completed in the asender thread */ /* potentially already completed in the ack_receiver thread */
if (!(s & RQ_NET_DONE)) { if (!(s & RQ_NET_DONE)) {
atomic_add(req->i.size >> 9, &device->ap_in_flight); atomic_add(req->i.size >> 9, &device->ap_in_flight);
set_if_null_req_not_net_done(peer_device, req); set_if_null_req_not_net_done(peer_device, req);
} }
if (s & RQ_NET_PENDING) if (req->rq_state & RQ_NET_PENDING)
set_if_null_req_ack_pending(peer_device, req); set_if_null_req_ack_pending(peer_device, req);
} }
...@@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req) ...@@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req)
return false; return false;
} }
bool drbd_should_do_remote(union drbd_dev_state s)
{
return s.pdsk == D_UP_TO_DATE ||
(s.pdsk >= D_INCONSISTENT &&
s.conn >= C_WF_BITMAP_T &&
s.conn < C_AHEAD);
/* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
states. */
}
static bool drbd_should_send_out_of_sync(union drbd_dev_state s)
{
return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
since we enter state C_AHEAD only if proto >= 96 */
}
/* returns number of connections (== 1, for drbd 8.4) /* returns number of connections (== 1, for drbd 8.4)
* expected to actually write this data, * expected to actually write this data,
* which does NOT include those that we are L_AHEAD for. */ * which does NOT include those that we are L_AHEAD for. */
...@@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req) ...@@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req)
* stable storage, and this is a WRITE, we may not even submit * stable storage, and this is a WRITE, we may not even submit
* this bio. */ * this bio. */
if (get_ldev(device)) { if (get_ldev(device)) {
req->pre_submit_jif = jiffies;
if (drbd_insert_fault(device, if (drbd_insert_fault(device,
rw == WRITE ? DRBD_FAULT_DT_WR rw == WRITE ? DRBD_FAULT_DT_WR
: rw == READ ? DRBD_FAULT_DT_RD : rw == READ ? DRBD_FAULT_DT_RD
...@@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request ...@@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
&device->pending_master_completion[rw == WRITE]); &device->pending_master_completion[rw == WRITE]);
if (req->private_bio) { if (req->private_bio) {
/* needs to be marked within the same spinlock */ /* needs to be marked within the same spinlock */
req->pre_submit_jif = jiffies;
list_add_tail(&req->req_pending_local, list_add_tail(&req->req_pending_local,
&device->pending_completion[rw == WRITE]); &device->pending_completion[rw == WRITE]);
_req_mod(req, TO_BE_SUBMITTED); _req_mod(req, TO_BE_SUBMITTED);
...@@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) ...@@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio)
return BLK_QC_T_NONE; return BLK_QC_T_NONE;
} }
static bool net_timeout_reached(struct drbd_request *net_req,
struct drbd_connection *connection,
unsigned long now, unsigned long ent,
unsigned int ko_count, unsigned int timeout)
{
struct drbd_device *device = net_req->device;
if (!time_after(now, net_req->pre_send_jif + ent))
return false;
if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent))
return false;
if (net_req->rq_state & RQ_NET_PENDING) {
drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
return true;
}
/* We received an ACK already (or are using protocol A),
* but are waiting for the epoch closing barrier ack.
* Check if we sent the barrier already. We should not blame the peer
* for being unresponsive, if we did not even ask it yet. */
if (net_req->epoch == connection->send.current_epoch_nr) {
drbd_warn(device,
"We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n",
jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
return false;
}
/* Worst case: we may have been blocked for whatever reason, then
* suddenly are able to send a lot of requests (and epoch separating
* barriers) in quick succession.
* The timestamp of the net_req may be much too old and not correspond
* to the sending time of the relevant unack'ed barrier packet, so
* would trigger a spurious timeout. The latest barrier packet may
* have too recent a timestamp to trigger the timeout, so we could
* miss a timeout. Right now we don't have a place to conveniently store
* these timestamps.
* But in this particular situation, the application requests are still
* completed to upper layers, DRBD should still "feel" responsive.
* No need yet to kill this connection, it may still recover.
* If not, eventually we will have queued enough into the network for
* us to block. From that point of view, the timestamp of the last sent
* barrier packet is relevant enough.
*/
if (time_after(now, connection->send.last_sent_barrier_jif + ent)) {
drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
connection->send.last_sent_barrier_jif, now,
jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout);
return true;
}
return false;
}
/* A request is considered timed out, if
* - we have some effective timeout from the configuration,
* with some state restrictions applied,
* - the oldest request is waiting for a response from the network
* resp. the local disk,
* - the oldest request is in fact older than the effective timeout,
* - the connection was established (resp. disk was attached)
* for longer than the timeout already.
* Note that for 32bit jiffies and very stable connections/disks,
* we may have a wrap around, which is caught by
* !time_in_range(now, last_..._jif, last_..._jif + timeout).
*
* Side effect: once per 32bit wrap-around interval, which means every
* ~198 days with 250 HZ, we have a window where the timeout would need
* to expire twice (worst case) to become effective. Good enough.
*/
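
The effective network timeout referenced above is ent = timeout * HZ/10 * ko_count, i.e. ko-count repetitions of a timeout configured in 0.1-second units, and all comparisons go through the wrap-safe jiffies helpers. A small userspace sketch with assumed example values (timeout = 60, ko_count = 7, HZ = 250); the time_after32()/time_in_range32() reimplementations only mirror the kernel's signed-difference trick and are not the kernel headers themselves.

#include <stdint.h>
#include <stdio.h>

#define HZ 250UL			/* assumed tick rate */

/* 32-bit jiffies with wrap-safe comparison, same signed-difference idea
 * as the kernel's time_after()/time_in_range(). */
static int time_after32(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

static int time_in_range32(uint32_t now, uint32_t lo, uint32_t hi)
{
	return !time_after32(lo, now) && !time_after32(now, hi);
}

int main(void)
{
	uint32_t timeout = 60;		/* 0.1 s units -> 6 s, example value */
	uint32_t ko_count = 7;		/* example value */
	uint32_t ent = timeout * HZ / 10 * ko_count;

	printf("effective timeout: %u jiffies (%lu s)\n", ent, ent / HZ);

	/* One 32-bit jiffies wrap-around interval at HZ=250: */
	printf("wrap interval: ~%.1f days\n", 4294967296.0 / HZ / 86400.0);

	/* The helpers stay correct across the 2^32 wrap: "now" wrapped past 0. */
	uint32_t pre_send = 0xfffffff0u;	/* just before the wrap */
	uint32_t now = pre_send + ent;		/* wraps to a small value */
	printf("timed out: %d\n", time_after32(now, pre_send + ent - 1));
	printf("in window: %d\n", time_in_range32(now, pre_send, pre_send + ent));
	return 0;
}

With these values ent is 10500 jiffies (42 s), and the wrap interval works out to roughly 198.8 days, matching the "~198 days with 250 HZ" figure in the comment.
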
void request_timer_fn(unsigned long data) void request_timer_fn(unsigned long data)
{ {
struct drbd_device *device = (struct drbd_device *) data; struct drbd_device *device = (struct drbd_device *) data;
...@@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data) ...@@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data)
unsigned long oldest_submit_jif; unsigned long oldest_submit_jif;
unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
unsigned long now; unsigned long now;
unsigned int ko_count = 0, timeout = 0;
rcu_read_lock(); rcu_read_lock();
nc = rcu_dereference(connection->net_conf); nc = rcu_dereference(connection->net_conf);
if (nc && device->state.conn >= C_WF_REPORT_PARAMS) if (nc && device->state.conn >= C_WF_REPORT_PARAMS) {
ent = nc->timeout * HZ/10 * nc->ko_count; ko_count = nc->ko_count;
timeout = nc->timeout;
}
if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
...@@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data) ...@@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data)
} }
rcu_read_unlock(); rcu_read_unlock();
ent = timeout * HZ/10 * ko_count;
et = min_not_zero(dt, ent); et = min_not_zero(dt, ent);
if (!et) if (!et)
...@@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data) ...@@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data)
spin_lock_irq(&device->resource->req_lock); spin_lock_irq(&device->resource->req_lock);
req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
req_peer = connection->req_not_net_done;
/* maybe the oldest request waiting for the peer is in fact still /* maybe the oldest request waiting for the peer is in fact still
* blocking in tcp sendmsg */ * blocking in tcp sendmsg. That's ok, though, that's handled via the
if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) * socket send timeout, requesting a ping, and bumping ko-count in
req_peer = connection->req_next; * we_should_drop_the_connection().
*/
/* check the oldest request we successfully sent,
* but which is still waiting for an ACK. */
req_peer = connection->req_ack_pending;
/* if we don't have such a request (e.g. protocol A),
* check the oldest request which is still waiting on its epoch
* closing barrier ack. */
if (!req_peer)
req_peer = connection->req_not_net_done;
/* evaluate the oldest peer request only in one timer! */ /* evaluate the oldest peer request only in one timer! */
if (req_peer && req_peer->device != device) if (req_peer && req_peer->device != device)
...@@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data) ...@@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data)
: req_write ? req_write->pre_submit_jif : req_write ? req_write->pre_submit_jif
: req_read ? req_read->pre_submit_jif : now; : req_read ? req_read->pre_submit_jif : now;
/* The request is considered timed out, if if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout))
* - we have some effective timeout from the configuration,
* with above state restrictions applied,
* - the oldest request is waiting for a response from the network
* resp. the local disk,
* - the oldest request is in fact older than the effective timeout,
* - the connection was established (resp. disk was attached)
* for longer than the timeout already.
* Note that for 32bit jiffies and very stable connections/disks,
* we may have a wrap around, which is catched by
* !time_in_range(now, last_..._jif, last_..._jif + timeout).
*
* Side effect: once per 32bit wrap-around interval, which means every
* ~198 days with 250 HZ, we have a window where the timeout would need
* to expire twice (worst case) to become effective. Good enough.
*/
if (ent && req_peer &&
time_after(now, req_peer->pre_send_jif + ent) &&
!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
}
if (dt && oldest_submit_jif != now && if (dt && oldest_submit_jif != now &&
time_after(now, oldest_submit_jif + dt) && time_after(now, oldest_submit_jif + dt) &&
!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
......
...@@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req, ...@@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req,
return rv; return rv;
} }
static inline bool drbd_should_do_remote(union drbd_dev_state s) extern bool drbd_should_do_remote(union drbd_dev_state);
{
return s.pdsk == D_UP_TO_DATE ||
(s.pdsk >= D_INCONSISTENT &&
s.conn >= C_WF_BITMAP_T &&
s.conn < C_AHEAD);
/* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
states. */
}
static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
{
return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
since we enter state C_AHEAD only if proto >= 96 */
}
#endif #endif
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "drbd_int.h" #include "drbd_int.h"
#include "drbd_protocol.h" #include "drbd_protocol.h"
#include "drbd_req.h" #include "drbd_req.h"
#include "drbd_state_change.h"
struct after_state_chg_work { struct after_state_chg_work {
struct drbd_work w; struct drbd_work w;
...@@ -37,6 +38,7 @@ struct after_state_chg_work { ...@@ -37,6 +38,7 @@ struct after_state_chg_work {
union drbd_state ns; union drbd_state ns;
enum chg_state_flags flags; enum chg_state_flags flags;
struct completion *done; struct completion *done;
struct drbd_state_change *state_change;
}; };
enum sanitize_state_warnings { enum sanitize_state_warnings {
...@@ -48,9 +50,248 @@ enum sanitize_state_warnings { ...@@ -48,9 +50,248 @@ enum sanitize_state_warnings {
IMPLICITLY_UPGRADED_PDSK, IMPLICITLY_UPGRADED_PDSK,
}; };
static void count_objects(struct drbd_resource *resource,
unsigned int *n_devices,
unsigned int *n_connections)
{
struct drbd_device *device;
struct drbd_connection *connection;
int vnr;
*n_devices = 0;
*n_connections = 0;
idr_for_each_entry(&resource->devices, device, vnr)
(*n_devices)++;
for_each_connection(connection, resource)
(*n_connections)++;
}
static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp)
{
struct drbd_state_change *state_change;
unsigned int size, n;
size = sizeof(struct drbd_state_change) +
n_devices * sizeof(struct drbd_device_state_change) +
n_connections * sizeof(struct drbd_connection_state_change) +
n_devices * n_connections * sizeof(struct drbd_peer_device_state_change);
state_change = kmalloc(size, gfp);
if (!state_change)
return NULL;
state_change->n_devices = n_devices;
state_change->n_connections = n_connections;
state_change->devices = (void *)(state_change + 1);
state_change->connections = (void *)&state_change->devices[n_devices];
state_change->peer_devices = (void *)&state_change->connections[n_connections];
state_change->resource->resource = NULL;
for (n = 0; n < n_devices; n++)
state_change->devices[n].device = NULL;
for (n = 0; n < n_connections; n++)
state_change->connections[n].connection = NULL;
return state_change;
}
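
alloc_state_change() puts the header and all three variable-length arrays into a single kmalloc(): the arrays live directly behind the struct, and the devices/connections/peer_devices pointers are just fixed up to point into that one block, so forget_state_change() can release everything with one kfree(). A hedged userspace sketch of the same layout using malloc(); the per-element structs are simplified stand-ins that only exist to show the pointer fix-up.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-ins: just enough fields to show the layout trick. */
struct device_state_change      { int disk_state[2]; };
struct connection_state_change  { int cstate[2]; };
struct peer_device_state_change { int repl_state[2]; };

struct state_change {
	unsigned int n_devices;
	unsigned int n_connections;
	struct device_state_change *devices;
	struct connection_state_change *connections;
	struct peer_device_state_change *peer_devices;
	/* the three arrays follow in the same allocation */
};

static struct state_change *alloc_state_change(unsigned int n_devices,
					       unsigned int n_connections)
{
	struct state_change *sc;
	size_t size = sizeof(*sc) +
		      n_devices * sizeof(struct device_state_change) +
		      n_connections * sizeof(struct connection_state_change) +
		      n_devices * n_connections * sizeof(struct peer_device_state_change);

	sc = malloc(size);
	if (!sc)
		return NULL;
	sc->n_devices = n_devices;
	sc->n_connections = n_connections;
	/* arrays start right after the header, each one behind the previous */
	sc->devices = (void *)(sc + 1);
	sc->connections = (void *)&sc->devices[n_devices];
	sc->peer_devices = (void *)&sc->connections[n_connections];
	return sc;
}

int main(void)
{
	struct state_change *sc = alloc_state_change(2, 1);

	printf("one block, one free(): %p .. %p\n",
	       (void *)sc, (void *)&sc->peer_devices[2 * 1]);
	free(sc);
	return 0;
}
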
struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp)
{
struct drbd_state_change *state_change;
struct drbd_device *device;
unsigned int n_devices;
struct drbd_connection *connection;
unsigned int n_connections;
int vnr;
struct drbd_device_state_change *device_state_change;
struct drbd_peer_device_state_change *peer_device_state_change;
struct drbd_connection_state_change *connection_state_change;
/* Caller holds req_lock spinlock.
* No state, no device IDR, no connection lists can change. */
count_objects(resource, &n_devices, &n_connections);
state_change = alloc_state_change(n_devices, n_connections, gfp);
if (!state_change)
return NULL;
kref_get(&resource->kref);
state_change->resource->resource = resource;
state_change->resource->role[OLD] =
conn_highest_role(first_connection(resource));
state_change->resource->susp[OLD] = resource->susp;
state_change->resource->susp_nod[OLD] = resource->susp_nod;
state_change->resource->susp_fen[OLD] = resource->susp_fen;
connection_state_change = state_change->connections;
for_each_connection(connection, resource) {
kref_get(&connection->kref);
connection_state_change->connection = connection;
connection_state_change->cstate[OLD] =
connection->cstate;
connection_state_change->peer_role[OLD] =
conn_highest_peer(connection);
connection_state_change++;
}
device_state_change = state_change->devices;
peer_device_state_change = state_change->peer_devices;
idr_for_each_entry(&resource->devices, device, vnr) {
kref_get(&device->kref);
device_state_change->device = device;
device_state_change->disk_state[OLD] = device->state.disk;
/* The peer_devices for each device have to be enumerated in
the order of the connections. We may not use for_each_peer_device() here. */
for_each_connection(connection, resource) {
struct drbd_peer_device *peer_device;
peer_device = conn_peer_device(connection, device->vnr);
peer_device_state_change->peer_device = peer_device;
peer_device_state_change->disk_state[OLD] =
device->state.pdsk;
peer_device_state_change->repl_state[OLD] =
max_t(enum drbd_conns,
C_WF_REPORT_PARAMS, device->state.conn);
peer_device_state_change->resync_susp_user[OLD] =
device->state.user_isp;
peer_device_state_change->resync_susp_peer[OLD] =
device->state.peer_isp;
peer_device_state_change->resync_susp_dependency[OLD] =
device->state.aftr_isp;
peer_device_state_change++;
}
device_state_change++;
}
return state_change;
}
static void remember_new_state(struct drbd_state_change *state_change)
{
struct drbd_resource_state_change *resource_state_change;
struct drbd_resource *resource;
unsigned int n;
if (!state_change)
return;
resource_state_change = &state_change->resource[0];
resource = resource_state_change->resource;
resource_state_change->role[NEW] =
conn_highest_role(first_connection(resource));
resource_state_change->susp[NEW] = resource->susp;
resource_state_change->susp_nod[NEW] = resource->susp_nod;
resource_state_change->susp_fen[NEW] = resource->susp_fen;
for (n = 0; n < state_change->n_devices; n++) {
struct drbd_device_state_change *device_state_change =
&state_change->devices[n];
struct drbd_device *device = device_state_change->device;
device_state_change->disk_state[NEW] = device->state.disk;
}
for (n = 0; n < state_change->n_connections; n++) {
struct drbd_connection_state_change *connection_state_change =
&state_change->connections[n];
struct drbd_connection *connection =
connection_state_change->connection;
connection_state_change->cstate[NEW] = connection->cstate;
connection_state_change->peer_role[NEW] =
conn_highest_peer(connection);
}
for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) {
struct drbd_peer_device_state_change *peer_device_state_change =
&state_change->peer_devices[n];
struct drbd_device *device =
peer_device_state_change->peer_device->device;
union drbd_dev_state state = device->state;
peer_device_state_change->disk_state[NEW] = state.pdsk;
peer_device_state_change->repl_state[NEW] =
max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn);
peer_device_state_change->resync_susp_user[NEW] =
state.user_isp;
peer_device_state_change->resync_susp_peer[NEW] =
state.peer_isp;
peer_device_state_change->resync_susp_dependency[NEW] =
state.aftr_isp;
}
}
void copy_old_to_new_state_change(struct drbd_state_change *state_change)
{
struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
#define OLD_TO_NEW(x) \
(x[NEW] = x[OLD])
OLD_TO_NEW(resource_state_change->role);
OLD_TO_NEW(resource_state_change->susp);
OLD_TO_NEW(resource_state_change->susp_nod);
OLD_TO_NEW(resource_state_change->susp_fen);
for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
struct drbd_connection_state_change *connection_state_change =
&state_change->connections[n_connection];
OLD_TO_NEW(connection_state_change->peer_role);
OLD_TO_NEW(connection_state_change->cstate);
}
for (n_device = 0; n_device < state_change->n_devices; n_device++) {
struct drbd_device_state_change *device_state_change =
&state_change->devices[n_device];
OLD_TO_NEW(device_state_change->disk_state);
}
n_peer_devices = state_change->n_devices * state_change->n_connections;
for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
struct drbd_peer_device_state_change *p =
&state_change->peer_devices[n_peer_device];
OLD_TO_NEW(p->disk_state);
OLD_TO_NEW(p->repl_state);
OLD_TO_NEW(p->resync_susp_user);
OLD_TO_NEW(p->resync_susp_peer);
OLD_TO_NEW(p->resync_susp_dependency);
}
#undef OLD_TO_NEW
}
void forget_state_change(struct drbd_state_change *state_change)
{
unsigned int n;
if (!state_change)
return;
if (state_change->resource->resource)
kref_put(&state_change->resource->resource->kref, drbd_destroy_resource);
for (n = 0; n < state_change->n_devices; n++) {
struct drbd_device *device = state_change->devices[n].device;
if (device)
kref_put(&device->kref, drbd_destroy_device);
}
for (n = 0; n < state_change->n_connections; n++) {
struct drbd_connection *connection =
state_change->connections[n].connection;
if (connection)
kref_put(&connection->kref, drbd_destroy_connection);
}
kfree(state_change);
}
static int w_after_state_ch(struct drbd_work *w, int unused); static int w_after_state_ch(struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_device *device, union drbd_state os, static void after_state_ch(struct drbd_device *device, union drbd_state os,
union drbd_state ns, enum chg_state_flags flags); union drbd_state ns, enum chg_state_flags flags,
struct drbd_state_change *);
static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
...@@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) ...@@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
return R_SECONDARY; return R_SECONDARY;
return R_UNKNOWN; return R_UNKNOWN;
} }
static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
{ {
if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
...@@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device) ...@@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device)
drbd_info(device, "Resumed AL updates\n"); drbd_info(device, "Resumed AL updates\n");
} }
/* helper for __drbd_set_state */ /* helper for _drbd_set_state */
static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
{ {
if (first_peer_device(device)->connection->agreed_pro_version < 90) if (first_peer_device(device)->connection->agreed_pro_version < 90)
...@@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) ...@@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
} }
/** /**
* __drbd_set_state() - Set a new DRBD state * _drbd_set_state() - Set a new DRBD state
* @device: DRBD device. * @device: DRBD device.
* @ns: new state. * @ns: new state.
* @flags: Flags * @flags: Flags
* @done: Optional completion, that will get completed after the after_state_ch() finished * @done: Optional completion, that will get completed after the after_state_ch() finished
* *
* Caller needs to hold req_lock, and global_state_lock. Do not call directly. * Caller needs to hold req_lock. Do not call directly.
*/ */
enum drbd_state_rv enum drbd_state_rv
__drbd_set_state(struct drbd_device *device, union drbd_state ns, _drbd_set_state(struct drbd_device *device, union drbd_state ns,
enum chg_state_flags flags, struct completion *done) enum chg_state_flags flags, struct completion *done)
{ {
struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_peer_device *peer_device = first_peer_device(device);
struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
...@@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ...@@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
enum drbd_state_rv rv = SS_SUCCESS; enum drbd_state_rv rv = SS_SUCCESS;
enum sanitize_state_warnings ssw; enum sanitize_state_warnings ssw;
struct after_state_chg_work *ascw; struct after_state_chg_work *ascw;
struct drbd_state_change *state_change;
os = drbd_read_state(device); os = drbd_read_state(device);
...@@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ...@@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
clear_bit(RS_DONE, &device->flags); clear_bit(RS_DONE, &device->flags);
/* FIXME: Have any flags been set earlier in this function already? */
state_change = remember_old_state(device->resource, GFP_ATOMIC);
/* changes to local_cnt and device flags should be visible before /* changes to local_cnt and device flags should be visible before
* changes to state, which again should be visible before anything else * changes to state, which again should be visible before anything else
* depending on that change happens. */ * depending on that change happens. */
...@@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ...@@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
device->resource->susp_fen = ns.susp_fen; device->resource->susp_fen = ns.susp_fen;
smp_wmb(); smp_wmb();
remember_new_state(state_change);
/* put replicated vs not-replicated requests in seperate epochs */ /* put replicated vs not-replicated requests in seperate epochs */
if (drbd_should_do_remote((union drbd_dev_state)os.i) != if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
drbd_should_do_remote((union drbd_dev_state)ns.i)) drbd_should_do_remote((union drbd_dev_state)ns.i))
...@@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ...@@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
ascw->w.cb = w_after_state_ch; ascw->w.cb = w_after_state_ch;
ascw->device = device; ascw->device = device;
ascw->done = done; ascw->done = done;
ascw->state_change = state_change;
drbd_queue_work(&connection->sender_work, drbd_queue_work(&connection->sender_work,
&ascw->w); &ascw->w);
} else { } else {
...@@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused) ...@@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused)
container_of(w, struct after_state_chg_work, w); container_of(w, struct after_state_chg_work, w);
struct drbd_device *device = ascw->device; struct drbd_device *device = ascw->device;
after_state_ch(device, ascw->os, ascw->ns, ascw->flags); after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change);
forget_state_change(ascw->state_change);
if (ascw->flags & CS_WAIT_COMPLETE) if (ascw->flags & CS_WAIT_COMPLETE)
complete(ascw->done); complete(ascw->done);
kfree(ascw); kfree(ascw);
...@@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, ...@@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
/* open coded non-blocking drbd_suspend_io(device); */ /* open coded non-blocking drbd_suspend_io(device); */
set_bit(SUSPEND_IO, &device->flags); atomic_inc(&device->suspend_cnt);
drbd_bm_lock(device, why, flags); drbd_bm_lock(device, why, flags);
rv = io_fn(device); rv = io_fn(device);
...@@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, ...@@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
return rv; return rv;
} }
void notify_resource_state_change(struct sk_buff *skb,
unsigned int seq,
struct drbd_resource_state_change *resource_state_change,
enum drbd_notification_type type)
{
struct drbd_resource *resource = resource_state_change->resource;
struct resource_info resource_info = {
.res_role = resource_state_change->role[NEW],
.res_susp = resource_state_change->susp[NEW],
.res_susp_nod = resource_state_change->susp_nod[NEW],
.res_susp_fen = resource_state_change->susp_fen[NEW],
};
notify_resource_state(skb, seq, resource, &resource_info, type);
}
void notify_connection_state_change(struct sk_buff *skb,
unsigned int seq,
struct drbd_connection_state_change *connection_state_change,
enum drbd_notification_type type)
{
struct drbd_connection *connection = connection_state_change->connection;
struct connection_info connection_info = {
.conn_connection_state = connection_state_change->cstate[NEW],
.conn_role = connection_state_change->peer_role[NEW],
};
notify_connection_state(skb, seq, connection, &connection_info, type);
}
void notify_device_state_change(struct sk_buff *skb,
unsigned int seq,
struct drbd_device_state_change *device_state_change,
enum drbd_notification_type type)
{
struct drbd_device *device = device_state_change->device;
struct device_info device_info = {
.dev_disk_state = device_state_change->disk_state[NEW],
};
notify_device_state(skb, seq, device, &device_info, type);
}
void notify_peer_device_state_change(struct sk_buff *skb,
unsigned int seq,
struct drbd_peer_device_state_change *p,
enum drbd_notification_type type)
{
struct drbd_peer_device *peer_device = p->peer_device;
struct peer_device_info peer_device_info = {
.peer_repl_state = p->repl_state[NEW],
.peer_disk_state = p->disk_state[NEW],
.peer_resync_susp_user = p->resync_susp_user[NEW],
.peer_resync_susp_peer = p->resync_susp_peer[NEW],
.peer_resync_susp_dependency = p->resync_susp_dependency[NEW],
};
notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
}
static void broadcast_state_change(struct drbd_state_change *state_change)
{
struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
bool resource_state_has_changed;
unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
void (*last_func)(struct sk_buff *, unsigned int, void *,
enum drbd_notification_type) = NULL;
void *uninitialized_var(last_arg);
#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
#define FINAL_STATE_CHANGE(type) \
({ if (last_func) \
last_func(NULL, 0, last_arg, type); \
})
#define REMEMBER_STATE_CHANGE(func, arg, type) \
({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
last_func = (typeof(last_func))func; \
last_arg = arg; \
})
mutex_lock(&notification_mutex);
resource_state_has_changed =
HAS_CHANGED(resource_state_change->role) ||
HAS_CHANGED(resource_state_change->susp) ||
HAS_CHANGED(resource_state_change->susp_nod) ||
HAS_CHANGED(resource_state_change->susp_fen);
if (resource_state_has_changed)
REMEMBER_STATE_CHANGE(notify_resource_state_change,
resource_state_change, NOTIFY_CHANGE);
for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
struct drbd_connection_state_change *connection_state_change =
&state_change->connections[n_connection];
if (HAS_CHANGED(connection_state_change->peer_role) ||
HAS_CHANGED(connection_state_change->cstate))
REMEMBER_STATE_CHANGE(notify_connection_state_change,
connection_state_change, NOTIFY_CHANGE);
}
for (n_device = 0; n_device < state_change->n_devices; n_device++) {
struct drbd_device_state_change *device_state_change =
&state_change->devices[n_device];
if (HAS_CHANGED(device_state_change->disk_state))
REMEMBER_STATE_CHANGE(notify_device_state_change,
device_state_change, NOTIFY_CHANGE);
}
n_peer_devices = state_change->n_devices * state_change->n_connections;
for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
struct drbd_peer_device_state_change *p =
&state_change->peer_devices[n_peer_device];
if (HAS_CHANGED(p->disk_state) ||
HAS_CHANGED(p->repl_state) ||
HAS_CHANGED(p->resync_susp_user) ||
HAS_CHANGED(p->resync_susp_peer) ||
HAS_CHANGED(p->resync_susp_dependency))
REMEMBER_STATE_CHANGE(notify_peer_device_state_change,
p, NOTIFY_CHANGE);
}
FINAL_STATE_CHANGE(NOTIFY_CHANGE);
mutex_unlock(&notification_mutex);
#undef HAS_CHANGED
#undef FINAL_STATE_CHANGE
#undef REMEMBER_STATE_CHANGE
}
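
broadcast_state_change() never emits a notification the moment it finds a change: REMEMBER_STATE_CHANGE() first flushes the previously remembered one with NOTIFY_CONTINUES or'ed in, and only the final FINAL_STATE_CHANGE() goes out without that bit, so a listener can tell which event closes a multi-object change. A small userspace sketch of this "defer the previous, flag all but the last" pattern; the flag values and the emit() function are made up for illustration and are not the DRBD netlink ABI.

#include <stdio.h>

#define NOTIFY_CHANGE    0x01	/* illustrative values, not the real ABI */
#define NOTIFY_CONTINUES 0x80

static void emit(const char *what, unsigned int type)
{
	printf("notify %-10s type=0x%02x%s\n", what, type,
	       (type & NOTIFY_CONTINUES) ? " (more to follow)" : " (final)");
}

int main(void)
{
	const char *changed[] = { "resource", "device0", "peer0" };
	const char *pending = NULL;	/* the remembered, not yet sent, event */

	for (unsigned int i = 0; i < sizeof(changed) / sizeof(changed[0]); i++) {
		/* REMEMBER_STATE_CHANGE(): flush the previous event, marked
		 * as "continues", then remember the current one. */
		if (pending)
			emit(pending, NOTIFY_CHANGE | NOTIFY_CONTINUES);
		pending = changed[i];
	}
	/* FINAL_STATE_CHANGE(): the last remembered event goes out without
	 * NOTIFY_CONTINUES, closing the group. */
	if (pending)
		emit(pending, NOTIFY_CHANGE);
	return 0;
}
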
/** /**
* after_state_ch() - Perform after state change actions that may sleep * after_state_ch() - Perform after state change actions that may sleep
* @device: DRBD device. * @device: DRBD device.
...@@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, ...@@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
* @flags: Flags * @flags: Flags
*/ */
static void after_state_ch(struct drbd_device *device, union drbd_state os, static void after_state_ch(struct drbd_device *device, union drbd_state os,
union drbd_state ns, enum chg_state_flags flags) union drbd_state ns, enum chg_state_flags flags,
struct drbd_state_change *state_change)
{ {
struct drbd_resource *resource = device->resource; struct drbd_resource *resource = device->resource;
struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_peer_device *peer_device = first_peer_device(device);
struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
struct sib_info sib; struct sib_info sib;
broadcast_state_change(state_change);
sib.sib_reason = SIB_STATE_CHANGE; sib.sib_reason = SIB_STATE_CHANGE;
sib.os = os; sib.os = os;
sib.ns = ns; sib.ns = ns;
...@@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, ...@@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
} }
if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY &&
device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
drbd_uuid_new_current(device); drbd_uuid_new_current(device);
drbd_send_uuids(peer_device); drbd_send_uuids(peer_device);
...@@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, ...@@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
if (os.disk != D_FAILED && ns.disk == D_FAILED) { if (os.disk != D_FAILED && ns.disk == D_FAILED) {
enum drbd_io_error_p eh = EP_PASS_ON; enum drbd_io_error_p eh = EP_PASS_ON;
int was_io_error = 0; int was_io_error = 0;
/* corresponding get_ldev was in __drbd_set_state, to serialize /* corresponding get_ldev was in _drbd_set_state, to serialize
* our cleanup here with the transition to D_DISKLESS. * our cleanup here with the transition to D_DISKLESS.
	 * But it is still not safe to dereference ldev here, since	 * But it is still not safe to dereference ldev here, since
	 * we might come from a failed Attach before ldev was set. */	 * we might come from a failed Attach before ldev was set. */
...@@ -1455,6 +1841,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, ...@@ -1455,6 +1841,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
/* Intentionally call this handler first, before drbd_send_state().
* See: 2932204 drbd: call local-io-error handler early
	 * People may choose to hard-reset the box from this handler.
* It is useful if this looks like a "regular node crash". */
if (was_io_error && eh == EP_CALL_HELPER) if (was_io_error && eh == EP_CALL_HELPER)
drbd_khelper(device, "local-io-error"); drbd_khelper(device, "local-io-error");
...@@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work { ...@@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work {
union drbd_state ns_max; /* new, max state, over all devices */ union drbd_state ns_max; /* new, max state, over all devices */
enum chg_state_flags flags; enum chg_state_flags flags;
struct drbd_connection *connection; struct drbd_connection *connection;
struct drbd_state_change *state_change;
}; };
static int w_after_conn_state_ch(struct drbd_work *w, int unused) static int w_after_conn_state_ch(struct drbd_work *w, int unused)
...@@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) ...@@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
struct drbd_peer_device *peer_device; struct drbd_peer_device *peer_device;
int vnr; int vnr;
broadcast_state_change(acscw->state_change);
forget_state_change(acscw->state_change);
kfree(acscw); kfree(acscw);
/* Upon network configuration, we need to start the receiver */ /* Upon network configuration, we need to start the receiver */
...@@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) ...@@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
struct net_conf *old_conf; struct net_conf *old_conf;
mutex_lock(&notification_mutex);
idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
notify_peer_device_state(NULL, 0, peer_device, NULL,
NOTIFY_DESTROY | NOTIFY_CONTINUES);
notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY);
mutex_unlock(&notification_mutex);
mutex_lock(&connection->resource->conf_update); mutex_lock(&connection->resource->conf_update);
old_conf = connection->net_conf; old_conf = connection->net_conf;
connection->my_addr_len = 0; connection->my_addr_len = 0;
...@@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union ...@@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
ns.disk = os.disk; ns.disk = os.disk;
rv = __drbd_set_state(device, ns, flags, NULL); rv = _drbd_set_state(device, ns, flags, NULL);
if (rv < SS_SUCCESS) if (rv < SS_SUCCESS)
BUG(); BUG();
...@@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u ...@@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
enum drbd_conns oc = connection->cstate; enum drbd_conns oc = connection->cstate;
union drbd_state ns_max, ns_min, os; union drbd_state ns_max, ns_min, os;
bool have_mutex = false; bool have_mutex = false;
struct drbd_state_change *state_change;
if (mask.conn) { if (mask.conn) {
rv = is_valid_conn_transition(oc, val.conn); rv = is_valid_conn_transition(oc, val.conn);
...@@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u ...@@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
goto abort; goto abort;
} }
state_change = remember_old_state(connection->resource, GFP_ATOMIC);
conn_old_common_state(connection, &os, &flags); conn_old_common_state(connection, &os, &flags);
flags |= CS_DC_SUSP; flags |= CS_DC_SUSP;
conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
conn_pr_state_change(connection, os, ns_max, flags); conn_pr_state_change(connection, os, ns_max, flags);
remember_new_state(state_change);
acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
if (acscw) { if (acscw) {
...@@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u ...@@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
acscw->w.cb = w_after_conn_state_ch; acscw->w.cb = w_after_conn_state_ch;
kref_get(&connection->kref); kref_get(&connection->kref);
acscw->connection = connection; acscw->connection = connection;
acscw->state_change = state_change;
drbd_queue_work(&connection->sender_work, &acscw->w); drbd_queue_work(&connection->sender_work, &acscw->w);
} else { } else {
drbd_err(connection, "Could not kmalloc an acscw\n"); drbd_err(connection, "Could not kmalloc an acscw\n");
......
...@@ -122,9 +122,9 @@ extern enum drbd_state_rv ...@@ -122,9 +122,9 @@ extern enum drbd_state_rv
_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, _drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
union drbd_state, enum chg_state_flags); union drbd_state, enum chg_state_flags);
extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state,
enum chg_state_flags, enum chg_state_flags,
struct completion *done); struct completion *done);
extern void print_st_err(struct drbd_device *, union drbd_state, extern void print_st_err(struct drbd_device *, union drbd_state,
union drbd_state, int); union drbd_state, int);
......
#ifndef DRBD_STATE_CHANGE_H
#define DRBD_STATE_CHANGE_H
struct drbd_resource_state_change {
struct drbd_resource *resource;
enum drbd_role role[2];
bool susp[2];
bool susp_nod[2];
bool susp_fen[2];
};
struct drbd_device_state_change {
struct drbd_device *device;
enum drbd_disk_state disk_state[2];
};
struct drbd_connection_state_change {
struct drbd_connection *connection;
enum drbd_conns cstate[2]; /* drbd9: enum drbd_conn_state */
enum drbd_role peer_role[2];
};
struct drbd_peer_device_state_change {
struct drbd_peer_device *peer_device;
enum drbd_disk_state disk_state[2];
enum drbd_conns repl_state[2]; /* drbd9: enum drbd_repl_state */
bool resync_susp_user[2];
bool resync_susp_peer[2];
bool resync_susp_dependency[2];
};
struct drbd_state_change {
struct list_head list;
unsigned int n_devices;
unsigned int n_connections;
struct drbd_resource_state_change resource[1];
struct drbd_device_state_change *devices;
struct drbd_connection_state_change *connections;
struct drbd_peer_device_state_change *peer_devices;
};
extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t);
extern void copy_old_to_new_state_change(struct drbd_state_change *);
extern void forget_state_change(struct drbd_state_change *);
extern void notify_resource_state_change(struct sk_buff *,
unsigned int,
struct drbd_resource_state_change *,
enum drbd_notification_type type);
extern void notify_connection_state_change(struct sk_buff *,
unsigned int,
struct drbd_connection_state_change *,
enum drbd_notification_type type);
extern void notify_device_state_change(struct sk_buff *,
unsigned int,
struct drbd_device_state_change *,
enum drbd_notification_type type);
extern void notify_peer_device_state_change(struct sk_buff *,
unsigned int,
struct drbd_peer_device_state_change *,
enum drbd_notification_type type);
#endif /* DRBD_STATE_CHANGE_H */
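
For readers following the new interface: every tracked field in these structures is recorded twice, indexed by OLD and NEW, so comparing and multicasting the differences can happen after the transition, outside the locked fast path. A rough user-space model of that lifecycle (remember the old values, fill in the new ones, broadcast the differences, then free the snapshot) could look like the sketch below; it is an illustration only, with invented helper names and made-up disk-state numbers.

/* Model of the OLD/NEW snapshot idea behind struct drbd_*_state_change. */
#include <stdio.h>
#include <stdlib.h>

enum { OLD, NEW };

struct device_state_change {
	int disk_state[2];
};

static struct device_state_change *remember_old(int disk_state)
{
	struct device_state_change *c = malloc(sizeof(*c));

	if (c)
		c->disk_state[OLD] = disk_state;
	return c;
}

static void remember_new(struct device_state_change *c, int disk_state)
{
	c->disk_state[NEW] = disk_state;
}

static void broadcast(const struct device_state_change *c)
{
	if (c->disk_state[OLD] != c->disk_state[NEW])
		printf("disk state changed: %d -> %d\n",
		       c->disk_state[OLD], c->disk_state[NEW]);
}

int main(void)
{
	int disk_state = 4;				/* pretend "outdated" */
	struct device_state_change *c = remember_old(disk_state);

	disk_state = 8;					/* pretend "up to date" */
	if (c) {
		remember_new(c, disk_state);
		broadcast(c);				/* done outside the hot path */
		free(c);				/* forget_state_change() analogue */
	}
	return 0;
}
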
...@@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int); ...@@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int);
* *
*/ */
/* About the global_state_lock
Each state transition on an device holds a read lock. In case we have
to evaluate the resync after dependencies, we grab a write lock, because
we need stable states on all devices for that. */
rwlock_t global_state_lock;
/* used for synchronous meta data and bitmap IO /* used for synchronous meta data and bitmap IO
* submitted by drbd_md_sync_page_io() * submitted by drbd_md_sync_page_io()
*/ */
...@@ -120,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l ...@@ -120,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
unsigned long flags = 0; unsigned long flags = 0;
struct drbd_peer_device *peer_device = peer_req->peer_device; struct drbd_peer_device *peer_device = peer_req->peer_device;
struct drbd_device *device = peer_device->device; struct drbd_device *device = peer_device->device;
struct drbd_connection *connection = peer_device->connection;
struct drbd_interval i; struct drbd_interval i;
int do_wake; int do_wake;
u64 block_id; u64 block_id;
...@@ -152,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l ...@@ -152,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
* ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
if (peer_req->flags & EE_WAS_ERROR) if (peer_req->flags & EE_WAS_ERROR)
__drbd_chk_io_error(device, DRBD_WRITE_ERROR); __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
if (connection->cstate >= C_WF_REPORT_PARAMS) {
kref_get(&device->kref); /* put is in drbd_send_acks_wf() */
if (!queue_work(connection->ack_sender, &peer_device->send_acks_work))
kref_put(&device->kref, drbd_destroy_device);
}
spin_unlock_irqrestore(&device->resource->req_lock, flags); spin_unlock_irqrestore(&device->resource->req_lock, flags);
if (block_id == ID_SYNCER) if (block_id == ID_SYNCER)
...@@ -163,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l ...@@ -163,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
if (do_al_complete_io) if (do_al_complete_io)
drbd_al_complete_io(device, &i); drbd_al_complete_io(device, &i);
wake_asender(peer_device->connection);
put_ldev(device); put_ldev(device);
} }
...@@ -195,6 +194,12 @@ void drbd_peer_request_endio(struct bio *bio) ...@@ -195,6 +194,12 @@ void drbd_peer_request_endio(struct bio *bio)
} }
} }
void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device)
{
panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n",
device->minor, device->resource->name, device->vnr);
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
*/ */
void drbd_request_endio(struct bio *bio) void drbd_request_endio(struct bio *bio)
...@@ -238,7 +243,7 @@ void drbd_request_endio(struct bio *bio) ...@@ -238,7 +243,7 @@ void drbd_request_endio(struct bio *bio)
drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
if (!bio->bi_error) if (!bio->bi_error)
panic("possible random memory corruption caused by delayed completion of aborted local request\n"); drbd_panic_after_delayed_completion_of_aborted_request(device);
} }
/* to avoid recursion in __req_mod */ /* to avoid recursion in __req_mod */
...@@ -1291,6 +1296,7 @@ static int drbd_send_barrier(struct drbd_connection *connection) ...@@ -1291,6 +1296,7 @@ static int drbd_send_barrier(struct drbd_connection *connection)
p->barrier = connection->send.current_epoch_nr; p->barrier = connection->send.current_epoch_nr;
p->pad = 0; p->pad = 0;
connection->send.current_epoch_writes = 0; connection->send.current_epoch_writes = 0;
connection->send.last_sent_barrier_jif = jiffies;
return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
} }
...@@ -1315,6 +1321,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned ...@@ -1315,6 +1321,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned
connection->send.seen_any_write_yet = true; connection->send.seen_any_write_yet = true;
connection->send.current_epoch_nr = epoch; connection->send.current_epoch_nr = epoch;
connection->send.current_epoch_writes = 0; connection->send.current_epoch_writes = 0;
connection->send.last_sent_barrier_jif = jiffies;
} }
} }
...@@ -1456,70 +1463,73 @@ static int _drbd_may_sync_now(struct drbd_device *device) ...@@ -1456,70 +1463,73 @@ static int _drbd_may_sync_now(struct drbd_device *device)
} }
/** /**
* _drbd_pause_after() - Pause resync on all devices that may not resync now * drbd_pause_after() - Pause resync on all devices that may not resync now
* @device: DRBD device. * @device: DRBD device.
* *
* Called from process context only (admin command and after_state_ch). * Called from process context only (admin command and after_state_ch).
*/ */
static int _drbd_pause_after(struct drbd_device *device) static bool drbd_pause_after(struct drbd_device *device)
{ {
bool changed = false;
struct drbd_device *odev; struct drbd_device *odev;
int i, rv = 0; int i;
rcu_read_lock(); rcu_read_lock();
idr_for_each_entry(&drbd_devices, odev, i) { idr_for_each_entry(&drbd_devices, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue; continue;
if (!_drbd_may_sync_now(odev)) if (!_drbd_may_sync_now(odev) &&
rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) _drbd_set_state(_NS(odev, aftr_isp, 1),
!= SS_NOTHING_TO_DO); CS_HARD, NULL) != SS_NOTHING_TO_DO)
changed = true;
} }
rcu_read_unlock(); rcu_read_unlock();
return rv; return changed;
} }
/** /**
* _drbd_resume_next() - Resume resync on all devices that may resync now * drbd_resume_next() - Resume resync on all devices that may resync now
* @device: DRBD device. * @device: DRBD device.
* *
* Called from process context only (admin command and worker). * Called from process context only (admin command and worker).
*/ */
static int _drbd_resume_next(struct drbd_device *device) static bool drbd_resume_next(struct drbd_device *device)
{ {
bool changed = false;
struct drbd_device *odev; struct drbd_device *odev;
int i, rv = 0; int i;
rcu_read_lock(); rcu_read_lock();
idr_for_each_entry(&drbd_devices, odev, i) { idr_for_each_entry(&drbd_devices, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue; continue;
if (odev->state.aftr_isp) { if (odev->state.aftr_isp) {
if (_drbd_may_sync_now(odev)) if (_drbd_may_sync_now(odev) &&
rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), _drbd_set_state(_NS(odev, aftr_isp, 0),
CS_HARD, NULL) CS_HARD, NULL) != SS_NOTHING_TO_DO)
!= SS_NOTHING_TO_DO) ; changed = true;
} }
} }
rcu_read_unlock(); rcu_read_unlock();
return rv; return changed;
} }
void resume_next_sg(struct drbd_device *device) void resume_next_sg(struct drbd_device *device)
{ {
write_lock_irq(&global_state_lock); lock_all_resources();
_drbd_resume_next(device); drbd_resume_next(device);
write_unlock_irq(&global_state_lock); unlock_all_resources();
} }
void suspend_other_sg(struct drbd_device *device) void suspend_other_sg(struct drbd_device *device)
{ {
write_lock_irq(&global_state_lock); lock_all_resources();
_drbd_pause_after(device); drbd_pause_after(device);
write_unlock_irq(&global_state_lock); unlock_all_resources();
} }
/* caller must hold global_state_lock */ /* caller must lock_all_resources() */
enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
{ {
struct drbd_device *odev; struct drbd_device *odev;
...@@ -1557,15 +1567,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min ...@@ -1557,15 +1567,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min
} }
} }
/* caller must hold global_state_lock */ /* caller must lock_all_resources() */
void drbd_resync_after_changed(struct drbd_device *device) void drbd_resync_after_changed(struct drbd_device *device)
{ {
int changes; int changed;
do { do {
changes = _drbd_pause_after(device); changed = drbd_pause_after(device);
changes |= _drbd_resume_next(device); changed |= drbd_resume_next(device);
} while (changes); } while (changed);
} }
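
The loop above simply repeats the two adjustment passes until a whole round reports no further change, i.e. until the resync-after dependencies reach a fixed point. A toy stand-alone version of that control flow, with made-up pass functions standing in for drbd_pause_after() and drbd_resume_next(), is shown below for illustration only.

#include <stdbool.h>
#include <stdio.h>

static int value = 5;

static bool pass_down(void)	/* stands in for drbd_pause_after() */
{
	if (value > 3) {
		value--;
		return true;
	}
	return false;
}

static bool pass_up(void)	/* stands in for drbd_resume_next() */
{
	if (value < 3) {
		value++;
		return true;
	}
	return false;
}

int main(void)
{
	bool changed;

	do {
		changed = pass_down();
		changed |= pass_up();
	} while (changed);	/* stop once a full round changes nothing */

	printf("settled at %d\n", value);
	return 0;
}
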
void drbd_rs_controller_reset(struct drbd_device *device) void drbd_rs_controller_reset(struct drbd_device *device)
...@@ -1685,19 +1695,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) ...@@ -1685,19 +1695,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
} else { } else {
mutex_lock(device->state_mutex); mutex_lock(device->state_mutex);
} }
clear_bit(B_RS_H_DONE, &device->flags);
/* req_lock: serialize with drbd_send_and_submit() and others lock_all_resources();
* global_state_lock: for stable sync-after dependencies */ clear_bit(B_RS_H_DONE, &device->flags);
spin_lock_irq(&device->resource->req_lock);
write_lock(&global_state_lock);
/* Did some connection breakage or IO error race with us? */ /* Did some connection breakage or IO error race with us? */
if (device->state.conn < C_CONNECTED if (device->state.conn < C_CONNECTED
|| !get_ldev_if_state(device, D_NEGOTIATING)) { || !get_ldev_if_state(device, D_NEGOTIATING)) {
write_unlock(&global_state_lock); unlock_all_resources();
spin_unlock_irq(&device->resource->req_lock); goto out;
mutex_unlock(device->state_mutex);
return;
} }
ns = drbd_read_state(device); ns = drbd_read_state(device);
...@@ -1711,7 +1716,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) ...@@ -1711,7 +1716,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
else /* side == C_SYNC_SOURCE */ else /* side == C_SYNC_SOURCE */
ns.pdsk = D_INCONSISTENT; ns.pdsk = D_INCONSISTENT;
r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); r = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
ns = drbd_read_state(device); ns = drbd_read_state(device);
if (ns.conn < C_CONNECTED) if (ns.conn < C_CONNECTED)
...@@ -1732,7 +1737,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) ...@@ -1732,7 +1737,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
device->rs_mark_left[i] = tw; device->rs_mark_left[i] = tw;
device->rs_mark_time[i] = now; device->rs_mark_time[i] = now;
} }
_drbd_pause_after(device); drbd_pause_after(device);
/* Forget potentially stale cached per resync extent bit-counts. /* Forget potentially stale cached per resync extent bit-counts.
* Open coded drbd_rs_cancel_all(device), we already have IRQs * Open coded drbd_rs_cancel_all(device), we already have IRQs
* disabled, and know the disk state is ok. */ * disabled, and know the disk state is ok. */
...@@ -1742,8 +1747,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) ...@@ -1742,8 +1747,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
device->resync_wenr = LC_FREE; device->resync_wenr = LC_FREE;
spin_unlock(&device->al_lock); spin_unlock(&device->al_lock);
} }
write_unlock(&global_state_lock); unlock_all_resources();
spin_unlock_irq(&device->resource->req_lock);
if (r == SS_SUCCESS) { if (r == SS_SUCCESS) {
wake_up(&device->al_wait); /* for lc_reset() above */ wake_up(&device->al_wait); /* for lc_reset() above */
...@@ -1807,6 +1811,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) ...@@ -1807,6 +1811,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
drbd_md_sync(device); drbd_md_sync(device);
} }
put_ldev(device); put_ldev(device);
out:
mutex_unlock(device->state_mutex); mutex_unlock(device->state_mutex);
} }
...@@ -1836,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device) ...@@ -1836,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device)
device->act_log = NULL; device->act_log = NULL;
__acquire(local); __acquire(local);
drbd_free_ldev(device->ldev); drbd_backing_dev_free(device, device->ldev);
device->ldev = NULL; device->ldev = NULL;
__release(local); __release(local);
......
...@@ -104,9 +104,9 @@ ...@@ -104,9 +104,9 @@
/* Device instance number, incremented each time a device is probed. */ /* Device instance number, incremented each time a device is probed. */
static int instance; static int instance;
struct list_head online_list; static struct list_head online_list;
struct list_head removing_list; static struct list_head removing_list;
spinlock_t dev_lock; static spinlock_t dev_lock;
/* /*
* Global variable used to hold the major block device number * Global variable used to hold the major block device number
......
...@@ -495,17 +495,17 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) ...@@ -495,17 +495,17 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
id->ppaf.ch_offset = 56; id->ppaf.ch_offset = 56;
id->ppaf.ch_len = 8; id->ppaf.ch_len = 8;
do_div(size, bs); /* convert size to pages */ sector_div(size, bs); /* convert size to pages */
	do_div(size, 256); /* convert size to pages per block */ size >>= 8; /* convert size to pages per block */
grp = &id->groups[0]; grp = &id->groups[0];
grp->mtype = 0; grp->mtype = 0;
grp->fmtype = 0; grp->fmtype = 0;
grp->num_ch = 1; grp->num_ch = 1;
grp->num_pg = 256; grp->num_pg = 256;
blksize = size; blksize = size;
do_div(size, (1 << 16)); size >>= 16;
grp->num_lun = size + 1; grp->num_lun = size + 1;
do_div(blksize, grp->num_lun); sector_div(blksize, grp->num_lun);
grp->num_blk = blksize; grp->num_blk = blksize;
grp->num_pln = 1; grp->num_pln = 1;
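
The power-of-two divisions in this hunk become shifts, which is an exact transformation for unsigned values: size / 256 == size >> 8 and size / 65536 == size >> 16. A quick stand-alone check (illustration only, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size;

	/* sample a range of values and confirm the shift replacements */
	for (size = 0; size < 1000000; size += 4097) {
		assert(size / 256 == (size >> 8));
		assert(size / (1 << 16) == (size >> 16));
	}
	printf("shift replacements are equivalent for unsigned values\n");
	return 0;
}
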
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/bitops.h> #include <linux/bitops.h>
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/time.h> #include <linux/ktime.h>
#include <linux/hdreg.h> #include <linux/hdreg.h>
#include <linux/dma-mapping.h> #include <linux/dma-mapping.h>
#include <linux/completion.h> #include <linux/completion.h>
...@@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) ...@@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
static unsigned int carm_fill_sync_time(struct carm_host *host, static unsigned int carm_fill_sync_time(struct carm_host *host,
unsigned int idx, void *mem) unsigned int idx, void *mem)
{ {
struct timeval tv;
struct carm_msg_sync_time *st = mem; struct carm_msg_sync_time *st = mem;
do_gettimeofday(&tv); time64_t tv = ktime_get_real_seconds();
memset(st, 0, sizeof(*st)); memset(st, 0, sizeof(*st));
st->type = CARM_MSG_MISC; st->type = CARM_MSG_MISC;
st->subtype = MISC_SET_TIME; st->subtype = MISC_SET_TIME;
st->handle = cpu_to_le32(TAG_ENCODE(idx)); st->handle = cpu_to_le32(TAG_ENCODE(idx));
st->timestamp = cpu_to_le32(tv.tv_sec); st->timestamp = cpu_to_le32(tv);
return sizeof(struct carm_msg_sync_time); return sizeof(struct carm_msg_sync_time);
} }
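
The sx8 change drops struct timeval in favour of ktime_get_real_seconds(), whose time64_t return value does not overflow in 2038; the driver still truncates to the firmware's 32-bit timestamp field when building the message, as the cpu_to_le32() above shows. The small stand-alone program below (illustration only) demonstrates the overflow a signed 32-bit seconds counter hits on 2038-01-19.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t seconds = INT32_MAX;	/* 2038-01-19T03:14:07Z in Unix time */

	printf("last 32-bit second:          %" PRId64 "\n", seconds);
	/* conversion below is implementation-defined; wraps negative on common ABIs */
	printf("one second later as int32_t: %" PRId32 "\n", (int32_t)(seconds + 1));
	printf("one second later as int64_t: %" PRId64 "\n", seconds + 1);
	return 0;
}
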
......
...@@ -83,6 +83,16 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); ...@@ -83,6 +83,16 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
MODULE_PARM_DESC(max_persistent_grants, MODULE_PARM_DESC(max_persistent_grants,
"Maximum number of grants to map persistently"); "Maximum number of grants to map persistently");
/*
 * Maximum number of rings/queues blkback supports; allow as many queues as there
 * are CPUs if the user has not specified a value.
*/
unsigned int xenblk_max_queues;
module_param_named(max_queues, xenblk_max_queues, uint, 0644);
MODULE_PARM_DESC(max_queues,
"Maximum number of hardware queues per virtual disk." \
"By default it is the number of online CPUs.");
/* /*
* Maximum order of pages to be used for the shared ring between front and * Maximum order of pages to be used for the shared ring between front and
* backend, 4KB page granularity is used. * backend, 4KB page granularity is used.
...@@ -113,71 +123,71 @@ module_param(log_stats, int, 0644); ...@@ -113,71 +123,71 @@ module_param(log_stats, int, 0644);
/* Number of free pages to remove on each call to gnttab_free_pages */ /* Number of free pages to remove on each call to gnttab_free_pages */
#define NUM_BATCH_FREE_PAGES 10 #define NUM_BATCH_FREE_PAGES 10
static inline int get_free_page(struct xen_blkif *blkif, struct page **page) static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
{ {
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&blkif->free_pages_lock, flags); spin_lock_irqsave(&ring->free_pages_lock, flags);
if (list_empty(&blkif->free_pages)) { if (list_empty(&ring->free_pages)) {
BUG_ON(blkif->free_pages_num != 0); BUG_ON(ring->free_pages_num != 0);
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
return gnttab_alloc_pages(1, page); return gnttab_alloc_pages(1, page);
} }
BUG_ON(blkif->free_pages_num == 0); BUG_ON(ring->free_pages_num == 0);
page[0] = list_first_entry(&blkif->free_pages, struct page, lru); page[0] = list_first_entry(&ring->free_pages, struct page, lru);
list_del(&page[0]->lru); list_del(&page[0]->lru);
blkif->free_pages_num--; ring->free_pages_num--;
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
return 0; return 0;
} }
static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
int num) int num)
{ {
unsigned long flags; unsigned long flags;
int i; int i;
spin_lock_irqsave(&blkif->free_pages_lock, flags); spin_lock_irqsave(&ring->free_pages_lock, flags);
for (i = 0; i < num; i++) for (i = 0; i < num; i++)
list_add(&page[i]->lru, &blkif->free_pages); list_add(&page[i]->lru, &ring->free_pages);
blkif->free_pages_num += num; ring->free_pages_num += num;
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
} }
static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
{ {
/* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
struct page *page[NUM_BATCH_FREE_PAGES]; struct page *page[NUM_BATCH_FREE_PAGES];
unsigned int num_pages = 0; unsigned int num_pages = 0;
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&blkif->free_pages_lock, flags); spin_lock_irqsave(&ring->free_pages_lock, flags);
while (blkif->free_pages_num > num) { while (ring->free_pages_num > num) {
BUG_ON(list_empty(&blkif->free_pages)); BUG_ON(list_empty(&ring->free_pages));
page[num_pages] = list_first_entry(&blkif->free_pages, page[num_pages] = list_first_entry(&ring->free_pages,
struct page, lru); struct page, lru);
list_del(&page[num_pages]->lru); list_del(&page[num_pages]->lru);
blkif->free_pages_num--; ring->free_pages_num--;
if (++num_pages == NUM_BATCH_FREE_PAGES) { if (++num_pages == NUM_BATCH_FREE_PAGES) {
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
gnttab_free_pages(num_pages, page); gnttab_free_pages(num_pages, page);
spin_lock_irqsave(&blkif->free_pages_lock, flags); spin_lock_irqsave(&ring->free_pages_lock, flags);
num_pages = 0; num_pages = 0;
} }
} }
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
if (num_pages != 0) if (num_pages != 0)
gnttab_free_pages(num_pages, page); gnttab_free_pages(num_pages, page);
} }
#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
static int do_block_io_op(struct xen_blkif *blkif); static int do_block_io_op(struct xen_blkif_ring *ring);
static int dispatch_rw_block_io(struct xen_blkif *blkif, static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
struct blkif_request *req, struct blkif_request *req,
struct pending_req *pending_req); struct pending_req *pending_req);
static void make_response(struct xen_blkif *blkif, u64 id, static void make_response(struct xen_blkif_ring *ring, u64 id,
unsigned short op, int st); unsigned short op, int st);
#define foreach_grant_safe(pos, n, rbtree, node) \ #define foreach_grant_safe(pos, n, rbtree, node) \
...@@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id, ...@@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id,
/* /*
* We don't need locking around the persistent grant helpers * We don't need locking around the persistent grant helpers
* because blkback uses a single-thread for each backed, so we * because blkback uses a single-thread for each backend, so we
 * can be sure that these functions will never be called recursively. * can be sure that these functions will never be called recursively.
* *
* The only exception to that is put_persistent_grant, that can be called * The only exception to that is put_persistent_grant, that can be called
...@@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id, ...@@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
* bit operations to modify the flags of a persistent grant and to count * bit operations to modify the flags of a persistent grant and to count
* the number of used grants. * the number of used grants.
*/ */
static int add_persistent_gnt(struct xen_blkif *blkif, static int add_persistent_gnt(struct xen_blkif_ring *ring,
struct persistent_gnt *persistent_gnt) struct persistent_gnt *persistent_gnt)
{ {
struct rb_node **new = NULL, *parent = NULL; struct rb_node **new = NULL, *parent = NULL;
struct persistent_gnt *this; struct persistent_gnt *this;
struct xen_blkif *blkif = ring->blkif;
if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
if (!blkif->vbd.overflow_max_grants) if (!blkif->vbd.overflow_max_grants)
blkif->vbd.overflow_max_grants = 1; blkif->vbd.overflow_max_grants = 1;
return -EBUSY; return -EBUSY;
} }
/* Figure out where to put new node */ /* Figure out where to put new node */
new = &blkif->persistent_gnts.rb_node; new = &ring->persistent_gnts.rb_node;
while (*new) { while (*new) {
this = container_of(*new, struct persistent_gnt, node); this = container_of(*new, struct persistent_gnt, node);
...@@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif, ...@@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif,
set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
/* Add new node and rebalance tree. */ /* Add new node and rebalance tree. */
rb_link_node(&(persistent_gnt->node), parent, new); rb_link_node(&(persistent_gnt->node), parent, new);
rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
blkif->persistent_gnt_c++; ring->persistent_gnt_c++;
atomic_inc(&blkif->persistent_gnt_in_use); atomic_inc(&ring->persistent_gnt_in_use);
return 0; return 0;
} }
static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
grant_ref_t gref) grant_ref_t gref)
{ {
struct persistent_gnt *data; struct persistent_gnt *data;
struct rb_node *node = NULL; struct rb_node *node = NULL;
node = blkif->persistent_gnts.rb_node; node = ring->persistent_gnts.rb_node;
while (node) { while (node) {
data = container_of(node, struct persistent_gnt, node); data = container_of(node, struct persistent_gnt, node);
...@@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, ...@@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
return NULL; return NULL;
} }
set_bit(PERSISTENT_GNT_ACTIVE, data->flags); set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
atomic_inc(&blkif->persistent_gnt_in_use); atomic_inc(&ring->persistent_gnt_in_use);
return data; return data;
} }
} }
return NULL; return NULL;
} }
static void put_persistent_gnt(struct xen_blkif *blkif, static void put_persistent_gnt(struct xen_blkif_ring *ring,
struct persistent_gnt *persistent_gnt) struct persistent_gnt *persistent_gnt)
{ {
if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
pr_alert_ratelimited("freeing a grant already unused\n"); pr_alert_ratelimited("freeing a grant already unused\n");
set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
atomic_dec(&blkif->persistent_gnt_in_use); atomic_dec(&ring->persistent_gnt_in_use);
} }
static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
unsigned int num) unsigned int num)
{ {
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
...@@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, ...@@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
unmap_data.count = segs_to_unmap; unmap_data.count = segs_to_unmap;
BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
put_free_pages(blkif, pages, segs_to_unmap); put_free_pages(ring, pages, segs_to_unmap);
segs_to_unmap = 0; segs_to_unmap = 0;
} }
...@@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) ...@@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct persistent_gnt *persistent_gnt; struct persistent_gnt *persistent_gnt;
int segs_to_unmap = 0; int segs_to_unmap = 0;
struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
struct gntab_unmap_queue_data unmap_data; struct gntab_unmap_queue_data unmap_data;
unmap_data.pages = pages; unmap_data.pages = pages;
unmap_data.unmap_ops = unmap; unmap_data.unmap_ops = unmap;
unmap_data.kunmap_ops = NULL; unmap_data.kunmap_ops = NULL;
while(!list_empty(&blkif->persistent_purge_list)) { while(!list_empty(&ring->persistent_purge_list)) {
persistent_gnt = list_first_entry(&blkif->persistent_purge_list, persistent_gnt = list_first_entry(&ring->persistent_purge_list,
struct persistent_gnt, struct persistent_gnt,
remove_node); remove_node);
list_del(&persistent_gnt->remove_node); list_del(&persistent_gnt->remove_node);
...@@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) ...@@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
unmap_data.count = segs_to_unmap; unmap_data.count = segs_to_unmap;
BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
put_free_pages(blkif, pages, segs_to_unmap); put_free_pages(ring, pages, segs_to_unmap);
segs_to_unmap = 0; segs_to_unmap = 0;
} }
kfree(persistent_gnt); kfree(persistent_gnt);
...@@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) ...@@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
if (segs_to_unmap > 0) { if (segs_to_unmap > 0) {
unmap_data.count = segs_to_unmap; unmap_data.count = segs_to_unmap;
BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
put_free_pages(blkif, pages, segs_to_unmap); put_free_pages(ring, pages, segs_to_unmap);
} }
} }
static void purge_persistent_gnt(struct xen_blkif *blkif) static void purge_persistent_gnt(struct xen_blkif_ring *ring)
{ {
struct persistent_gnt *persistent_gnt; struct persistent_gnt *persistent_gnt;
struct rb_node *n; struct rb_node *n;
...@@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) ...@@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
bool scan_used = false, clean_used = false; bool scan_used = false, clean_used = false;
struct rb_root *root; struct rb_root *root;
if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
(blkif->persistent_gnt_c == xen_blkif_max_pgrants && (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
!blkif->vbd.overflow_max_grants)) { !ring->blkif->vbd.overflow_max_grants)) {
return; goto out;
} }
if (work_busy(&blkif->persistent_purge_work)) { if (work_busy(&ring->persistent_purge_work)) {
pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
return; goto out;
} }
num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
num_clean = min(blkif->persistent_gnt_c, num_clean); num_clean = min(ring->persistent_gnt_c, num_clean);
if ((num_clean == 0) || if ((num_clean == 0) ||
(num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
return; goto out;
/* /*
* At this point, we can assure that there will be no calls * At this point, we can assure that there will be no calls
...@@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) ...@@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
pr_debug("Going to purge %u persistent grants\n", num_clean); pr_debug("Going to purge %u persistent grants\n", num_clean);
BUG_ON(!list_empty(&blkif->persistent_purge_list)); BUG_ON(!list_empty(&ring->persistent_purge_list));
root = &blkif->persistent_gnts; root = &ring->persistent_gnts;
purge_list: purge_list:
foreach_grant_safe(persistent_gnt, n, root, node) { foreach_grant_safe(persistent_gnt, n, root, node) {
BUG_ON(persistent_gnt->handle == BUG_ON(persistent_gnt->handle ==
...@@ -414,7 +425,7 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) ...@@ -414,7 +425,7 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
rb_erase(&persistent_gnt->node, root); rb_erase(&persistent_gnt->node, root);
list_add(&persistent_gnt->remove_node, list_add(&persistent_gnt->remove_node,
&blkif->persistent_purge_list); &ring->persistent_purge_list);
if (--num_clean == 0) if (--num_clean == 0)
goto finished; goto finished;
} }
...@@ -435,30 +446,32 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) ...@@ -435,30 +446,32 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
goto purge_list; goto purge_list;
} }
blkif->persistent_gnt_c -= (total - num_clean); ring->persistent_gnt_c -= (total - num_clean);
blkif->vbd.overflow_max_grants = 0; ring->blkif->vbd.overflow_max_grants = 0;
/* We can defer this work */ /* We can defer this work */
schedule_work(&blkif->persistent_purge_work); schedule_work(&ring->persistent_purge_work);
pr_debug("Purged %u/%u\n", (total - num_clean), total); pr_debug("Purged %u/%u\n", (total - num_clean), total);
out:
return; return;
} }
/* /*
* Retrieve from the 'pending_reqs' a free pending_req structure to be used. * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
*/ */
static struct pending_req *alloc_req(struct xen_blkif *blkif) static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
{ {
struct pending_req *req = NULL; struct pending_req *req = NULL;
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&blkif->pending_free_lock, flags); spin_lock_irqsave(&ring->pending_free_lock, flags);
if (!list_empty(&blkif->pending_free)) { if (!list_empty(&ring->pending_free)) {
req = list_entry(blkif->pending_free.next, struct pending_req, req = list_entry(ring->pending_free.next, struct pending_req,
free_list); free_list);
list_del(&req->free_list); list_del(&req->free_list);
} }
spin_unlock_irqrestore(&blkif->pending_free_lock, flags); spin_unlock_irqrestore(&ring->pending_free_lock, flags);
return req; return req;
} }
...@@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif) ...@@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif)
* Return the 'pending_req' structure back to the freepool. We also * Return the 'pending_req' structure back to the freepool. We also
* wake up the thread if it was waiting for a free page. * wake up the thread if it was waiting for a free page.
*/ */
static void free_req(struct xen_blkif *blkif, struct pending_req *req) static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
{ {
unsigned long flags; unsigned long flags;
int was_empty; int was_empty;
spin_lock_irqsave(&blkif->pending_free_lock, flags); spin_lock_irqsave(&ring->pending_free_lock, flags);
was_empty = list_empty(&blkif->pending_free); was_empty = list_empty(&ring->pending_free);
list_add(&req->free_list, &blkif->pending_free); list_add(&req->free_list, &ring->pending_free);
spin_unlock_irqrestore(&blkif->pending_free_lock, flags); spin_unlock_irqrestore(&ring->pending_free_lock, flags);
if (was_empty) if (was_empty)
wake_up(&blkif->pending_free_wq); wake_up(&ring->pending_free_wq);
} }
/* /*
...@@ -556,10 +569,10 @@ static void xen_vbd_resize(struct xen_blkif *blkif) ...@@ -556,10 +569,10 @@ static void xen_vbd_resize(struct xen_blkif *blkif)
/* /*
* Notification from the guest OS. * Notification from the guest OS.
*/ */
static void blkif_notify_work(struct xen_blkif *blkif) static void blkif_notify_work(struct xen_blkif_ring *ring)
{ {
blkif->waiting_reqs = 1; ring->waiting_reqs = 1;
wake_up(&blkif->wq); wake_up(&ring->wq);
} }
irqreturn_t xen_blkif_be_int(int irq, void *dev_id) irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
...@@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) ...@@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
* SCHEDULER FUNCTIONS * SCHEDULER FUNCTIONS
*/ */
static void print_stats(struct xen_blkif *blkif) static void print_stats(struct xen_blkif_ring *ring)
{ {
pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
" | ds %4llu | pg: %4u/%4d\n", " | ds %4llu | pg: %4u/%4d\n",
current->comm, blkif->st_oo_req, current->comm, ring->st_oo_req,
blkif->st_rd_req, blkif->st_wr_req, ring->st_rd_req, ring->st_wr_req,
blkif->st_f_req, blkif->st_ds_req, ring->st_f_req, ring->st_ds_req,
blkif->persistent_gnt_c, ring->persistent_gnt_c,
xen_blkif_max_pgrants); xen_blkif_max_pgrants);
blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
blkif->st_rd_req = 0; ring->st_rd_req = 0;
blkif->st_wr_req = 0; ring->st_wr_req = 0;
blkif->st_oo_req = 0; ring->st_oo_req = 0;
blkif->st_ds_req = 0; ring->st_ds_req = 0;
} }
int xen_blkif_schedule(void *arg) int xen_blkif_schedule(void *arg)
{ {
struct xen_blkif *blkif = arg; struct xen_blkif_ring *ring = arg;
struct xen_blkif *blkif = ring->blkif;
struct xen_vbd *vbd = &blkif->vbd; struct xen_vbd *vbd = &blkif->vbd;
unsigned long timeout; unsigned long timeout;
int ret; int ret;
xen_blkif_get(blkif); xen_blkif_get(blkif);
set_freezable();
while (!kthread_should_stop()) { while (!kthread_should_stop()) {
if (try_to_freeze()) if (try_to_freeze())
continue; continue;
...@@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg) ...@@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg)
timeout = msecs_to_jiffies(LRU_INTERVAL); timeout = msecs_to_jiffies(LRU_INTERVAL);
timeout = wait_event_interruptible_timeout( timeout = wait_event_interruptible_timeout(
blkif->wq, ring->wq,
blkif->waiting_reqs || kthread_should_stop(), ring->waiting_reqs || kthread_should_stop(),
timeout); timeout);
if (timeout == 0) if (timeout == 0)
goto purge_gnt_list; goto purge_gnt_list;
timeout = wait_event_interruptible_timeout( timeout = wait_event_interruptible_timeout(
blkif->pending_free_wq, ring->pending_free_wq,
!list_empty(&blkif->pending_free) || !list_empty(&ring->pending_free) ||
kthread_should_stop(), kthread_should_stop(),
timeout); timeout);
if (timeout == 0) if (timeout == 0)
goto purge_gnt_list; goto purge_gnt_list;
blkif->waiting_reqs = 0; ring->waiting_reqs = 0;
smp_mb(); /* clear flag *before* checking for work */ smp_mb(); /* clear flag *before* checking for work */
ret = do_block_io_op(blkif); ret = do_block_io_op(ring);
if (ret > 0) if (ret > 0)
blkif->waiting_reqs = 1; ring->waiting_reqs = 1;
if (ret == -EACCES) if (ret == -EACCES)
wait_event_interruptible(blkif->shutdown_wq, wait_event_interruptible(ring->shutdown_wq,
kthread_should_stop()); kthread_should_stop());
purge_gnt_list: purge_gnt_list:
if (blkif->vbd.feature_gnt_persistent && if (blkif->vbd.feature_gnt_persistent &&
time_after(jiffies, blkif->next_lru)) { time_after(jiffies, ring->next_lru)) {
purge_persistent_gnt(blkif); purge_persistent_gnt(ring);
blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
} }
/* Shrink if we have more than xen_blkif_max_buffer_pages */ /* Shrink if we have more than xen_blkif_max_buffer_pages */
shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
if (log_stats && time_after(jiffies, blkif->st_print)) if (log_stats && time_after(jiffies, ring->st_print))
print_stats(blkif); print_stats(ring);
} }
/* Drain pending purge work */ /* Drain pending purge work */
flush_work(&blkif->persistent_purge_work); flush_work(&ring->persistent_purge_work);
if (log_stats) if (log_stats)
print_stats(blkif); print_stats(ring);
blkif->xenblkd = NULL; ring->xenblkd = NULL;
xen_blkif_put(blkif); xen_blkif_put(blkif);
return 0; return 0;
...@@ -658,22 +673,22 @@ int xen_blkif_schedule(void *arg) ...@@ -658,22 +673,22 @@ int xen_blkif_schedule(void *arg)
/* /*
* Remove persistent grants and empty the pool of free pages * Remove persistent grants and empty the pool of free pages
*/ */
void xen_blkbk_free_caches(struct xen_blkif *blkif) void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
{ {
/* Free all persistent grant pages */ /* Free all persistent grant pages */
if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
free_persistent_gnts(blkif, &blkif->persistent_gnts, free_persistent_gnts(ring, &ring->persistent_gnts,
blkif->persistent_gnt_c); ring->persistent_gnt_c);
BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
blkif->persistent_gnt_c = 0; ring->persistent_gnt_c = 0;
/* Since we are shutting down remove all pages from the buffer */ /* Since we are shutting down remove all pages from the buffer */
shrink_free_pagepool(blkif, 0 /* All */); shrink_free_pagepool(ring, 0 /* All */);
} }
static unsigned int xen_blkbk_unmap_prepare( static unsigned int xen_blkbk_unmap_prepare(
struct xen_blkif *blkif, struct xen_blkif_ring *ring,
struct grant_page **pages, struct grant_page **pages,
unsigned int num, unsigned int num,
struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *unmap_ops,
...@@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare( ...@@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare(
for (i = 0; i < num; i++) { for (i = 0; i < num; i++) {
if (pages[i]->persistent_gnt != NULL) { if (pages[i]->persistent_gnt != NULL) {
put_persistent_gnt(blkif, pages[i]->persistent_gnt); put_persistent_gnt(ring, pages[i]->persistent_gnt);
continue; continue;
} }
if (pages[i]->handle == BLKBACK_INVALID_HANDLE) if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
...@@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare( ...@@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare(
static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
{ {
struct pending_req* pending_req = (struct pending_req*) (data->data); struct pending_req *pending_req = (struct pending_req *)(data->data);
struct xen_blkif *blkif = pending_req->blkif; struct xen_blkif_ring *ring = pending_req->ring;
struct xen_blkif *blkif = ring->blkif;
/* BUG_ON used to reproduce existing behaviour, /* BUG_ON used to reproduce existing behaviour,
but is this the best way to deal with this? */ but is this the best way to deal with this? */
BUG_ON(result); BUG_ON(result);
put_free_pages(blkif, data->pages, data->count); put_free_pages(ring, data->pages, data->count);
make_response(blkif, pending_req->id, make_response(ring, pending_req->id,
pending_req->operation, pending_req->status); pending_req->operation, pending_req->status);
free_req(blkif, pending_req); free_req(ring, pending_req);
/* /*
* Make sure the request is freed before releasing blkif, * Make sure the request is freed before releasing blkif,
* or there could be a race between free_req and the * or there could be a race between free_req and the
...@@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ ...@@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
* pending_free_wq if there's a drain going on, but it has * pending_free_wq if there's a drain going on, but it has
* to be taken into account if the current model is changed. * to be taken into account if the current model is changed.
*/ */
if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
complete(&blkif->drain_complete); complete(&blkif->drain_complete);
} }
xen_blkif_put(blkif); xen_blkif_put(blkif);
...@@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ ...@@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
static void xen_blkbk_unmap_and_respond(struct pending_req *req) static void xen_blkbk_unmap_and_respond(struct pending_req *req)
{ {
struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
struct xen_blkif *blkif = req->blkif; struct xen_blkif_ring *ring = req->ring;
struct grant_page **pages = req->segments; struct grant_page **pages = req->segments;
unsigned int invcount; unsigned int invcount;
invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
req->unmap, req->unmap_pages); req->unmap, req->unmap_pages);
work->data = req; work->data = req;
...@@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req) ...@@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req)
* of hypercalls, but since this is only used in error paths there's * of hypercalls, but since this is only used in error paths there's
* no real need. * no real need.
*/ */
static void xen_blkbk_unmap(struct xen_blkif *blkif, static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
struct grant_page *pages[], struct grant_page *pages[],
int num) int num)
{ {
...@@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif, ...@@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
while (num) { while (num) {
unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
unmap, unmap_pages); unmap, unmap_pages);
if (invcount) { if (invcount) {
ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
BUG_ON(ret); BUG_ON(ret);
put_free_pages(blkif, unmap_pages, invcount); put_free_pages(ring, unmap_pages, invcount);
} }
pages += batch; pages += batch;
num -= batch; num -= batch;
} }
} }
static int xen_blkbk_map(struct xen_blkif *blkif, static int xen_blkbk_map(struct xen_blkif_ring *ring,
struct grant_page *pages[], struct grant_page *pages[],
int num, bool ro) int num, bool ro)
{ {
...@@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, ...@@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
int ret = 0; int ret = 0;
int last_map = 0, map_until = 0; int last_map = 0, map_until = 0;
int use_persistent_gnts; int use_persistent_gnts;
struct xen_blkif *blkif = ring->blkif;
use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
...@@ -806,10 +823,11 @@ static int xen_blkbk_map(struct xen_blkif *blkif, ...@@ -806,10 +823,11 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
for (i = map_until; i < num; i++) { for (i = map_until; i < num; i++) {
uint32_t flags; uint32_t flags;
if (use_persistent_gnts) if (use_persistent_gnts) {
persistent_gnt = get_persistent_gnt( persistent_gnt = get_persistent_gnt(
blkif, ring,
pages[i]->gref); pages[i]->gref);
}
if (persistent_gnt) { if (persistent_gnt) {
/* /*
...@@ -819,7 +837,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, ...@@ -819,7 +837,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
pages[i]->page = persistent_gnt->page; pages[i]->page = persistent_gnt->page;
pages[i]->persistent_gnt = persistent_gnt; pages[i]->persistent_gnt = persistent_gnt;
} else { } else {
if (get_free_page(blkif, &pages[i]->page)) if (get_free_page(ring, &pages[i]->page))
goto out_of_memory; goto out_of_memory;
addr = vaddr(pages[i]->page); addr = vaddr(pages[i]->page);
pages_to_gnt[segs_to_map] = pages[i]->page; pages_to_gnt[segs_to_map] = pages[i]->page;
...@@ -852,7 +870,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, ...@@ -852,7 +870,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
BUG_ON(new_map_idx >= segs_to_map); BUG_ON(new_map_idx >= segs_to_map);
if (unlikely(map[new_map_idx].status != 0)) { if (unlikely(map[new_map_idx].status != 0)) {
pr_debug("invalid buffer -- could not remap it\n"); pr_debug("invalid buffer -- could not remap it\n");
put_free_pages(blkif, &pages[seg_idx]->page, 1); put_free_pages(ring, &pages[seg_idx]->page, 1);
pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
ret |= 1; ret |= 1;
goto next; goto next;
...@@ -862,7 +880,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, ...@@ -862,7 +880,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
continue; continue;
} }
if (use_persistent_gnts && if (use_persistent_gnts &&
blkif->persistent_gnt_c < xen_blkif_max_pgrants) { ring->persistent_gnt_c < xen_blkif_max_pgrants) {
/* /*
* We are using persistent grants, the grant is * We are using persistent grants, the grant is
* not mapped but we might have room for it. * not mapped but we might have room for it.
...@@ -880,7 +898,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, ...@@ -880,7 +898,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->gnt = map[new_map_idx].ref;
persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->handle = map[new_map_idx].handle;
persistent_gnt->page = pages[seg_idx]->page; persistent_gnt->page = pages[seg_idx]->page;
if (add_persistent_gnt(blkif, if (add_persistent_gnt(ring,
persistent_gnt)) { persistent_gnt)) {
kfree(persistent_gnt); kfree(persistent_gnt);
persistent_gnt = NULL; persistent_gnt = NULL;
...@@ -888,7 +906,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, ...@@ -888,7 +906,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
} }
pages[seg_idx]->persistent_gnt = persistent_gnt; pages[seg_idx]->persistent_gnt = persistent_gnt;
pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
persistent_gnt->gnt, blkif->persistent_gnt_c, persistent_gnt->gnt, ring->persistent_gnt_c,
xen_blkif_max_pgrants); xen_blkif_max_pgrants);
goto next; goto next;
} }
...@@ -913,7 +931,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, ...@@ -913,7 +931,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
out_of_memory: out_of_memory:
pr_alert("%s: out of memory\n", __func__); pr_alert("%s: out of memory\n", __func__);
put_free_pages(blkif, pages_to_gnt, segs_to_map); put_free_pages(ring, pages_to_gnt, segs_to_map);
return -ENOMEM; return -ENOMEM;
} }
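The mapping loop above asks the ring's persistent-grant tree first and only maps a grant when that lookup misses; if persistent grants were negotiated and the ring is still below xen_blkif_max_pgrants, the fresh mapping is then added to the tree for reuse. A condensed sketch of that decision; cache_lookup(), cache_insert() and map_grant() are hypothetical stand-ins, not the driver's helpers:

#include <stdbool.h>
#include <stddef.h>

struct ring_cache {
	unsigned int cached;	/* mirrors persistent_gnt_c */
	unsigned int limit;	/* mirrors xen_blkif_max_pgrants */
	bool use_persistent;	/* negotiated feature-persistent */
};

/* hypothetical stand-ins for get_persistent_gnt()/add_persistent_gnt() */
static void *cache_lookup(struct ring_cache *c, unsigned int gref)
{
	(void)c; (void)gref;
	return NULL;
}

static int cache_insert(struct ring_cache *c, unsigned int gref, void *m)
{
	(void)c; (void)gref; (void)m;
	return 0;
}

static void *map_grant(unsigned int gref)	/* stand-in for the real mapping */
{
	(void)gref;
	return NULL;
}

static void *map_one(struct ring_cache *c, unsigned int gref)
{
	void *m = c->use_persistent ? cache_lookup(c, gref) : NULL;

	if (m)
		return m;			/* cache hit: no new mapping */

	m = map_grant(gref);			/* miss: map it now */
	if (m && c->use_persistent && c->cached < c->limit &&
	    cache_insert(c, gref, m) == 0)
		c->cached++;			/* keep it for future requests */
	return m;
}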
...@@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req) ...@@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req)
{ {
int rc; int rc;
rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
pending_req->nr_segs, pending_req->nr_segs,
(pending_req->operation != BLKIF_OP_READ)); (pending_req->operation != BLKIF_OP_READ));
...@@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, ...@@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
struct phys_req *preq) struct phys_req *preq)
{ {
struct grant_page **pages = pending_req->indirect_pages; struct grant_page **pages = pending_req->indirect_pages;
struct xen_blkif *blkif = pending_req->blkif; struct xen_blkif_ring *ring = pending_req->ring;
int indirect_grefs, rc, n, nseg, i; int indirect_grefs, rc, n, nseg, i;
struct blkif_request_segment *segments = NULL; struct blkif_request_segment *segments = NULL;
...@@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, ...@@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
for (i = 0; i < indirect_grefs; i++) for (i = 0; i < indirect_grefs; i++)
pages[i]->gref = req->u.indirect.indirect_grefs[i]; pages[i]->gref = req->u.indirect.indirect_grefs[i];
rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
if (rc) if (rc)
goto unmap; goto unmap;
...@@ -977,15 +995,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, ...@@ -977,15 +995,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
unmap: unmap:
if (segments) if (segments)
kunmap_atomic(segments); kunmap_atomic(segments);
xen_blkbk_unmap(blkif, pages, indirect_grefs); xen_blkbk_unmap(ring, pages, indirect_grefs);
return rc; return rc;
} }
static int dispatch_discard_io(struct xen_blkif *blkif, static int dispatch_discard_io(struct xen_blkif_ring *ring,
struct blkif_request *req) struct blkif_request *req)
{ {
int err = 0; int err = 0;
int status = BLKIF_RSP_OKAY; int status = BLKIF_RSP_OKAY;
struct xen_blkif *blkif = ring->blkif;
struct block_device *bdev = blkif->vbd.bdev; struct block_device *bdev = blkif->vbd.bdev;
unsigned long secure; unsigned long secure;
struct phys_req preq; struct phys_req preq;
...@@ -1002,7 +1021,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif, ...@@ -1002,7 +1021,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
goto fail_response; goto fail_response;
} }
blkif->st_ds_req++; ring->st_ds_req++;
secure = (blkif->vbd.discard_secure && secure = (blkif->vbd.discard_secure &&
(req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
...@@ -1018,26 +1037,28 @@ static int dispatch_discard_io(struct xen_blkif *blkif, ...@@ -1018,26 +1037,28 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
} else if (err) } else if (err)
status = BLKIF_RSP_ERROR; status = BLKIF_RSP_ERROR;
make_response(blkif, req->u.discard.id, req->operation, status); make_response(ring, req->u.discard.id, req->operation, status);
xen_blkif_put(blkif); xen_blkif_put(blkif);
return err; return err;
} }
static int dispatch_other_io(struct xen_blkif *blkif, static int dispatch_other_io(struct xen_blkif_ring *ring,
struct blkif_request *req, struct blkif_request *req,
struct pending_req *pending_req) struct pending_req *pending_req)
{ {
free_req(blkif, pending_req); free_req(ring, pending_req);
make_response(blkif, req->u.other.id, req->operation, make_response(ring, req->u.other.id, req->operation,
BLKIF_RSP_EOPNOTSUPP); BLKIF_RSP_EOPNOTSUPP);
return -EIO; return -EIO;
} }
static void xen_blk_drain_io(struct xen_blkif *blkif) static void xen_blk_drain_io(struct xen_blkif_ring *ring)
{ {
struct xen_blkif *blkif = ring->blkif;
atomic_set(&blkif->drain, 1); atomic_set(&blkif->drain, 1);
do { do {
if (atomic_read(&blkif->inflight) == 0) if (atomic_read(&ring->inflight) == 0)
break; break;
wait_for_completion_interruptible_timeout( wait_for_completion_interruptible_timeout(
&blkif->drain_complete, HZ); &blkif->drain_complete, HZ);
...@@ -1058,12 +1079,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) ...@@ -1058,12 +1079,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
(error == -EOPNOTSUPP)) { (error == -EOPNOTSUPP)) {
pr_debug("flush diskcache op failed, not supported\n"); pr_debug("flush diskcache op failed, not supported\n");
xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
pending_req->status = BLKIF_RSP_EOPNOTSUPP; pending_req->status = BLKIF_RSP_EOPNOTSUPP;
} else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
(error == -EOPNOTSUPP)) { (error == -EOPNOTSUPP)) {
pr_debug("write barrier op failed, not supported\n"); pr_debug("write barrier op failed, not supported\n");
xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
pending_req->status = BLKIF_RSP_EOPNOTSUPP; pending_req->status = BLKIF_RSP_EOPNOTSUPP;
} else if (error) { } else if (error) {
pr_debug("Buffer not up-to-date at end of operation," pr_debug("Buffer not up-to-date at end of operation,"
...@@ -1097,9 +1118,9 @@ static void end_block_io_op(struct bio *bio) ...@@ -1097,9 +1118,9 @@ static void end_block_io_op(struct bio *bio)
* and transmute it to the block API to hand it over to the proper block disk. * and transmute it to the block API to hand it over to the proper block disk.
*/ */
static int static int
__do_block_io_op(struct xen_blkif *blkif) __do_block_io_op(struct xen_blkif_ring *ring)
{ {
union blkif_back_rings *blk_rings = &blkif->blk_rings; union blkif_back_rings *blk_rings = &ring->blk_rings;
struct blkif_request req; struct blkif_request req;
struct pending_req *pending_req; struct pending_req *pending_req;
RING_IDX rc, rp; RING_IDX rc, rp;
...@@ -1112,7 +1133,7 @@ __do_block_io_op(struct xen_blkif *blkif) ...@@ -1112,7 +1133,7 @@ __do_block_io_op(struct xen_blkif *blkif)
if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
rc = blk_rings->common.rsp_prod_pvt; rc = blk_rings->common.rsp_prod_pvt;
pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
rp, rc, rp - rc, blkif->vbd.pdevice); rp, rc, rp - rc, ring->blkif->vbd.pdevice);
return -EACCES; return -EACCES;
} }
while (rc != rp) { while (rc != rp) {
...@@ -1125,14 +1146,14 @@ __do_block_io_op(struct xen_blkif *blkif) ...@@ -1125,14 +1146,14 @@ __do_block_io_op(struct xen_blkif *blkif)
break; break;
} }
pending_req = alloc_req(blkif); pending_req = alloc_req(ring);
if (NULL == pending_req) { if (NULL == pending_req) {
blkif->st_oo_req++; ring->st_oo_req++;
more_to_do = 1; more_to_do = 1;
break; break;
} }
switch (blkif->blk_protocol) { switch (ring->blkif->blk_protocol) {
case BLKIF_PROTOCOL_NATIVE: case BLKIF_PROTOCOL_NATIVE:
memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
break; break;
...@@ -1156,16 +1177,16 @@ __do_block_io_op(struct xen_blkif *blkif) ...@@ -1156,16 +1177,16 @@ __do_block_io_op(struct xen_blkif *blkif)
case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_WRITE_BARRIER:
case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_INDIRECT: case BLKIF_OP_INDIRECT:
if (dispatch_rw_block_io(blkif, &req, pending_req)) if (dispatch_rw_block_io(ring, &req, pending_req))
goto done; goto done;
break; break;
case BLKIF_OP_DISCARD: case BLKIF_OP_DISCARD:
free_req(blkif, pending_req); free_req(ring, pending_req);
if (dispatch_discard_io(blkif, &req)) if (dispatch_discard_io(ring, &req))
goto done; goto done;
break; break;
default: default:
if (dispatch_other_io(blkif, &req, pending_req)) if (dispatch_other_io(ring, &req, pending_req))
goto done; goto done;
break; break;
} }
...@@ -1178,13 +1199,13 @@ __do_block_io_op(struct xen_blkif *blkif) ...@@ -1178,13 +1199,13 @@ __do_block_io_op(struct xen_blkif *blkif)
} }
static int static int
do_block_io_op(struct xen_blkif *blkif) do_block_io_op(struct xen_blkif_ring *ring)
{ {
union blkif_back_rings *blk_rings = &blkif->blk_rings; union blkif_back_rings *blk_rings = &ring->blk_rings;
int more_to_do; int more_to_do;
do { do {
more_to_do = __do_block_io_op(blkif); more_to_do = __do_block_io_op(ring);
if (more_to_do) if (more_to_do)
break; break;
...@@ -1197,7 +1218,7 @@ do_block_io_op(struct xen_blkif *blkif) ...@@ -1197,7 +1218,7 @@ do_block_io_op(struct xen_blkif *blkif)
* Transmutation of the 'struct blkif_request' to a proper 'struct bio' * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
* and call the 'submit_bio' to pass it to the underlying storage. * and call the 'submit_bio' to pass it to the underlying storage.
*/ */
static int dispatch_rw_block_io(struct xen_blkif *blkif, static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
struct blkif_request *req, struct blkif_request *req,
struct pending_req *pending_req) struct pending_req *pending_req)
{ {
...@@ -1225,17 +1246,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, ...@@ -1225,17 +1246,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
switch (req_operation) { switch (req_operation) {
case BLKIF_OP_READ: case BLKIF_OP_READ:
blkif->st_rd_req++; ring->st_rd_req++;
operation = READ; operation = READ;
break; break;
case BLKIF_OP_WRITE: case BLKIF_OP_WRITE:
blkif->st_wr_req++; ring->st_wr_req++;
operation = WRITE_ODIRECT; operation = WRITE_ODIRECT;
break; break;
case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_WRITE_BARRIER:
drain = true; drain = true;
case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_FLUSH_DISKCACHE:
blkif->st_f_req++; ring->st_f_req++;
operation = WRITE_FLUSH; operation = WRITE_FLUSH;
break; break;
default: default:
...@@ -1260,7 +1281,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, ...@@ -1260,7 +1281,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
preq.nr_sects = 0; preq.nr_sects = 0;
pending_req->blkif = blkif; pending_req->ring = ring;
pending_req->id = req->u.rw.id; pending_req->id = req->u.rw.id;
pending_req->operation = req_operation; pending_req->operation = req_operation;
pending_req->status = BLKIF_RSP_OKAY; pending_req->status = BLKIF_RSP_OKAY;
...@@ -1287,12 +1308,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, ...@@ -1287,12 +1308,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
goto fail_response; goto fail_response;
} }
if (xen_vbd_translate(&preq, blkif, operation) != 0) { if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
operation == READ ? "read" : "write", operation == READ ? "read" : "write",
preq.sector_number, preq.sector_number,
preq.sector_number + preq.nr_sects, preq.sector_number + preq.nr_sects,
blkif->vbd.pdevice); ring->blkif->vbd.pdevice);
goto fail_response; goto fail_response;
} }
...@@ -1304,7 +1325,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, ...@@ -1304,7 +1325,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
if (((int)preq.sector_number|(int)seg[i].nsec) & if (((int)preq.sector_number|(int)seg[i].nsec) &
((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
pr_debug("Misaligned I/O request from domain %d\n", pr_debug("Misaligned I/O request from domain %d\n",
blkif->domid); ring->blkif->domid);
goto fail_response; goto fail_response;
} }
} }
...@@ -1313,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, ...@@ -1313,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
* issue the WRITE_FLUSH. * issue the WRITE_FLUSH.
*/ */
if (drain) if (drain)
xen_blk_drain_io(pending_req->blkif); xen_blk_drain_io(pending_req->ring);
/* /*
* If we have failed at this point, we need to undo the M2P override, * If we have failed at this point, we need to undo the M2P override,
...@@ -1328,8 +1349,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, ...@@ -1328,8 +1349,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
* This corresponding xen_blkif_put is done in __end_block_io_op, or * This corresponding xen_blkif_put is done in __end_block_io_op, or
* below (in "!bio") if we are handling a BLKIF_OP_DISCARD. * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
*/ */
xen_blkif_get(blkif); xen_blkif_get(ring->blkif);
atomic_inc(&blkif->inflight); atomic_inc(&ring->inflight);
for (i = 0; i < nseg; i++) { for (i = 0; i < nseg; i++) {
while ((bio == NULL) || while ((bio == NULL) ||
...@@ -1377,19 +1398,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, ...@@ -1377,19 +1398,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
blk_finish_plug(&plug); blk_finish_plug(&plug);
if (operation == READ) if (operation == READ)
blkif->st_rd_sect += preq.nr_sects; ring->st_rd_sect += preq.nr_sects;
else if (operation & WRITE) else if (operation & WRITE)
blkif->st_wr_sect += preq.nr_sects; ring->st_wr_sect += preq.nr_sects;
return 0; return 0;
fail_flush: fail_flush:
xen_blkbk_unmap(blkif, pending_req->segments, xen_blkbk_unmap(ring, pending_req->segments,
pending_req->nr_segs); pending_req->nr_segs);
fail_response: fail_response:
/* Haven't submitted any bio's yet. */ /* Haven't submitted any bio's yet. */
make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
free_req(blkif, pending_req); free_req(ring, pending_req);
msleep(1); /* back off a bit */ msleep(1); /* back off a bit */
return -EIO; return -EIO;
...@@ -1407,21 +1428,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, ...@@ -1407,21 +1428,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
/* /*
* Put a response on the ring on how the operation fared. * Put a response on the ring on how the operation fared.
*/ */
static void make_response(struct xen_blkif *blkif, u64 id, static void make_response(struct xen_blkif_ring *ring, u64 id,
unsigned short op, int st) unsigned short op, int st)
{ {
struct blkif_response resp; struct blkif_response resp;
unsigned long flags; unsigned long flags;
union blkif_back_rings *blk_rings = &blkif->blk_rings; union blkif_back_rings *blk_rings;
int notify; int notify;
resp.id = id; resp.id = id;
resp.operation = op; resp.operation = op;
resp.status = st; resp.status = st;
spin_lock_irqsave(&blkif->blk_ring_lock, flags); spin_lock_irqsave(&ring->blk_ring_lock, flags);
blk_rings = &ring->blk_rings;
/* Place on the response ring for the relevant domain. */ /* Place on the response ring for the relevant domain. */
switch (blkif->blk_protocol) { switch (ring->blkif->blk_protocol) {
case BLKIF_PROTOCOL_NATIVE: case BLKIF_PROTOCOL_NATIVE:
memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
&resp, sizeof(resp)); &resp, sizeof(resp));
...@@ -1439,9 +1461,9 @@ static void make_response(struct xen_blkif *blkif, u64 id, ...@@ -1439,9 +1461,9 @@ static void make_response(struct xen_blkif *blkif, u64 id,
} }
blk_rings->common.rsp_prod_pvt++; blk_rings->common.rsp_prod_pvt++;
RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
if (notify) if (notify)
notify_remote_via_irq(blkif->irq); notify_remote_via_irq(ring->irq);
} }
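make_response() now serializes on the ring's own blk_ring_lock and fetches ring->blk_rings only after taking it, so responses for different rings of the same device no longer contend on one lock. A stripped-down userspace sketch of that per-ring producer path, with a pthread mutex standing in for the spinlock and a fixed-size array standing in for the shared ring:

#include <pthread.h>

#define RING_SLOTS 32

struct resp_ring {
	pthread_mutex_t lock;		/* plays the role of blk_ring_lock */
	unsigned int rsp_prod;		/* private response producer index */
	unsigned long long ids[RING_SLOTS];
};

/* Each ring serializes only against itself, so two rings of the same
 * device can complete requests concurrently. */
static void push_response(struct resp_ring *ring, unsigned long long id)
{
	pthread_mutex_lock(&ring->lock);
	ring->ids[ring->rsp_prod % RING_SLOTS] = id;
	ring->rsp_prod++;
	/* the driver then pushes the producer index and, if the frontend
	 * is waiting, notifies it via this ring's own irq */
	pthread_mutex_unlock(&ring->lock);
}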
static int __init xen_blkif_init(void) static int __init xen_blkif_init(void)
...@@ -1457,6 +1479,9 @@ static int __init xen_blkif_init(void) ...@@ -1457,6 +1479,9 @@ static int __init xen_blkif_init(void)
xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
} }
if (xenblk_max_queues == 0)
xenblk_max_queues = num_online_cpus();
rc = xen_blkif_interface_init(); rc = xen_blkif_interface_init();
if (rc) if (rc)
goto failed_init; goto failed_init;
......
...@@ -46,6 +46,7 @@ ...@@ -46,6 +46,7 @@
#include <xen/interface/io/protocols.h> #include <xen/interface/io/protocols.h>
extern unsigned int xen_blkif_max_ring_order; extern unsigned int xen_blkif_max_ring_order;
extern unsigned int xenblk_max_queues;
/* /*
* This is the maximum number of segments that would be allowed in indirect * This is the maximum number of segments that would be allowed in indirect
* requests. This value will also be passed to the frontend. * requests. This value will also be passed to the frontend.
...@@ -269,68 +270,79 @@ struct persistent_gnt { ...@@ -269,68 +270,79 @@ struct persistent_gnt {
struct list_head remove_node; struct list_head remove_node;
}; };
struct xen_blkif { /* Per-ring information. */
/* Unique identifier for this interface. */ struct xen_blkif_ring {
domid_t domid;
unsigned int handle;
/* Physical parameters of the comms window. */ /* Physical parameters of the comms window. */
unsigned int irq; unsigned int irq;
/* Comms information. */
enum blkif_protocol blk_protocol;
union blkif_back_rings blk_rings; union blkif_back_rings blk_rings;
void *blk_ring; void *blk_ring;
/* The VBD attached to this interface. */
struct xen_vbd vbd;
/* Back pointer to the backend_info. */
struct backend_info *be;
/* Private fields. */ /* Private fields. */
spinlock_t blk_ring_lock; spinlock_t blk_ring_lock;
atomic_t refcnt;
wait_queue_head_t wq; wait_queue_head_t wq;
/* for barrier (drain) requests */
struct completion drain_complete;
atomic_t drain;
atomic_t inflight; atomic_t inflight;
/* One thread per one blkif. */ /* One thread per blkif ring. */
struct task_struct *xenblkd; struct task_struct *xenblkd;
unsigned int waiting_reqs; unsigned int waiting_reqs;
/* tree to store persistent grants */ /* List of all 'pending_req' available */
struct list_head pending_free;
/* And its spinlock. */
spinlock_t pending_free_lock;
wait_queue_head_t pending_free_wq;
/* Tree to store persistent grants. */
spinlock_t pers_gnts_lock;
struct rb_root persistent_gnts; struct rb_root persistent_gnts;
unsigned int persistent_gnt_c; unsigned int persistent_gnt_c;
atomic_t persistent_gnt_in_use; atomic_t persistent_gnt_in_use;
unsigned long next_lru; unsigned long next_lru;
/* used by the kworker that offload work from the persistent purge */ /* Statistics. */
unsigned long st_print;
unsigned long long st_rd_req;
unsigned long long st_wr_req;
unsigned long long st_oo_req;
unsigned long long st_f_req;
unsigned long long st_ds_req;
unsigned long long st_rd_sect;
unsigned long long st_wr_sect;
/* Used by the kworker that offload work from the persistent purge. */
struct list_head persistent_purge_list; struct list_head persistent_purge_list;
struct work_struct persistent_purge_work; struct work_struct persistent_purge_work;
/* buffer of free pages to map grant refs */ /* Buffer of free pages to map grant refs. */
spinlock_t free_pages_lock; spinlock_t free_pages_lock;
int free_pages_num; int free_pages_num;
struct list_head free_pages; struct list_head free_pages;
/* List of all 'pending_req' available */
struct list_head pending_free;
/* And its spinlock. */
spinlock_t pending_free_lock;
wait_queue_head_t pending_free_wq;
/* statistics */
unsigned long st_print;
unsigned long long st_rd_req;
unsigned long long st_wr_req;
unsigned long long st_oo_req;
unsigned long long st_f_req;
unsigned long long st_ds_req;
unsigned long long st_rd_sect;
unsigned long long st_wr_sect;
struct work_struct free_work; struct work_struct free_work;
/* Thread shutdown wait queue. */ /* Thread shutdown wait queue. */
wait_queue_head_t shutdown_wq; wait_queue_head_t shutdown_wq;
unsigned int nr_ring_pages; struct xen_blkif *blkif;
};
struct xen_blkif {
/* Unique identifier for this interface. */
domid_t domid;
unsigned int handle;
/* Comms information. */
enum blkif_protocol blk_protocol;
/* The VBD attached to this interface. */
struct xen_vbd vbd;
/* Back pointer to the backend_info. */
struct backend_info *be;
atomic_t refcnt;
/* for barrier (drain) requests */
struct completion drain_complete;
atomic_t drain;
struct work_struct free_work;
unsigned int nr_ring_pages;
/* All rings for this device. */
struct xen_blkif_ring *rings;
unsigned int nr_rings;
}; };
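After the split, state that exists once per device (domid, handle, protocol, the vbd, the backend pointer, refcount and drain bookkeeping) stays in struct xen_blkif, while everything the I/O fast path touches lives in struct xen_blkif_ring, and the device simply owns an array of rings. A simplified sketch of that ownership and of the walk-all-rings pattern used elsewhere in this series (field names shortened, not the real structures):

struct dev_state;

struct ring_state {			/* per-ring, fast-path state */
	unsigned long long st_rd_req;
	unsigned long long st_wr_req;
	struct dev_state *blkif;	/* back pointer, as in the driver */
};

struct dev_state {			/* per-device, slow-path state */
	unsigned int nr_rings;
	struct ring_state *rings;	/* allocated once the ring count is known */
};

/* device-wide views (sysfs statistics, teardown) aggregate over the rings */
static unsigned long long total_reads(const struct dev_state *d)
{
	unsigned long long sum = 0;
	unsigned int i;

	for (i = 0; i < d->nr_rings; i++)
		sum += d->rings[i].st_rd_req;
	return sum;
}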
struct seg_buf { struct seg_buf {
...@@ -352,7 +364,7 @@ struct grant_page { ...@@ -352,7 +364,7 @@ struct grant_page {
* response queued for it, with the saved 'id' passed back. * response queued for it, with the saved 'id' passed back.
*/ */
struct pending_req { struct pending_req {
struct xen_blkif *blkif; struct xen_blkif_ring *ring;
u64 id; u64 id;
int nr_segs; int nr_segs;
atomic_t pendcnt; atomic_t pendcnt;
...@@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void); ...@@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void);
irqreturn_t xen_blkif_be_int(int irq, void *dev_id); irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
int xen_blkif_schedule(void *arg); int xen_blkif_schedule(void *arg);
int xen_blkif_purge_persistent(void *arg); int xen_blkif_purge_persistent(void *arg);
void xen_blkbk_free_caches(struct xen_blkif *blkif); void xen_blkbk_free_caches(struct xen_blkif_ring *ring);
int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
struct backend_info *be, int state); struct backend_info *be, int state);
......
...@@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) ...@@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
{ {
int err; int err;
char name[BLKBACK_NAME_LEN]; char name[BLKBACK_NAME_LEN];
struct xen_blkif_ring *ring;
int i;
/* Not ready to connect? */ /* Not ready to connect? */
if (!blkif->irq || !blkif->vbd.bdev) if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
return; return;
/* Already connected? */ /* Already connected? */
...@@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) ...@@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
} }
invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); for (i = 0; i < blkif->nr_rings; i++) {
if (IS_ERR(blkif->xenblkd)) { ring = &blkif->rings[i];
err = PTR_ERR(blkif->xenblkd); ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
blkif->xenblkd = NULL; if (IS_ERR(ring->xenblkd)) {
xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); err = PTR_ERR(ring->xenblkd);
return; ring->xenblkd = NULL;
xenbus_dev_fatal(blkif->be->dev, err,
"start %s-%d xenblkd", name, i);
goto out;
}
}
return;
out:
while (--i >= 0) {
ring = &blkif->rings[i];
kthread_stop(ring->xenblkd);
}
return;
}
static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
{
unsigned int r;
blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL);
if (!blkif->rings)
return -ENOMEM;
for (r = 0; r < blkif->nr_rings; r++) {
struct xen_blkif_ring *ring = &blkif->rings[r];
spin_lock_init(&ring->blk_ring_lock);
init_waitqueue_head(&ring->wq);
INIT_LIST_HEAD(&ring->pending_free);
INIT_LIST_HEAD(&ring->persistent_purge_list);
INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants);
spin_lock_init(&ring->free_pages_lock);
INIT_LIST_HEAD(&ring->free_pages);
spin_lock_init(&ring->pending_free_lock);
init_waitqueue_head(&ring->pending_free_wq);
init_waitqueue_head(&ring->shutdown_wq);
ring->blkif = blkif;
ring->st_print = jiffies;
xen_blkif_get(blkif);
} }
return 0;
} }
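xen_blkif_alloc_rings() sizes the array from the negotiated nr_rings, initialises each ring's locks, lists and work items, and takes one blkif reference per ring; the matching xen_blkif_put() in xen_blkif_disconnect() drops those references again as the rings are torn down. A small sketch of that allocate-and-reference-per-ring pattern, with the refcount reduced to a plain counter:

#include <stdlib.h>

struct backend;

struct backend_ring {
	struct backend *blkif;		/* back pointer set at allocation time */
	/* locks, pending lists and work items would be initialised here */
};

struct backend {
	unsigned int refcnt;		/* stands in for blkif->refcnt */
	unsigned int nr_rings;		/* negotiated in connect_ring() */
	struct backend_ring *rings;
};

static int alloc_rings(struct backend *be)
{
	unsigned int r;

	/* calloc() zeroes the array, like the kzalloc() in the driver */
	be->rings = calloc(be->nr_rings, sizeof(*be->rings));
	if (!be->rings)
		return -1;

	for (r = 0; r < be->nr_rings; r++) {
		be->rings[r].blkif = be;
		be->refcnt++;	/* one reference per ring, dropped at disconnect */
	}
	return 0;
}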
static struct xen_blkif *xen_blkif_alloc(domid_t domid) static struct xen_blkif *xen_blkif_alloc(domid_t domid)
...@@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) ...@@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
blkif->domid = domid; blkif->domid = domid;
spin_lock_init(&blkif->blk_ring_lock);
atomic_set(&blkif->refcnt, 1); atomic_set(&blkif->refcnt, 1);
init_waitqueue_head(&blkif->wq);
init_completion(&blkif->drain_complete); init_completion(&blkif->drain_complete);
atomic_set(&blkif->drain, 0);
blkif->st_print = jiffies;
blkif->persistent_gnts.rb_node = NULL;
spin_lock_init(&blkif->free_pages_lock);
INIT_LIST_HEAD(&blkif->free_pages);
INIT_LIST_HEAD(&blkif->persistent_purge_list);
blkif->free_pages_num = 0;
atomic_set(&blkif->persistent_gnt_in_use, 0);
atomic_set(&blkif->inflight, 0);
INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
INIT_LIST_HEAD(&blkif->pending_free);
INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
spin_lock_init(&blkif->pending_free_lock);
init_waitqueue_head(&blkif->pending_free_wq);
init_waitqueue_head(&blkif->shutdown_wq);
return blkif; return blkif;
} }
static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
unsigned int nr_grefs, unsigned int evtchn) unsigned int nr_grefs, unsigned int evtchn)
{ {
int err; int err;
struct xen_blkif *blkif = ring->blkif;
/* Already connected through? */ /* Already connected through? */
if (blkif->irq) if (ring->irq)
return 0; return 0;
err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
&blkif->blk_ring); &ring->blk_ring);
if (err < 0) if (err < 0)
return err; return err;
...@@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, ...@@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
case BLKIF_PROTOCOL_NATIVE: case BLKIF_PROTOCOL_NATIVE:
{ {
struct blkif_sring *sring; struct blkif_sring *sring;
sring = (struct blkif_sring *)blkif->blk_ring; sring = (struct blkif_sring *)ring->blk_ring;
BACK_RING_INIT(&blkif->blk_rings.native, sring, BACK_RING_INIT(&ring->blk_rings.native, sring,
XEN_PAGE_SIZE * nr_grefs); XEN_PAGE_SIZE * nr_grefs);
break; break;
} }
case BLKIF_PROTOCOL_X86_32: case BLKIF_PROTOCOL_X86_32:
{ {
struct blkif_x86_32_sring *sring_x86_32; struct blkif_x86_32_sring *sring_x86_32;
sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring;
BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32,
XEN_PAGE_SIZE * nr_grefs); XEN_PAGE_SIZE * nr_grefs);
break; break;
} }
case BLKIF_PROTOCOL_X86_64: case BLKIF_PROTOCOL_X86_64:
{ {
struct blkif_x86_64_sring *sring_x86_64; struct blkif_x86_64_sring *sring_x86_64;
sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring;
BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64,
XEN_PAGE_SIZE * nr_grefs); XEN_PAGE_SIZE * nr_grefs);
break; break;
} }
...@@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, ...@@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
xen_blkif_be_int, 0, xen_blkif_be_int, 0,
"blkif-backend", blkif); "blkif-backend", ring);
if (err < 0) { if (err < 0) {
xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
blkif->blk_rings.common.sring = NULL; ring->blk_rings.common.sring = NULL;
return err; return err;
} }
blkif->irq = err; ring->irq = err;
return 0; return 0;
} }
...@@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, ...@@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
static int xen_blkif_disconnect(struct xen_blkif *blkif) static int xen_blkif_disconnect(struct xen_blkif *blkif)
{ {
struct pending_req *req, *n; struct pending_req *req, *n;
int i = 0, j; unsigned int j, r;
if (blkif->xenblkd) { for (r = 0; r < blkif->nr_rings; r++) {
kthread_stop(blkif->xenblkd); struct xen_blkif_ring *ring = &blkif->rings[r];
wake_up(&blkif->shutdown_wq); unsigned int i = 0;
blkif->xenblkd = NULL;
}
/* The above kthread_stop() guarantees that at this point we if (ring->xenblkd) {
* don't have any discard_io or other_io requests. So, checking kthread_stop(ring->xenblkd);
* for inflight IO is enough. wake_up(&ring->shutdown_wq);
*/ ring->xenblkd = NULL;
if (atomic_read(&blkif->inflight) > 0) }
return -EBUSY;
if (blkif->irq) { /* The above kthread_stop() guarantees that at this point we
unbind_from_irqhandler(blkif->irq, blkif); * don't have any discard_io or other_io requests. So, checking
blkif->irq = 0; * for inflight IO is enough.
} */
if (atomic_read(&ring->inflight) > 0)
return -EBUSY;
if (blkif->blk_rings.common.sring) { if (ring->irq) {
xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); unbind_from_irqhandler(ring->irq, ring);
blkif->blk_rings.common.sring = NULL; ring->irq = 0;
} }
/* Remove all persistent grants and the cache of ballooned pages. */ if (ring->blk_rings.common.sring) {
xen_blkbk_free_caches(blkif); xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
ring->blk_rings.common.sring = NULL;
}
/* Check that there is no request in use */ /* Remove all persistent grants and the cache of ballooned pages. */
list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { xen_blkbk_free_caches(ring);
list_del(&req->free_list);
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) /* Check that there is no request in use */
kfree(req->segments[j]); list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
list_del(&req->free_list);
for (j = 0; j < MAX_INDIRECT_PAGES; j++) for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
kfree(req->indirect_pages[j]); kfree(req->segments[j]);
kfree(req); for (j = 0; j < MAX_INDIRECT_PAGES; j++)
i++; kfree(req->indirect_pages[j]);
}
kfree(req);
i++;
}
WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0);
BUG_ON(!list_empty(&ring->persistent_purge_list));
BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
BUG_ON(!list_empty(&ring->free_pages));
BUG_ON(ring->free_pages_num != 0);
BUG_ON(ring->persistent_gnt_c != 0);
WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
xen_blkif_put(blkif);
}
blkif->nr_ring_pages = 0; blkif->nr_ring_pages = 0;
/*
* blkif->rings was allocated in connect_ring, so we should free it in
* here.
*/
kfree(blkif->rings);
blkif->rings = NULL;
blkif->nr_rings = 0;
return 0; return 0;
} }
...@@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif) ...@@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
xen_vbd_free(&blkif->vbd); xen_vbd_free(&blkif->vbd);
/* Make sure everything is drained before shutting down */ /* Make sure everything is drained before shutting down */
BUG_ON(blkif->persistent_gnt_c != 0);
BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
BUG_ON(blkif->free_pages_num != 0);
BUG_ON(!list_empty(&blkif->persistent_purge_list));
BUG_ON(!list_empty(&blkif->free_pages));
BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
kmem_cache_free(xen_blkif_cachep, blkif); kmem_cache_free(xen_blkif_cachep, blkif);
} }
...@@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void) ...@@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void)
* sysfs interface for VBD I/O requests * sysfs interface for VBD I/O requests
*/ */
#define VBD_SHOW(name, format, args...) \ #define VBD_SHOW_ALLRING(name, format) \
static ssize_t show_##name(struct device *_dev, \ static ssize_t show_##name(struct device *_dev, \
struct device_attribute *attr, \ struct device_attribute *attr, \
char *buf) \ char *buf) \
{ \ { \
struct xenbus_device *dev = to_xenbus_device(_dev); \ struct xenbus_device *dev = to_xenbus_device(_dev); \
struct backend_info *be = dev_get_drvdata(&dev->dev); \ struct backend_info *be = dev_get_drvdata(&dev->dev); \
struct xen_blkif *blkif = be->blkif; \
unsigned int i; \
unsigned long long result = 0; \
\ \
return sprintf(buf, format, ##args); \ if (!blkif->rings) \
goto out; \
\
for (i = 0; i < blkif->nr_rings; i++) { \
struct xen_blkif_ring *ring = &blkif->rings[i]; \
\
result += ring->st_##name; \
} \
\
out: \
return sprintf(buf, format, result); \
} \ } \
static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); VBD_SHOW_ALLRING(oo_req, "%llu\n");
VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); VBD_SHOW_ALLRING(rd_req, "%llu\n");
VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); VBD_SHOW_ALLRING(wr_req, "%llu\n");
VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); VBD_SHOW_ALLRING(f_req, "%llu\n");
VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); VBD_SHOW_ALLRING(ds_req, "%llu\n");
VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); VBD_SHOW_ALLRING(rd_sect, "%llu\n");
VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); VBD_SHOW_ALLRING(wr_sect, "%llu\n");
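Because the counters are now kept per ring, each sysfs file has to sum them at read time; VBD_SHOW_ALLRING generates one show function per counter that loops over blkif->rings, with an early-out when no rings are attached yet. Roughly, VBD_SHOW_ALLRING(oo_req, "%llu\n") boils down to something like this userspace sketch (attribute plumbing and the missing-rings guard elided):

#include <stdio.h>

struct stat_ring {
	unsigned long long st_oo_req;	/* one counter per ring */
};

/* what show_oo_req() amounts to: sum the counter over all rings and
 * print the total with the format given to VBD_SHOW_ALLRING */
static int show_oo_req(const struct stat_ring *rings, unsigned int nr_rings,
		       char *buf)
{
	unsigned long long result = 0;
	unsigned int i;

	for (i = 0; i < nr_rings; i++)
		result += rings[i].st_oo_req;
	return sprintf(buf, "%llu\n", result);
}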
static struct attribute *xen_vbdstat_attrs[] = { static struct attribute *xen_vbdstat_attrs[] = {
&dev_attr_oo_req.attr, &dev_attr_oo_req.attr,
...@@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = { ...@@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = {
.attrs = xen_vbdstat_attrs, .attrs = xen_vbdstat_attrs,
}; };
#define VBD_SHOW(name, format, args...) \
static ssize_t show_##name(struct device *_dev, \
struct device_attribute *attr, \
char *buf) \
{ \
struct xenbus_device *dev = to_xenbus_device(_dev); \
struct backend_info *be = dev_get_drvdata(&dev->dev); \
\
return sprintf(buf, format, ##args); \
} \
static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
VBD_SHOW(mode, "%s\n", be->mode); VBD_SHOW(mode, "%s\n", be->mode);
...@@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev) ...@@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
dev_set_drvdata(&dev->dev, NULL); dev_set_drvdata(&dev->dev, NULL);
if (be->blkif) { if (be->blkif)
xen_blkif_disconnect(be->blkif); xen_blkif_disconnect(be->blkif);
xen_blkif_put(be->blkif);
}
/* Put the reference we set in xen_blkif_alloc(). */
xen_blkif_put(be->blkif);
kfree(be->mode); kfree(be->mode);
kfree(be); kfree(be);
return 0; return 0;
...@@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev, ...@@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
goto fail; goto fail;
} }
/* Multi-queue: advertise how many queues are supported by us.*/
err = xenbus_printf(XBT_NIL, dev->nodename,
"multi-queue-max-queues", "%u", xenblk_max_queues);
if (err)
pr_warn("Error writing multi-queue-max-queues\n");
/* setup back pointer */ /* setup back pointer */
be->blkif->be = be; be->blkif->be = be;
...@@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev, ...@@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev,
} }
err = connect_ring(be); err = connect_ring(be);
if (err) if (err) {
/*
* Clean up so that memory resources can be used by
* other devices. connect_ring has already reported an error.
*/
xen_blkif_disconnect(be->blkif);
break; break;
}
xen_update_blkif_status(be->blkif); xen_update_blkif_status(be->blkif);
break; break;
...@@ -825,50 +902,43 @@ static void connect(struct backend_info *be) ...@@ -825,50 +902,43 @@ static void connect(struct backend_info *be)
xenbus_transaction_end(xbt, 1); xenbus_transaction_end(xbt, 1);
} }
/*
static int connect_ring(struct backend_info *be) * Each ring may have multi pages, depends on "ring-page-order".
*/
static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
{ {
struct xenbus_device *dev = be->dev;
unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
unsigned int evtchn, nr_grefs, ring_page_order;
unsigned int pers_grants;
char protocol[64] = "";
struct pending_req *req, *n; struct pending_req *req, *n;
int err, i, j; int err, i, j;
struct xen_blkif *blkif = ring->blkif;
struct xenbus_device *dev = blkif->be->dev;
unsigned int ring_page_order, nr_grefs, evtchn;
pr_debug("%s %s\n", __func__, dev->otherend); err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
&evtchn); &evtchn);
if (err != 1) { if (err != 1) {
err = -EINVAL; err = -EINVAL;
xenbus_dev_fatal(dev, err, "reading %s/event-channel", xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir);
dev->otherend);
return err; return err;
} }
pr_info("event-channel %u\n", evtchn);
err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
&ring_page_order); &ring_page_order);
if (err != 1) { if (err != 1) {
err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]);
"%u", &ring_ref[0]);
if (err != 1) { if (err != 1) {
err = -EINVAL; err = -EINVAL;
xenbus_dev_fatal(dev, err, "reading %s/ring-ref", xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
dev->otherend);
return err; return err;
} }
nr_grefs = 1; nr_grefs = 1;
pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
ring_ref[0]);
} else { } else {
unsigned int i; unsigned int i;
if (ring_page_order > xen_blkif_max_ring_order) { if (ring_page_order > xen_blkif_max_ring_order) {
err = -EINVAL; err = -EINVAL;
xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
dev->otherend, ring_page_order, dir, ring_page_order,
xen_blkif_max_ring_order); xen_blkif_max_ring_order);
return err; return err;
} }
...@@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be) ...@@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be)
char ring_ref_name[RINGREF_NAME_LEN]; char ring_ref_name[RINGREF_NAME_LEN];
snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
"%u", &ring_ref[i]); "%u", &ring_ref[i]);
if (err != 1) { if (err != 1) {
err = -EINVAL; err = -EINVAL;
xenbus_dev_fatal(dev, err, "reading %s/%s", xenbus_dev_fatal(dev, err, "reading %s/%s",
dev->otherend, ring_ref_name); dir, ring_ref_name);
return err; return err;
} }
pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
} }
} }
blkif->nr_ring_pages = nr_grefs;
be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
"%63s", protocol, NULL);
if (err)
strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
else {
xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
return -1;
}
err = xenbus_gather(XBT_NIL, dev->otherend,
"feature-persistent", "%u",
&pers_grants, NULL);
if (err)
pers_grants = 0;
be->blkif->vbd.feature_gnt_persistent = pers_grants;
be->blkif->vbd.overflow_max_grants = 0;
be->blkif->nr_ring_pages = nr_grefs;
pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
pers_grants ? "persistent grants" : "");
for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
req = kzalloc(sizeof(*req), GFP_KERNEL); req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req) if (!req)
goto fail; goto fail;
list_add_tail(&req->free_list, &be->blkif->pending_free); list_add_tail(&req->free_list, &ring->pending_free);
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
if (!req->segments[j]) if (!req->segments[j])
...@@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be) ...@@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be)
} }
/* Map the shared frame, irq etc. */ /* Map the shared frame, irq etc. */
err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
if (err) { if (err) {
xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
return err; return err;
...@@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be) ...@@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be)
return 0; return 0;
fail: fail:
list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
list_del(&req->free_list); list_del(&req->free_list);
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
if (!req->segments[j]) if (!req->segments[j])
...@@ -962,6 +1003,93 @@ static int connect_ring(struct backend_info *be) ...@@ -962,6 +1003,93 @@ static int connect_ring(struct backend_info *be)
kfree(req); kfree(req);
} }
return -ENOMEM; return -ENOMEM;
}
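read_per_ring_refs() pulls one ring's worth of keys out of the directory it is given: "event-channel", plus either a single "ring-ref" or, when the frontend wrote "ring-page-order" (still read from the device node itself), presumably 1 << order entries named "ring-ref0", "ring-ref1" and so on, as in the existing multi-page ring support (the count computation sits in a part of the hunk not shown here). An illustrative sketch of which keys one ring contributes, with the xenstore reads themselves stubbed out:

#include <stdio.h>

/* Illustrative only: print the xenstore keys one ring contributes and how
 * many ring-ref%u entries a given ring-page-order would imply. */
static void list_ring_keys(const char *dir, int has_page_order,
			   unsigned int ring_page_order)
{
	unsigned int i;

	printf("%s/event-channel\n", dir);
	if (!has_page_order) {
		printf("%s/ring-ref\n", dir);	/* single-page ring layout */
		return;
	}
	for (i = 0; i < (1u << ring_page_order); i++)
		printf("%s/ring-ref%u\n", dir, i);	/* multi-page ring */
}

For a multi-queue frontend, dir is the per-queue subdirectory that connect_ring() builds below ("<otherend>/queue-%u"); with a single queue it is the frontend's device node itself.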
static int connect_ring(struct backend_info *be)
{
struct xenbus_device *dev = be->dev;
unsigned int pers_grants;
char protocol[64] = "";
int err, i;
char *xspath;
size_t xspathsize;
const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
unsigned int requested_num_queues = 0;
pr_debug("%s %s\n", __func__, dev->otherend);
be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
"%63s", protocol, NULL);
if (err)
strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
else {
xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
return -ENOSYS;
}
err = xenbus_gather(XBT_NIL, dev->otherend,
"feature-persistent", "%u",
&pers_grants, NULL);
if (err)
pers_grants = 0;
be->blkif->vbd.feature_gnt_persistent = pers_grants;
be->blkif->vbd.overflow_max_grants = 0;
/*
* Read the number of hardware queues from frontend.
*/
err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues",
"%u", &requested_num_queues);
if (err < 0) {
requested_num_queues = 1;
} else {
if (requested_num_queues > xenblk_max_queues
|| requested_num_queues == 0) {
/* Buggy or malicious guest. */
xenbus_dev_fatal(dev, err,
"guest requested %u queues, exceeding the maximum of %u.",
requested_num_queues, xenblk_max_queues);
return -ENOSYS;
}
}
be->blkif->nr_rings = requested_num_queues;
if (xen_blkif_alloc_rings(be->blkif))
return -ENOMEM;
pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
pers_grants ? "persistent grants" : "");
if (be->blkif->nr_rings == 1)
return read_per_ring_refs(&be->blkif->rings[0], dev->otherend);
else {
xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
xspath = kmalloc(xspathsize, GFP_KERNEL);
if (!xspath) {
xenbus_dev_fatal(dev, -ENOMEM, "reading ring references");
return -ENOMEM;
}
for (i = 0; i < be->blkif->nr_rings; i++) {
memset(xspath, 0, xspathsize);
snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
err = read_per_ring_refs(&be->blkif->rings[i], xspath);
if (err) {
kfree(xspath);
return err;
}
}
kfree(xspath);
}
return 0;
} }
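connect_ring() ties the negotiation together: xen_blkbk_probe() advertised "multi-queue-max-queues", the frontend answers with "multi-queue-num-queues", the answer is validated against xenblk_max_queues, and with more than one queue the per-ring keys are read from "<otherend>/queue-%u" rather than the device node. A compact sketch of just the validation and path construction, with xenstore access replaced by plain parameters and illustrative helper names:

#include <stdio.h>

/* Validation as in connect_ring(): 0 or anything above the backend's
 * advertised limit is rejected as a buggy or malicious frontend. */
static int check_num_queues(unsigned int requested, unsigned int max_queues)
{
	return requested >= 1 && requested <= max_queues;
}

/* Per-ring xenstore directory: the device node itself for one queue,
 * "<otherend>/queue-%u" per ring otherwise. */
static int build_queue_dir(char *buf, size_t size, const char *otherend,
			   unsigned int nr_queues, unsigned int queue)
{
	if (nr_queues == 1)
		return snprintf(buf, size, "%s", otherend);
	return snprintf(buf, size, "%s/queue-%u", otherend, queue);
}

In the driver the buffer is sized as strlen(dev->otherend) plus xenstore_path_ext_size, i.e. 11 extra bytes: "/queue-" (7 characters), up to three digits and the terminating NUL.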
static const struct xenbus_device_id xen_blkbk_ids[] = { static const struct xenbus_device_id xen_blkbk_ids[] = {
......
...@@ -60,6 +60,20 @@ ...@@ -60,6 +60,20 @@
#include <asm/xen/hypervisor.h> #include <asm/xen/hypervisor.h>
/*
* The minimal size of segment supported by the block framework is PAGE_SIZE.
* When Linux is using a different page size than Xen, it may not be possible
* to put all the data in a single segment.
* This can happen when the backend doesn't support indirect descriptors and
* therefore the maximum amount of data that a request can carry is
* BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
*
* Note that we only support one extra request. So the Linux page size
* should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
* 88KB.
*/
#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
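For reference, the figures in the comment follow from the protocol constants: BLKIF_MAX_SEGMENTS_PER_REQUEST is 11 and XEN_PAGE_SIZE is 4 KiB, so one request carries at most 11 * 4 KiB = 44 KiB, and with at most one extra request that doubles to 88 KiB. HAS_EXTRA_REQ is therefore only true when a Linux page spans more than 11 Xen pages, for example with 64 KiB pages on arm64, where XEN_PFN_PER_PAGE is 16.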
enum blkif_state { enum blkif_state {
BLKIF_STATE_DISCONNECTED, BLKIF_STATE_DISCONNECTED,
BLKIF_STATE_CONNECTED, BLKIF_STATE_CONNECTED,
...@@ -72,6 +86,13 @@ struct grant { ...@@ -72,6 +86,13 @@ struct grant {
struct list_head node; struct list_head node;
}; };
enum blk_req_status {
REQ_WAITING,
REQ_DONE,
REQ_ERROR,
REQ_EOPNOTSUPP,
};
struct blk_shadow { struct blk_shadow {
struct blkif_request req; struct blkif_request req;
struct request *request; struct request *request;
...@@ -79,6 +100,14 @@ struct blk_shadow { ...@@ -79,6 +100,14 @@ struct blk_shadow {
struct grant **indirect_grants; struct grant **indirect_grants;
struct scatterlist *sg; struct scatterlist *sg;
unsigned int num_sg; unsigned int num_sg;
enum blk_req_status status;
#define NO_ASSOCIATED_ID ~0UL
/*
* Id of the sibling if we ever need 2 requests when handling a
* block I/O request
*/
unsigned long associated_id;
}; };
struct split_bio { struct split_bio {
...@@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32; ...@@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32;
module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
static unsigned int xen_blkif_max_queues = 4;
module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
/* /*
* Maximum order of pages to be used for the shared ring between front and * Maximum order of pages to be used for the shared ring between front and
* backend, 4KB page granularity is used. * backend, 4KB page granularity is used.
...@@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the ...@@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
/* /*
* ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
* characters are enough. Define to 20 to keep consist with backend. * characters are enough. Define to 20 to keep consistent with backend.
*/ */
#define RINGREF_NAME_LEN (20) #define RINGREF_NAME_LEN (20)
/*
* queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
*/
#define QUEUE_NAME_LEN (17)
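One way to read the two size comments: "ring-ref" is 8 characters, a 32-bit %u prints at most 10 digits, and one byte is needed for the terminating NUL, so 8 + 10 + 1 = 19, padded to 20 to match the backend; likewise "queue-" is 6 characters, giving 6 + 10 + 1 = 17 = QUEUE_NAME_LEN.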
/*
* Per-ring info.
* Every blkfront device can associate with one or more blkfront_ring_info,
* depending on how many hardware queues/rings to be used.
*/
struct blkfront_ring_info {
/* Lock to protect data in every ring buffer. */
spinlock_t ring_lock;
struct blkif_front_ring ring;
unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
unsigned int evtchn, irq;
struct work_struct work;
struct gnttab_free_callback callback;
struct blk_shadow shadow[BLK_MAX_RING_SIZE];
struct list_head indirect_pages;
struct list_head grants;
unsigned int persistent_gnts_c;
unsigned long shadow_free;
struct blkfront_info *dev_info;
};
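struct blkfront_ring_info now owns everything that used to sit directly in blkfront_info for the single ring: the front ring and its grant references, event channel and irq, shadow table, grant and indirect-page lists, plus its own ring_lock in place of the device-wide io_lock removed below. blkfront_info keeps only device-wide state and an array of these per-ring structures. A simplified sketch of the resulting ownership (names shortened, not the real structs):

struct front_dev;

struct front_ring_state {		/* one per hardware queue */
	unsigned int evtchn, irq;
	unsigned int ring_refs[16];	/* grant refs for this ring's pages */
	struct front_dev *dev_info;	/* back pointer, like rinfo->dev_info */
};

struct front_dev {			/* one per virtual disk */
	unsigned int nr_rings;
	struct front_ring_state *rinfo;	/* array of nr_rings entries */
};

/* fast-path code is handed a ring and reaches the device through it */
static struct front_dev *dev_of(struct front_ring_state *r)
{
	return r->dev_info;
}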
/* /*
* We have one of these per vbd, whether ide, scsi or 'other'. They * We have one of these per vbd, whether ide, scsi or 'other'. They
...@@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the ...@@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
*/ */
struct blkfront_info struct blkfront_info
{ {
spinlock_t io_lock;
struct mutex mutex; struct mutex mutex;
struct xenbus_device *xbdev; struct xenbus_device *xbdev;
struct gendisk *gd; struct gendisk *gd;
int vdevice; int vdevice;
blkif_vdev_t handle; blkif_vdev_t handle;
enum blkif_state connected; enum blkif_state connected;
int ring_ref[XENBUS_MAX_RING_GRANTS]; /* Number of pages per ring buffer. */
unsigned int nr_ring_pages; unsigned int nr_ring_pages;
struct blkif_front_ring ring;
unsigned int evtchn, irq;
struct request_queue *rq; struct request_queue *rq;
struct work_struct work;
struct gnttab_free_callback callback;
struct blk_shadow shadow[BLK_MAX_RING_SIZE];
struct list_head grants;
struct list_head indirect_pages;
unsigned int persistent_gnts_c;
unsigned long shadow_free;
unsigned int feature_flush; unsigned int feature_flush;
unsigned int feature_discard:1; unsigned int feature_discard:1;
unsigned int feature_secdiscard:1; unsigned int feature_secdiscard:1;
...@@ -155,6 +203,8 @@ struct blkfront_info ...@@ -155,6 +203,8 @@ struct blkfront_info
unsigned int max_indirect_segments; unsigned int max_indirect_segments;
int is_ready; int is_ready;
struct blk_mq_tag_set tag_set; struct blk_mq_tag_set tag_set;
struct blkfront_ring_info *rinfo;
unsigned int nr_rings;
}; };
static unsigned int nr_minors; static unsigned int nr_minors;
...@@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock); ...@@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock);
#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) #define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG)
static int blkfront_setup_indirect(struct blkfront_info *info); static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
static int blkfront_gather_backend_features(struct blkfront_info *info); static void blkfront_gather_backend_features(struct blkfront_info *info);
static int get_id_from_freelist(struct blkfront_info *info) static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
{ {
unsigned long free = info->shadow_free; unsigned long free = rinfo->shadow_free;
BUG_ON(free >= BLK_RING_SIZE(info));
info->shadow_free = info->shadow[free].req.u.rw.id; BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
return free; return free;
} }
static int add_id_to_freelist(struct blkfront_info *info, static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
unsigned long id) unsigned long id)
{ {
if (info->shadow[id].req.u.rw.id != id) if (rinfo->shadow[id].req.u.rw.id != id)
return -EINVAL; return -EINVAL;
if (info->shadow[id].request == NULL) if (rinfo->shadow[id].request == NULL)
return -EINVAL; return -EINVAL;
info->shadow[id].req.u.rw.id = info->shadow_free; rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free;
info->shadow[id].request = NULL; rinfo->shadow[id].request = NULL;
info->shadow_free = id; rinfo->shadow_free = id;
return 0; return 0;
} }
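get_id_from_freelist() and add_id_to_freelist() keep the free shadow slots in a singly linked list threaded through the otherwise idle req.u.rw.id field: shadow_free holds the index of the first free slot, and each free slot's id field holds the index of the next. A standalone sketch of that embedded free list:

#include <assert.h>

#define RING_SIZE 32

/* Each free slot's "id" field stores the index of the next free slot,
 * so no separate list node is needed: the same trick as blk_shadow. */
static unsigned long next_free_id[RING_SIZE];
static unsigned long shadow_free;	/* index of the first free slot */

static void freelist_init(void)
{
	unsigned long i;

	for (i = 0; i < RING_SIZE; i++)
		next_free_id[i] = i + 1;	/* chain the slots together */
	shadow_free = 0;
}

static unsigned long get_id(void)
{
	unsigned long free = shadow_free;

	assert(free < RING_SIZE);	/* list exhausted: mirrors the BUG_ON */
	shadow_free = next_free_id[free];	/* pop */
	next_free_id[free] = 0x0fffffee;	/* debug poison, as above */
	return free;
}

static void put_id(unsigned long id)
{
	next_free_id[id] = shadow_free;		/* push */
	shadow_free = id;
}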
static int fill_grant_buffer(struct blkfront_info *info, int num) static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
{ {
struct blkfront_info *info = rinfo->dev_info;
struct page *granted_page; struct page *granted_page;
struct grant *gnt_list_entry, *n; struct grant *gnt_list_entry, *n;
int i = 0; int i = 0;
while(i < num) { while (i < num) {
gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
if (!gnt_list_entry) if (!gnt_list_entry)
goto out_of_memory; goto out_of_memory;
...@@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) ...@@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
} }
gnt_list_entry->gref = GRANT_INVALID_REF; gnt_list_entry->gref = GRANT_INVALID_REF;
list_add(&gnt_list_entry->node, &info->grants); list_add(&gnt_list_entry->node, &rinfo->grants);
i++; i++;
} }
...@@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) ...@@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
out_of_memory: out_of_memory:
list_for_each_entry_safe(gnt_list_entry, n, list_for_each_entry_safe(gnt_list_entry, n,
&info->grants, node) { &rinfo->grants, node) {
list_del(&gnt_list_entry->node); list_del(&gnt_list_entry->node);
if (info->feature_persistent) if (info->feature_persistent)
__free_page(gnt_list_entry->page); __free_page(gnt_list_entry->page);
...@@ -263,17 +315,17 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) ...@@ -263,17 +315,17 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
return -ENOMEM; return -ENOMEM;
} }
static struct grant *get_free_grant(struct blkfront_info *info) static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
{ {
struct grant *gnt_list_entry; struct grant *gnt_list_entry;
BUG_ON(list_empty(&info->grants)); BUG_ON(list_empty(&rinfo->grants));
gnt_list_entry = list_first_entry(&info->grants, struct grant, gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
node); node);
list_del(&gnt_list_entry->node); list_del(&gnt_list_entry->node);
if (gnt_list_entry->gref != GRANT_INVALID_REF) if (gnt_list_entry->gref != GRANT_INVALID_REF)
info->persistent_gnts_c--; rinfo->persistent_gnts_c--;
return gnt_list_entry; return gnt_list_entry;
} }
...@@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry, ...@@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry,
static struct grant *get_grant(grant_ref_t *gref_head, static struct grant *get_grant(grant_ref_t *gref_head,
unsigned long gfn, unsigned long gfn,
struct blkfront_info *info) struct blkfront_ring_info *rinfo)
{ {
struct grant *gnt_list_entry = get_free_grant(info); struct grant *gnt_list_entry = get_free_grant(rinfo);
struct blkfront_info *info = rinfo->dev_info;
if (gnt_list_entry->gref != GRANT_INVALID_REF) if (gnt_list_entry->gref != GRANT_INVALID_REF)
return gnt_list_entry; return gnt_list_entry;
...@@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, ...@@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
} }
static struct grant *get_indirect_grant(grant_ref_t *gref_head, static struct grant *get_indirect_grant(grant_ref_t *gref_head,
struct blkfront_info *info) struct blkfront_ring_info *rinfo)
{ {
struct grant *gnt_list_entry = get_free_grant(info); struct grant *gnt_list_entry = get_free_grant(rinfo);
struct blkfront_info *info = rinfo->dev_info;
if (gnt_list_entry->gref != GRANT_INVALID_REF) if (gnt_list_entry->gref != GRANT_INVALID_REF)
return gnt_list_entry; return gnt_list_entry;
...@@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, ...@@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
struct page *indirect_page; struct page *indirect_page;
/* Fetch a pre-allocated page to use for indirect grefs */ /* Fetch a pre-allocated page to use for indirect grefs */
BUG_ON(list_empty(&info->indirect_pages)); BUG_ON(list_empty(&rinfo->indirect_pages));
indirect_page = list_first_entry(&info->indirect_pages, indirect_page = list_first_entry(&rinfo->indirect_pages,
struct page, lru); struct page, lru);
list_del(&indirect_page->lru); list_del(&indirect_page->lru);
gnt_list_entry->page = indirect_page; gnt_list_entry->page = indirect_page;
...@@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr) ...@@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
static void blkif_restart_queue_callback(void *arg) static void blkif_restart_queue_callback(void *arg)
{ {
struct blkfront_info *info = (struct blkfront_info *)arg; struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
schedule_work(&info->work); schedule_work(&rinfo->work);
} }
static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
...@@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, ...@@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
return 0; return 0;
} }
static int blkif_queue_discard_req(struct request *req) static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
struct request *req,
struct blkif_request **ring_req)
{ {
struct blkfront_info *info = req->rq_disk->private_data; unsigned long id;
*ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
rinfo->ring.req_prod_pvt++;
id = get_id_from_freelist(rinfo);
rinfo->shadow[id].request = req;
rinfo->shadow[id].status = REQ_WAITING;
rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
(*ring_req)->u.rw.id = id;
return id;
}
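/*
* A minimal usage sketch of blkif_ring_get_request(): a caller reserves a
* ring slot, fills in the operation, and keeps a private copy in the shadow
* so the request can be reissued on recovery.
*
*	struct blkif_request *ring_req;
*	unsigned long id = blkif_ring_get_request(rinfo, req, &ring_req);
*
*	ring_req->operation = BLKIF_OP_READ;
*	rinfo->shadow[id].req = *ring_req;
*
* blkif_queue_discard_req() and blkif_queue_rw_req() below follow this
* pattern.
*/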
static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
{
struct blkfront_info *info = rinfo->dev_info;
struct blkif_request *ring_req; struct blkif_request *ring_req;
unsigned long id; unsigned long id;
/* Fill out a communications ring structure. */ /* Fill out a communications ring structure. */
ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); id = blkif_ring_get_request(rinfo, req, &ring_req);
id = get_id_from_freelist(info);
info->shadow[id].request = req;
ring_req->operation = BLKIF_OP_DISCARD; ring_req->operation = BLKIF_OP_DISCARD;
ring_req->u.discard.nr_sectors = blk_rq_sectors(req); ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
...@@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req) ...@@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req)
else else
ring_req->u.discard.flag = 0; ring_req->u.discard.flag = 0;
info->ring.req_prod_pvt++;
/* Keep a private copy so we can reissue requests when recovering. */ /* Keep a private copy so we can reissue requests when recovering. */
info->shadow[id].req = *ring_req; rinfo->shadow[id].req = *ring_req;
return 0; return 0;
} }
...@@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req) ...@@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req)
struct setup_rw_req { struct setup_rw_req {
unsigned int grant_idx; unsigned int grant_idx;
struct blkif_request_segment *segments; struct blkif_request_segment *segments;
struct blkfront_info *info; struct blkfront_ring_info *rinfo;
struct blkif_request *ring_req; struct blkif_request *ring_req;
grant_ref_t gref_head; grant_ref_t gref_head;
unsigned int id; unsigned int id;
...@@ -495,6 +564,9 @@ struct setup_rw_req { ...@@ -495,6 +564,9 @@ struct setup_rw_req {
bool need_copy; bool need_copy;
unsigned int bvec_off; unsigned int bvec_off;
char *bvec_data; char *bvec_data;
bool require_extra_req;
struct blkif_request *extra_ring_req;
}; };
static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
...@@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, ...@@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
/* Convenient aliases */ /* Convenient aliases */
unsigned int grant_idx = setup->grant_idx; unsigned int grant_idx = setup->grant_idx;
struct blkif_request *ring_req = setup->ring_req; struct blkif_request *ring_req = setup->ring_req;
struct blkfront_info *info = setup->info; struct blkfront_ring_info *rinfo = setup->rinfo;
struct blk_shadow *shadow = &info->shadow[setup->id]; /*
* We always use the shadow of the first request to store the list
* of grants associated with the block I/O request. This makes the
* completion easier to handle even if the block I/O request is
* split.
*/
struct blk_shadow *shadow = &rinfo->shadow[setup->id];
if (unlikely(setup->require_extra_req &&
grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
/*
* We are using the second request; set up grant_idx
* to be the index into its segment array.
*/
grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
ring_req = setup->extra_ring_req;
}
if ((ring_req->operation == BLKIF_OP_INDIRECT) && if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
(grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
...@@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, ...@@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
kunmap_atomic(setup->segments); kunmap_atomic(setup->segments);
n = grant_idx / GRANTS_PER_INDIRECT_FRAME; n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
gnt_list_entry = get_indirect_grant(&setup->gref_head, info); gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
shadow->indirect_grants[n] = gnt_list_entry; shadow->indirect_grants[n] = gnt_list_entry;
setup->segments = kmap_atomic(gnt_list_entry->page); setup->segments = kmap_atomic(gnt_list_entry->page);
ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
} }
gnt_list_entry = get_grant(&setup->gref_head, gfn, info); gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
ref = gnt_list_entry->gref; ref = gnt_list_entry->gref;
shadow->grants_used[grant_idx] = gnt_list_entry; /*
* All the grants are stored in the shadow of the first
* request. Therefore we have to use the global index.
*/
shadow->grants_used[setup->grant_idx] = gnt_list_entry;
if (setup->need_copy) { if (setup->need_copy) {
void *shared_data; void *shared_data;
...@@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, ...@@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
(setup->grant_idx)++; (setup->grant_idx)++;
} }
static int blkif_queue_rw_req(struct request *req) static void blkif_setup_extra_req(struct blkif_request *first,
struct blkif_request *second)
{ {
struct blkfront_info *info = req->rq_disk->private_data; uint16_t nr_segments = first->u.rw.nr_segments;
struct blkif_request *ring_req;
unsigned long id; /*
* The second request is only present when the first request uses
* all its segments. It is always a continuation of the first one.
*/
first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
second->u.rw.sector_number = first->u.rw.sector_number +
(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
second->u.rw.handle = first->u.rw.handle;
second->operation = first->operation;
}
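/*
* A worked example, assuming the usual BLKIF_MAX_SEGMENTS_PER_REQUEST of 11
* and XEN_PAGE_SIZE of 4096: a block I/O request carrying 16 grants is split
* so that the first ring request keeps 11 segments and the second carries the
* remaining 5, starting 11 * 4096 / 512 = 88 sectors after the first.
*/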
static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
{
struct blkfront_info *info = rinfo->dev_info;
struct blkif_request *ring_req, *extra_ring_req = NULL;
unsigned long id, extra_id = NO_ASSOCIATED_ID;
bool require_extra_req = false;
int i; int i;
struct setup_rw_req setup = { struct setup_rw_req setup = {
.grant_idx = 0, .grant_idx = 0,
.segments = NULL, .segments = NULL,
.info = info, .rinfo = rinfo,
.need_copy = rq_data_dir(req) && info->feature_persistent, .need_copy = rq_data_dir(req) && info->feature_persistent,
}; };
...@@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req) ...@@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req)
* existing persistent grants, or if we have to get new grants, * existing persistent grants, or if we have to get new grants,
* as there are not sufficiently many free. * as there are not sufficiently many free.
*/ */
bool new_persistent_gnts;
struct scatterlist *sg; struct scatterlist *sg;
int num_sg, max_grefs, num_grant; int num_sg, max_grefs, num_grant;
...@@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req) ...@@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req)
*/ */
max_grefs += INDIRECT_GREFS(max_grefs); max_grefs += INDIRECT_GREFS(max_grefs);
/* Check if we have enough grants to allocate a request */ /*
if (info->persistent_gnts_c < max_grefs) { * We have to reserve 'max_grefs' grants because persistent
new_persistent_gnts = 1; * grants are shared by all rings.
if (gnttab_alloc_grant_references( */
max_grefs - info->persistent_gnts_c, if (max_grefs > 0)
&setup.gref_head) < 0) { if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
gnttab_request_free_callback( gnttab_request_free_callback(
&info->callback, &rinfo->callback,
blkif_restart_queue_callback, blkif_restart_queue_callback,
info, rinfo,
max_grefs); max_grefs);
return 1; return 1;
} }
} else
new_persistent_gnts = 0;
/* Fill out a communications ring structure. */ /* Fill out a communications ring structure. */
ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); id = blkif_ring_get_request(rinfo, req, &ring_req);
id = get_id_from_freelist(info);
info->shadow[id].request = req;
BUG_ON(info->max_indirect_segments == 0 &&
GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
BUG_ON(info->max_indirect_segments &&
GREFS(req->nr_phys_segments) > info->max_indirect_segments);
num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
num_grant = 0; num_grant = 0;
/* Calculate the number of grants used */ /* Calculate the number of grants used */
for_each_sg(info->shadow[id].sg, sg, num_sg, i) for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
num_grant += gnttab_count_grant(sg->offset, sg->length); num_grant += gnttab_count_grant(sg->offset, sg->length);
ring_req->u.rw.id = id; require_extra_req = info->max_indirect_segments == 0 &&
info->shadow[id].num_sg = num_sg; num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
rinfo->shadow[id].num_sg = num_sg;
if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
likely(!require_extra_req)) {
/* /*
* The indirect operation can only be a BLKIF_OP_READ or * The indirect operation can only be a BLKIF_OP_READ or
* BLKIF_OP_WRITE * BLKIF_OP_WRITE
...@@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req) ...@@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req)
} }
} }
ring_req->u.rw.nr_segments = num_grant; ring_req->u.rw.nr_segments = num_grant;
if (unlikely(require_extra_req)) {
extra_id = blkif_ring_get_request(rinfo, req,
&extra_ring_req);
/*
* Only the first request contains the scatter-gather
* list.
*/
rinfo->shadow[extra_id].num_sg = 0;
blkif_setup_extra_req(ring_req, extra_ring_req);
/* Link the 2 requests together */
rinfo->shadow[extra_id].associated_id = id;
rinfo->shadow[id].associated_id = extra_id;
}
} }
setup.ring_req = ring_req; setup.ring_req = ring_req;
setup.id = id; setup.id = id;
for_each_sg(info->shadow[id].sg, sg, num_sg, i) {
setup.require_extra_req = require_extra_req;
if (unlikely(require_extra_req))
setup.extra_ring_req = extra_ring_req;
for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
BUG_ON(sg->offset + sg->length > PAGE_SIZE); BUG_ON(sg->offset + sg->length > PAGE_SIZE);
if (setup.need_copy) { if (setup.need_copy) {
...@@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req) ...@@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req)
if (setup.segments) if (setup.segments)
kunmap_atomic(setup.segments); kunmap_atomic(setup.segments);
info->ring.req_prod_pvt++;
/* Keep a private copy so we can reissue requests when recovering. */ /* Keep a private copy so we can reissue requests when recovering. */
info->shadow[id].req = *ring_req; rinfo->shadow[id].req = *ring_req;
if (unlikely(require_extra_req))
rinfo->shadow[extra_id].req = *extra_ring_req;
if (new_persistent_gnts) if (max_grefs > 0)
gnttab_free_grant_references(setup.gref_head); gnttab_free_grant_references(setup.gref_head);
return 0; return 0;
...@@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req) ...@@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req)
* *
* @req: a request struct * @req: a request struct
*/ */
static int blkif_queue_request(struct request *req) static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
{ {
struct blkfront_info *info = req->rq_disk->private_data; if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
return 1; return 1;
if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
return blkif_queue_discard_req(req); return blkif_queue_discard_req(req, rinfo);
else else
return blkif_queue_rw_req(req); return blkif_queue_rw_req(req, rinfo);
} }
static inline void flush_requests(struct blkfront_info *info) static inline void flush_requests(struct blkfront_ring_info *rinfo)
{ {
int notify; int notify;
RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
if (notify) if (notify)
notify_remote_via_irq(info->irq); notify_remote_via_irq(rinfo->irq);
} }
static inline bool blkif_request_flush_invalid(struct request *req, static inline bool blkif_request_flush_invalid(struct request *req,
...@@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req, ...@@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req,
} }
static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *qd) const struct blk_mq_queue_data *qd)
{ {
struct blkfront_info *info = qd->rq->rq_disk->private_data; unsigned long flags;
struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
blk_mq_start_request(qd->rq); blk_mq_start_request(qd->rq);
spin_lock_irq(&info->io_lock); spin_lock_irqsave(&rinfo->ring_lock, flags);
if (RING_FULL(&info->ring)) if (RING_FULL(&rinfo->ring))
goto out_busy; goto out_busy;
if (blkif_request_flush_invalid(qd->rq, info)) if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
goto out_err; goto out_err;
if (blkif_queue_request(qd->rq)) if (blkif_queue_request(qd->rq, rinfo))
goto out_busy; goto out_busy;
flush_requests(info); flush_requests(rinfo);
spin_unlock_irq(&info->io_lock); spin_unlock_irqrestore(&rinfo->ring_lock, flags);
return BLK_MQ_RQ_QUEUE_OK; return BLK_MQ_RQ_QUEUE_OK;
out_err: out_err:
spin_unlock_irq(&info->io_lock); spin_unlock_irqrestore(&rinfo->ring_lock, flags);
return BLK_MQ_RQ_QUEUE_ERROR; return BLK_MQ_RQ_QUEUE_ERROR;
out_busy: out_busy:
spin_unlock_irq(&info->io_lock); spin_unlock_irqrestore(&rinfo->ring_lock, flags);
blk_mq_stop_hw_queue(hctx); blk_mq_stop_hw_queue(hctx);
return BLK_MQ_RQ_QUEUE_BUSY; return BLK_MQ_RQ_QUEUE_BUSY;
} }
static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int index)
{
struct blkfront_info *info = (struct blkfront_info *)data;
BUG_ON(info->nr_rings <= index);
hctx->driver_data = &info->rinfo[index];
return 0;
}
static struct blk_mq_ops blkfront_mq_ops = { static struct blk_mq_ops blkfront_mq_ops = {
.queue_rq = blkif_queue_rq, .queue_rq = blkif_queue_rq,
.map_queue = blk_mq_map_queue, .map_queue = blk_mq_map_queue,
.init_hctx = blk_mq_init_hctx,
}; };
static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
...@@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, ...@@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
memset(&info->tag_set, 0, sizeof(info->tag_set)); memset(&info->tag_set, 0, sizeof(info->tag_set));
info->tag_set.ops = &blkfront_mq_ops; info->tag_set.ops = &blkfront_mq_ops;
info->tag_set.nr_hw_queues = 1; info->tag_set.nr_hw_queues = info->nr_rings;
info->tag_set.queue_depth = BLK_RING_SIZE(info); if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
/*
* When indirect descriptors are not supported, the I/O request
* will be split across multiple requests in the ring.
* To avoid problems when sending the request, halve the
* depth of the queue.
*/
info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
} else
info->tag_set.queue_depth = BLK_RING_SIZE(info);
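/*
* For instance, with a single 4 KiB ring page BLK_RING_SIZE(info) is 32
* entries, so the queue depth drops from 32 to 16 when requests may be split.
*/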
info->tag_set.numa_node = NUMA_NO_NODE; info->tag_set.numa_node = NUMA_NO_NODE;
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
info->tag_set.cmd_size = 0; info->tag_set.cmd_size = 0;
info->tag_set.driver_data = info; info->tag_set.driver_data = info;
if (blk_mq_alloc_tag_set(&info->tag_set)) if (blk_mq_alloc_tag_set(&info->tag_set))
return -1; return -EINVAL;
rq = blk_mq_init_queue(&info->tag_set); rq = blk_mq_init_queue(&info->tag_set);
if (IS_ERR(rq)) { if (IS_ERR(rq)) {
blk_mq_free_tag_set(&info->tag_set); blk_mq_free_tag_set(&info->tag_set);
return -1; return PTR_ERR(rq);
} }
queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
...@@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, ...@@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
static void xlvbd_release_gendisk(struct blkfront_info *info) static void xlvbd_release_gendisk(struct blkfront_info *info)
{ {
unsigned int minor, nr_minors; unsigned int minor, nr_minors, i;
if (info->rq == NULL) if (info->rq == NULL)
return; return;
...@@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) ...@@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
/* No more blkif_request(). */ /* No more blkif_request(). */
blk_mq_stop_hw_queues(info->rq); blk_mq_stop_hw_queues(info->rq);
/* No more gnttab callback work. */ for (i = 0; i < info->nr_rings; i++) {
gnttab_cancel_free_callback(&info->callback); struct blkfront_ring_info *rinfo = &info->rinfo[i];
/* Flush gnttab callback work. Must be done with no locks held. */ /* No more gnttab callback work. */
flush_work(&info->work); gnttab_cancel_free_callback(&rinfo->callback);
/* Flush gnttab callback work. Must be done with no locks held. */
flush_work(&rinfo->work);
}
del_gendisk(info->gd); del_gendisk(info->gd);
...@@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) ...@@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
info->gd = NULL; info->gd = NULL;
} }
/* Must be called with io_lock held */ /* Caller must already hold rinfo->ring_lock. */
static void kick_pending_request_queues(struct blkfront_info *info) static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
{ {
if (!RING_FULL(&info->ring)) if (!RING_FULL(&rinfo->ring))
blk_mq_start_stopped_hw_queues(info->rq, true); blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
} }
static void blkif_restart_queue(struct work_struct *work) static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
{ {
struct blkfront_info *info = container_of(work, struct blkfront_info, work); unsigned long flags;
spin_lock_irq(&info->io_lock); spin_lock_irqsave(&rinfo->ring_lock, flags);
if (info->connected == BLKIF_STATE_CONNECTED) kick_pending_request_queues_locked(rinfo);
kick_pending_request_queues(info); spin_unlock_irqrestore(&rinfo->ring_lock, flags);
spin_unlock_irq(&info->io_lock);
} }
static void blkif_free(struct blkfront_info *info, int suspend) static void blkif_restart_queue(struct work_struct *work)
{ {
struct grant *persistent_gnt; struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
struct grant *n;
int i, j, segs;
/* Prevent new requests being issued until we fix things up. */ if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
spin_lock_irq(&info->io_lock); kick_pending_request_queues(rinfo);
info->connected = suspend ? }
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
/* No more blkif_request(). */
if (info->rq)
blk_mq_stop_hw_queues(info->rq);
/* Remove all persistent grants */ static void blkif_free_ring(struct blkfront_ring_info *rinfo)
if (!list_empty(&info->grants)) { {
list_for_each_entry_safe(persistent_gnt, n, struct grant *persistent_gnt, *n;
&info->grants, node) { struct blkfront_info *info = rinfo->dev_info;
list_del(&persistent_gnt->node); int i, j, segs;
if (persistent_gnt->gref != GRANT_INVALID_REF) {
gnttab_end_foreign_access(persistent_gnt->gref,
0, 0UL);
info->persistent_gnts_c--;
}
if (info->feature_persistent)
__free_page(persistent_gnt->page);
kfree(persistent_gnt);
}
}
BUG_ON(info->persistent_gnts_c != 0);
/* /*
* Remove indirect pages, this only happens when using indirect * Remove indirect pages, this only happens when using indirect
* descriptors but not persistent grants * descriptors but not persistent grants
*/ */
if (!list_empty(&info->indirect_pages)) { if (!list_empty(&rinfo->indirect_pages)) {
struct page *indirect_page, *n; struct page *indirect_page, *n;
BUG_ON(info->feature_persistent); BUG_ON(info->feature_persistent);
list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
list_del(&indirect_page->lru); list_del(&indirect_page->lru);
__free_page(indirect_page); __free_page(indirect_page);
} }
} }
/* Remove all persistent grants. */
if (!list_empty(&rinfo->grants)) {
list_for_each_entry_safe(persistent_gnt, n,
&rinfo->grants, node) {
list_del(&persistent_gnt->node);
if (persistent_gnt->gref != GRANT_INVALID_REF) {
gnttab_end_foreign_access(persistent_gnt->gref,
0, 0UL);
rinfo->persistent_gnts_c--;
}
if (info->feature_persistent)
__free_page(persistent_gnt->page);
kfree(persistent_gnt);
}
}
BUG_ON(rinfo->persistent_gnts_c != 0);
for (i = 0; i < BLK_RING_SIZE(info); i++) { for (i = 0; i < BLK_RING_SIZE(info); i++) {
/* /*
* Clear persistent grants present in requests already * Clear persistent grants present in requests already
* on the shared ring * on the shared ring
*/ */
if (!info->shadow[i].request) if (!rinfo->shadow[i].request)
goto free_shadow; goto free_shadow;
segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
info->shadow[i].req.u.indirect.nr_segments : rinfo->shadow[i].req.u.indirect.nr_segments :
info->shadow[i].req.u.rw.nr_segments; rinfo->shadow[i].req.u.rw.nr_segments;
for (j = 0; j < segs; j++) { for (j = 0; j < segs; j++) {
persistent_gnt = info->shadow[i].grants_used[j]; persistent_gnt = rinfo->shadow[i].grants_used[j];
gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
if (info->feature_persistent) if (info->feature_persistent)
__free_page(persistent_gnt->page); __free_page(persistent_gnt->page);
kfree(persistent_gnt); kfree(persistent_gnt);
} }
if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
/* /*
* If this is not an indirect operation don't try to * If this is not an indirect operation don't try to
* free indirect segments * free indirect segments
...@@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend) ...@@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend)
goto free_shadow; goto free_shadow;
for (j = 0; j < INDIRECT_GREFS(segs); j++) { for (j = 0; j < INDIRECT_GREFS(segs); j++) {
persistent_gnt = info->shadow[i].indirect_grants[j]; persistent_gnt = rinfo->shadow[i].indirect_grants[j];
gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
__free_page(persistent_gnt->page); __free_page(persistent_gnt->page);
kfree(persistent_gnt); kfree(persistent_gnt);
} }
free_shadow: free_shadow:
kfree(info->shadow[i].grants_used); kfree(rinfo->shadow[i].grants_used);
info->shadow[i].grants_used = NULL; rinfo->shadow[i].grants_used = NULL;
kfree(info->shadow[i].indirect_grants); kfree(rinfo->shadow[i].indirect_grants);
info->shadow[i].indirect_grants = NULL; rinfo->shadow[i].indirect_grants = NULL;
kfree(info->shadow[i].sg); kfree(rinfo->shadow[i].sg);
info->shadow[i].sg = NULL; rinfo->shadow[i].sg = NULL;
} }
/* No more gnttab callback work. */ /* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback); gnttab_cancel_free_callback(&rinfo->callback);
spin_unlock_irq(&info->io_lock);
/* Flush gnttab callback work. Must be done with no locks held. */ /* Flush gnttab callback work. Must be done with no locks held. */
flush_work(&info->work); flush_work(&rinfo->work);
/* Free resources associated with old device channel. */ /* Free resources associated with old device channel. */
for (i = 0; i < info->nr_ring_pages; i++) { for (i = 0; i < info->nr_ring_pages; i++) {
if (info->ring_ref[i] != GRANT_INVALID_REF) { if (rinfo->ring_ref[i] != GRANT_INVALID_REF) {
gnttab_end_foreign_access(info->ring_ref[i], 0, 0); gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0);
info->ring_ref[i] = GRANT_INVALID_REF; rinfo->ring_ref[i] = GRANT_INVALID_REF;
} }
} }
free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
info->ring.sring = NULL; rinfo->ring.sring = NULL;
if (info->irq) if (rinfo->irq)
unbind_from_irqhandler(info->irq, info); unbind_from_irqhandler(rinfo->irq, rinfo);
info->evtchn = info->irq = 0; rinfo->evtchn = rinfo->irq = 0;
}
static void blkif_free(struct blkfront_info *info, int suspend)
{
unsigned int i;
/* Prevent new requests being issued until we fix things up. */
info->connected = suspend ?
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
/* No more blkif_request(). */
if (info->rq)
blk_mq_stop_hw_queues(info->rq);
for (i = 0; i < info->nr_rings; i++)
blkif_free_ring(&info->rinfo[i]);
kfree(info->rinfo);
info->rinfo = NULL;
info->nr_rings = 0;
} }
struct copy_from_grant { struct copy_from_grant {
...@@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, ...@@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
kunmap_atomic(shared_data); kunmap_atomic(shared_data);
} }
static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, static enum blk_req_status blkif_rsp_to_req_status(int rsp)
{
switch (rsp)
{
case BLKIF_RSP_OKAY:
return REQ_DONE;
case BLKIF_RSP_EOPNOTSUPP:
return REQ_EOPNOTSUPP;
case BLKIF_RSP_ERROR:
/* Fallthrough. */
default:
return REQ_ERROR;
}
}
/*
* Get the final status of the block request based on the two ring responses.
*/
static int blkif_get_final_status(enum blk_req_status s1,
enum blk_req_status s2)
{
BUG_ON(s1 == REQ_WAITING);
BUG_ON(s2 == REQ_WAITING);
if (s1 == REQ_ERROR || s2 == REQ_ERROR)
return BLKIF_RSP_ERROR;
else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
return BLKIF_RSP_EOPNOTSUPP;
return BLKIF_RSP_OKAY;
}
static bool blkif_completion(unsigned long *id,
struct blkfront_ring_info *rinfo,
struct blkif_response *bret) struct blkif_response *bret)
{ {
int i = 0; int i = 0;
struct scatterlist *sg; struct scatterlist *sg;
int num_sg, num_grant; int num_sg, num_grant;
struct blkfront_info *info = rinfo->dev_info;
struct blk_shadow *s = &rinfo->shadow[*id];
struct copy_from_grant data = { struct copy_from_grant data = {
.s = s,
.grant_idx = 0, .grant_idx = 0,
}; };
num_grant = s->req.operation == BLKIF_OP_INDIRECT ? num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
/* The I/O request may be split in two. */
if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
/* Keep the status of the current response in shadow. */
s->status = blkif_rsp_to_req_status(bret->status);
/* Wait for the second response if it is not here yet. */
if (s2->status == REQ_WAITING)
return 0;
bret->status = blkif_get_final_status(s->status,
s2->status);
/*
* All the grants are stored in the first shadow in order
* to make the completion code simpler.
*/
num_grant += s2->req.u.rw.nr_segments;
/*
* The two responses may not come in order. Only the
* first request will store the scatter-gather list.
*/
if (s2->num_sg != 0) {
/* Update "id" with the ID of the first response. */
*id = s->associated_id;
s = s2;
}
/*
* We don't need the second request anymore, so recycle
* it now.
*/
if (add_id_to_freelist(rinfo, s->associated_id))
WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
info->gd->disk_name, s->associated_id);
}
data.s = s;
num_sg = s->num_sg; num_sg = s->num_sg;
if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
...@@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, ...@@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
if (!info->feature_persistent) if (!info->feature_persistent)
pr_alert_ratelimited("backed has not unmapped grant: %u\n", pr_alert_ratelimited("backed has not unmapped grant: %u\n",
s->grants_used[i]->gref); s->grants_used[i]->gref);
list_add(&s->grants_used[i]->node, &info->grants); list_add(&s->grants_used[i]->node, &rinfo->grants);
info->persistent_gnts_c++; rinfo->persistent_gnts_c++;
} else { } else {
/* /*
* If the grant is not mapped by the backend we end the * If the grant is not mapped by the backend we end the
...@@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, ...@@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
*/ */
gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
s->grants_used[i]->gref = GRANT_INVALID_REF; s->grants_used[i]->gref = GRANT_INVALID_REF;
list_add_tail(&s->grants_used[i]->node, &info->grants); list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
} }
} }
if (s->req.operation == BLKIF_OP_INDIRECT) { if (s->req.operation == BLKIF_OP_INDIRECT) {
...@@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, ...@@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
if (!info->feature_persistent) if (!info->feature_persistent)
pr_alert_ratelimited("backed has not unmapped grant: %u\n", pr_alert_ratelimited("backed has not unmapped grant: %u\n",
s->indirect_grants[i]->gref); s->indirect_grants[i]->gref);
list_add(&s->indirect_grants[i]->node, &info->grants); list_add(&s->indirect_grants[i]->node, &rinfo->grants);
info->persistent_gnts_c++; rinfo->persistent_gnts_c++;
} else { } else {
struct page *indirect_page; struct page *indirect_page;
...@@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, ...@@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
*/ */
if (!info->feature_persistent) { if (!info->feature_persistent) {
indirect_page = s->indirect_grants[i]->page; indirect_page = s->indirect_grants[i]->page;
list_add(&indirect_page->lru, &info->indirect_pages); list_add(&indirect_page->lru, &rinfo->indirect_pages);
} }
s->indirect_grants[i]->gref = GRANT_INVALID_REF; s->indirect_grants[i]->gref = GRANT_INVALID_REF;
list_add_tail(&s->indirect_grants[i]->node, &info->grants); list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
} }
} }
} }
return 1;
} }
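/*
* In short: for a split request blkif_completion() runs once per ring
* response. The first call only records its status and returns 0; the second
* call merges both statuses, recycles the extra shadow slot, and returns 1 so
* that blkif_interrupt() completes the original block layer request.
*/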
static irqreturn_t blkif_interrupt(int irq, void *dev_id) static irqreturn_t blkif_interrupt(int irq, void *dev_id)
...@@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) ...@@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
struct blkif_response *bret; struct blkif_response *bret;
RING_IDX i, rp; RING_IDX i, rp;
unsigned long flags; unsigned long flags;
struct blkfront_info *info = (struct blkfront_info *)dev_id; struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
struct blkfront_info *info = rinfo->dev_info;
int error; int error;
spin_lock_irqsave(&info->io_lock, flags); if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
spin_unlock_irqrestore(&info->io_lock, flags);
return IRQ_HANDLED; return IRQ_HANDLED;
}
spin_lock_irqsave(&rinfo->ring_lock, flags);
again: again:
rp = info->ring.sring->rsp_prod; rp = rinfo->ring.sring->rsp_prod;
rmb(); /* Ensure we see queued responses up to 'rp'. */ rmb(); /* Ensure we see queued responses up to 'rp'. */
for (i = info->ring.rsp_cons; i != rp; i++) { for (i = rinfo->ring.rsp_cons; i != rp; i++) {
unsigned long id; unsigned long id;
bret = RING_GET_RESPONSE(&info->ring, i); bret = RING_GET_RESPONSE(&rinfo->ring, i);
id = bret->id; id = bret->id;
/* /*
* The backend has messed up and given us an id that we would * The backend has messed up and given us an id that we would
...@@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) ...@@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
* the id is busted. */ * the id is busted. */
continue; continue;
} }
req = info->shadow[id].request; req = rinfo->shadow[id].request;
if (bret->operation != BLKIF_OP_DISCARD) if (bret->operation != BLKIF_OP_DISCARD) {
blkif_completion(&info->shadow[id], info, bret); /*
* We may need to wait for an extra response if the
* I/O request is split in two.
*/
if (!blkif_completion(&id, rinfo, bret))
continue;
}
if (add_id_to_freelist(info, id)) { if (add_id_to_freelist(rinfo, id)) {
WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
info->gd->disk_name, op_name(bret->operation), id); info->gd->disk_name, op_name(bret->operation), id);
continue; continue;
...@@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) ...@@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
error = -EOPNOTSUPP; error = -EOPNOTSUPP;
} }
if (unlikely(bret->status == BLKIF_RSP_ERROR && if (unlikely(bret->status == BLKIF_RSP_ERROR &&
info->shadow[id].req.u.rw.nr_segments == 0)) { rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
info->gd->disk_name, op_name(bret->operation)); info->gd->disk_name, op_name(bret->operation));
error = -EOPNOTSUPP; error = -EOPNOTSUPP;
...@@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) ...@@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
} }
} }
info->ring.rsp_cons = i; rinfo->ring.rsp_cons = i;
if (i != info->ring.req_prod_pvt) { if (i != rinfo->ring.req_prod_pvt) {
int more_to_do; int more_to_do;
RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
if (more_to_do) if (more_to_do)
goto again; goto again;
} else } else
info->ring.sring->rsp_event = i + 1; rinfo->ring.sring->rsp_event = i + 1;
kick_pending_request_queues(info); kick_pending_request_queues_locked(rinfo);
spin_unlock_irqrestore(&info->io_lock, flags); spin_unlock_irqrestore(&rinfo->ring_lock, flags);
return IRQ_HANDLED; return IRQ_HANDLED;
} }
static int setup_blkring(struct xenbus_device *dev, static int setup_blkring(struct xenbus_device *dev,
struct blkfront_info *info) struct blkfront_ring_info *rinfo)
{ {
struct blkif_sring *sring; struct blkif_sring *sring;
int err, i; int err, i;
struct blkfront_info *info = rinfo->dev_info;
unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
for (i = 0; i < info->nr_ring_pages; i++) for (i = 0; i < info->nr_ring_pages; i++)
info->ring_ref[i] = GRANT_INVALID_REF; rinfo->ring_ref[i] = GRANT_INVALID_REF;
sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
get_order(ring_size)); get_order(ring_size));
...@@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev, ...@@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev,
return -ENOMEM; return -ENOMEM;
} }
SHARED_RING_INIT(sring); SHARED_RING_INIT(sring);
FRONT_RING_INIT(&info->ring, sring, ring_size); FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref);
if (err < 0) { if (err < 0) {
free_pages((unsigned long)sring, get_order(ring_size)); free_pages((unsigned long)sring, get_order(ring_size));
info->ring.sring = NULL; rinfo->ring.sring = NULL;
goto fail; goto fail;
} }
for (i = 0; i < info->nr_ring_pages; i++) for (i = 0; i < info->nr_ring_pages; i++)
info->ring_ref[i] = gref[i]; rinfo->ring_ref[i] = gref[i];
err = xenbus_alloc_evtchn(dev, &info->evtchn); err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
if (err) if (err)
goto fail; goto fail;
err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0,
"blkif", info); "blkif", rinfo);
if (err <= 0) { if (err <= 0) {
xenbus_dev_fatal(dev, err, xenbus_dev_fatal(dev, err,
"bind_evtchn_to_irqhandler failed"); "bind_evtchn_to_irqhandler failed");
goto fail; goto fail;
} }
info->irq = err; rinfo->irq = err;
return 0; return 0;
fail: fail:
...@@ -1455,6 +1701,53 @@ static int setup_blkring(struct xenbus_device *dev, ...@@ -1455,6 +1701,53 @@ static int setup_blkring(struct xenbus_device *dev,
return err; return err;
} }
/*
* Write out the per-ring/queue nodes, including ring-ref and event-channel;
* each ring buffer may span multiple pages depending on ->nr_ring_pages.
*/
static int write_per_ring_nodes(struct xenbus_transaction xbt,
struct blkfront_ring_info *rinfo, const char *dir)
{
int err;
unsigned int i;
const char *message = NULL;
struct blkfront_info *info = rinfo->dev_info;
if (info->nr_ring_pages == 1) {
err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
if (err) {
message = "writing ring-ref";
goto abort_transaction;
}
} else {
for (i = 0; i < info->nr_ring_pages; i++) {
char ring_ref_name[RINGREF_NAME_LEN];
snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
err = xenbus_printf(xbt, dir, ring_ref_name,
"%u", rinfo->ring_ref[i]);
if (err) {
message = "writing ring-ref";
goto abort_transaction;
}
}
}
err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
if (err) {
message = "writing event-channel";
goto abort_transaction;
}
return 0;
abort_transaction:
xenbus_transaction_end(xbt, 1);
if (message)
xenbus_dev_fatal(info->xbdev, err, "%s", message);
return err;
}
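/*
* A sketch of the resulting xenstore layout for a hypothetical frontend node
* "device/vbd/51712" with two queues and one ring page per queue, as driven
* by talk_to_blkback() below:
*
*	device/vbd/51712/multi-queue-num-queues = "2"
*	device/vbd/51712/queue-0/ring-ref       = "<gref>"
*	device/vbd/51712/queue-0/event-channel  = "<port>"
*	device/vbd/51712/queue-1/ring-ref       = "<gref>"
*	device/vbd/51712/queue-1/event-channel  = "<port>"
*
* With nr_ring_pages > 1 the single "ring-ref" node is replaced by
* "ring-ref0" ... "ring-refN".
*/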
/* Common code used when first setting up, and when resuming. */ /* Common code used when first setting up, and when resuming. */
static int talk_to_blkback(struct xenbus_device *dev, static int talk_to_blkback(struct xenbus_device *dev,
...@@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev, ...@@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev,
{ {
const char *message = NULL; const char *message = NULL;
struct xenbus_transaction xbt; struct xenbus_transaction xbt;
int err, i; int err;
unsigned int max_page_order = 0; unsigned int i, max_page_order = 0;
unsigned int ring_page_order = 0; unsigned int ring_page_order = 0;
err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
...@@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev, ...@@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
info->nr_ring_pages = 1 << ring_page_order; info->nr_ring_pages = 1 << ring_page_order;
} }
/* Create shared ring, alloc event channel. */ for (i = 0; i < info->nr_rings; i++) {
err = setup_blkring(dev, info); struct blkfront_ring_info *rinfo = &info->rinfo[i];
if (err)
goto out; /* Create shared ring, alloc event channel. */
err = setup_blkring(dev, rinfo);
if (err)
goto destroy_blkring;
}
again: again:
err = xenbus_transaction_start(&xbt); err = xenbus_transaction_start(&xbt);
...@@ -1487,38 +1784,49 @@ static int talk_to_blkback(struct xenbus_device *dev, ...@@ -1487,38 +1784,49 @@ static int talk_to_blkback(struct xenbus_device *dev,
goto destroy_blkring; goto destroy_blkring;
} }
if (info->nr_ring_pages == 1) { if (info->nr_ring_pages > 1) {
err = xenbus_printf(xbt, dev->nodename, err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
"ring-ref", "%u", info->ring_ref[0]); ring_page_order);
if (err) { if (err) {
message = "writing ring-ref"; message = "writing ring-page-order";
goto abort_transaction; goto abort_transaction;
} }
}
/* We already got the number of queues/rings in _probe */
if (info->nr_rings == 1) {
err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename);
if (err)
goto destroy_blkring;
} else { } else {
err = xenbus_printf(xbt, dev->nodename, char *path;
"ring-page-order", "%u", ring_page_order); size_t pathsize;
err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
info->nr_rings);
if (err) { if (err) {
message = "writing ring-page-order"; message = "writing multi-queue-num-queues";
goto abort_transaction; goto abort_transaction;
} }
for (i = 0; i < info->nr_ring_pages; i++) { pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
char ring_ref_name[RINGREF_NAME_LEN]; path = kmalloc(pathsize, GFP_KERNEL);
if (!path) {
err = -ENOMEM;
message = "ENOMEM while writing ring references";
goto abort_transaction;
}
snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); for (i = 0; i < info->nr_rings; i++) {
err = xenbus_printf(xbt, dev->nodename, ring_ref_name, memset(path, 0, pathsize);
"%u", info->ring_ref[i]); snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
err = write_per_ring_nodes(xbt, &info->rinfo[i], path);
if (err) { if (err) {
message = "writing ring-ref"; kfree(path);
goto abort_transaction; goto destroy_blkring;
} }
} }
} kfree(path);
err = xenbus_printf(xbt, dev->nodename,
"event-channel", "%u", info->evtchn);
if (err) {
message = "writing event-channel";
goto abort_transaction;
} }
err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
XEN_IO_PROTO_ABI_NATIVE); XEN_IO_PROTO_ABI_NATIVE);
...@@ -1540,9 +1848,14 @@ static int talk_to_blkback(struct xenbus_device *dev, ...@@ -1540,9 +1848,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
goto destroy_blkring; goto destroy_blkring;
} }
for (i = 0; i < BLK_RING_SIZE(info); i++) for (i = 0; i < info->nr_rings; i++) {
info->shadow[i].req.u.rw.id = i+1; unsigned int j;
info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; struct blkfront_ring_info *rinfo = &info->rinfo[i];
for (j = 0; j < BLK_RING_SIZE(info); j++)
rinfo->shadow[j].req.u.rw.id = j + 1;
rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
}
xenbus_switch_state(dev, XenbusStateInitialised); xenbus_switch_state(dev, XenbusStateInitialised);
return 0; return 0;
...@@ -1553,7 +1866,10 @@ static int talk_to_blkback(struct xenbus_device *dev, ...@@ -1553,7 +1866,10 @@ static int talk_to_blkback(struct xenbus_device *dev,
xenbus_dev_fatal(dev, err, "%s", message); xenbus_dev_fatal(dev, err, "%s", message);
destroy_blkring: destroy_blkring:
blkif_free(info, 0); blkif_free(info, 0);
out:
kfree(info);
dev_set_drvdata(&dev->dev, NULL);
return err; return err;
} }
...@@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev, ...@@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev,
const struct xenbus_device_id *id) const struct xenbus_device_id *id)
{ {
int err, vdevice; int err, vdevice;
unsigned int r_index;
struct blkfront_info *info; struct blkfront_info *info;
unsigned int backend_max_queues = 0;
/* FIXME: Use dynamic device id if this is not set. */ /* FIXME: Use dynamic device id if this is not set. */
err = xenbus_scanf(XBT_NIL, dev->nodename, err = xenbus_scanf(XBT_NIL, dev->nodename,
...@@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev, ...@@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev,
return -ENOMEM; return -ENOMEM;
} }
mutex_init(&info->mutex);
spin_lock_init(&info->io_lock);
info->xbdev = dev; info->xbdev = dev;
/* Check if backend supports multiple queues. */
err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
"multi-queue-max-queues", "%u", &backend_max_queues);
if (err < 0)
backend_max_queues = 1;
info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
/* We need at least one ring. */
if (!info->nr_rings)
info->nr_rings = 1;
info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
if (!info->rinfo) {
xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
kfree(info);
return -ENOMEM;
}
for (r_index = 0; r_index < info->nr_rings; r_index++) {
struct blkfront_ring_info *rinfo;
rinfo = &info->rinfo[r_index];
INIT_LIST_HEAD(&rinfo->indirect_pages);
INIT_LIST_HEAD(&rinfo->grants);
rinfo->dev_info = info;
INIT_WORK(&rinfo->work, blkif_restart_queue);
spin_lock_init(&rinfo->ring_lock);
}
mutex_init(&info->mutex);
info->vdevice = vdevice; info->vdevice = vdevice;
INIT_LIST_HEAD(&info->grants);
INIT_LIST_HEAD(&info->indirect_pages);
info->persistent_gnts_c = 0;
info->connected = BLKIF_STATE_DISCONNECTED; info->connected = BLKIF_STATE_DISCONNECTED;
INIT_WORK(&info->work, blkif_restart_queue);
/* Front end dir is a number, which is used as the id. */ /* Front end dir is a number, which is used as the id. */
info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
...@@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio) ...@@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio)
static int blkif_recover(struct blkfront_info *info) static int blkif_recover(struct blkfront_info *info)
{ {
int i; unsigned int i, r_index;
struct request *req, *n; struct request *req, *n;
struct blk_shadow *copy; struct blk_shadow *copy;
int rc; int rc;
...@@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info) ...@@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info)
struct split_bio *split_bio; struct split_bio *split_bio;
struct list_head requests; struct list_head requests;
/* Stage 1: Make a safe copy of the shadow state. */ blkfront_gather_backend_features(info);
copy = kmemdup(info->shadow, sizeof(info->shadow),
GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
if (!copy)
return -ENOMEM;
/* Stage 2: Set up free list. */
memset(&info->shadow, 0, sizeof(info->shadow));
for (i = 0; i < BLK_RING_SIZE(info); i++)
info->shadow[i].req.u.rw.id = i+1;
info->shadow_free = info->ring.req_prod_pvt;
info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
rc = blkfront_gather_backend_features(info);
if (rc) {
kfree(copy);
return rc;
}
segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
blk_queue_max_segments(info->rq, segs); blk_queue_max_segments(info->rq, segs);
bio_list_init(&bio_list); bio_list_init(&bio_list);
INIT_LIST_HEAD(&requests); INIT_LIST_HEAD(&requests);
for (i = 0; i < BLK_RING_SIZE(info); i++) {
/* Not in use? */
if (!copy[i].request)
continue;
/* for (r_index = 0; r_index < info->nr_rings; r_index++) {
* Get the bios in the request so we can re-queue them. struct blkfront_ring_info *rinfo;
*/
if (copy[i].request->cmd_flags & rinfo = &info->rinfo[r_index];
(REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { /* Stage 1: Make a safe copy of the shadow state. */
copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
if (!copy)
return -ENOMEM;
/* Stage 2: Set up free list. */
memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
for (i = 0; i < BLK_RING_SIZE(info); i++)
rinfo->shadow[i].req.u.rw.id = i+1;
rinfo->shadow_free = rinfo->ring.req_prod_pvt;
rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
rc = blkfront_setup_indirect(rinfo);
if (rc) {
kfree(copy);
return rc;
}
for (i = 0; i < BLK_RING_SIZE(info); i++) {
/* Not in use? */
if (!copy[i].request)
continue;
/* /*
* Flush operations don't contain bios, so * Get the bios in the request so we can re-queue them.
* we need to requeue the whole request
*/ */
list_add(&copy[i].request->queuelist, &requests); if (copy[i].request->cmd_flags &
continue; (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
/*
* Flush operations don't contain bios, so
* we need to requeue the whole request
*/
list_add(&copy[i].request->queuelist, &requests);
continue;
}
merge_bio.head = copy[i].request->bio;
merge_bio.tail = copy[i].request->biotail;
bio_list_merge(&bio_list, &merge_bio);
copy[i].request->bio = NULL;
blk_end_request_all(copy[i].request, 0);
} }
merge_bio.head = copy[i].request->bio;
merge_bio.tail = copy[i].request->biotail;
bio_list_merge(&bio_list, &merge_bio);
copy[i].request->bio = NULL;
blk_end_request_all(copy[i].request, 0);
}
kfree(copy);
kfree(copy);
}
xenbus_switch_state(info->xbdev, XenbusStateConnected); xenbus_switch_state(info->xbdev, XenbusStateConnected);
spin_lock_irq(&info->io_lock);
/* Now safe for us to use the shared ring */ /* Now safe for us to use the shared ring */
info->connected = BLKIF_STATE_CONNECTED; info->connected = BLKIF_STATE_CONNECTED;
/* Kick any other new requests queued since we resumed */ for (r_index = 0; r_index < info->nr_rings; r_index++) {
kick_pending_request_queues(info); struct blkfront_ring_info *rinfo;
rinfo = &info->rinfo[r_index];
/* Kick any other new requests queued since we resumed */
kick_pending_request_queues(rinfo);
}
list_for_each_entry_safe(req, n, &requests, queuelist) { list_for_each_entry_safe(req, n, &requests, queuelist) {
/* Requeue pending requests (flush or discard) */ /* Requeue pending requests (flush or discard) */
...@@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info) ...@@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info)
BUG_ON(req->nr_phys_segments > segs); BUG_ON(req->nr_phys_segments > segs);
blk_mq_requeue_request(req); blk_mq_requeue_request(req);
} }
spin_unlock_irq(&info->io_lock);
blk_mq_kick_requeue_list(info->rq); blk_mq_kick_requeue_list(info->rq);
while ((bio = bio_list_pop(&bio_list)) != NULL) { while ((bio = bio_list_pop(&bio_list)) != NULL) {
...@@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev) ...@@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev)
return err; return err;
} }
static void static void blkfront_closing(struct blkfront_info *info)
blkfront_closing(struct blkfront_info *info)
{ {
struct xenbus_device *xbdev = info->xbdev; struct xenbus_device *xbdev = info->xbdev;
struct block_device *bdev = NULL; struct block_device *bdev = NULL;
...@@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info) ...@@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info)
info->feature_secdiscard = !!discard_secure; info->feature_secdiscard = !!discard_secure;
} }
static int blkfront_setup_indirect(struct blkfront_info *info) static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
{ {
unsigned int psegs, grants; unsigned int psegs, grants;
int err, i; int err, i;
struct blkfront_info *info = rinfo->dev_info;
if (info->max_indirect_segments == 0) if (info->max_indirect_segments == 0) {
grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; if (!HAS_EXTRA_REQ)
grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
else {
/*
* When an extra req is required, the maximum
* number of grants supported is limited by the size
* of a Linux block segment.
*/
grants = GRANTS_PER_PSEG;
}
}
else else
grants = info->max_indirect_segments; grants = info->max_indirect_segments;
psegs = grants / GRANTS_PER_PSEG; psegs = grants / GRANTS_PER_PSEG;
err = fill_grant_buffer(info, err = fill_grant_buffer(rinfo,
(grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
if (err) if (err)
goto out_of_memory; goto out_of_memory;
...@@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info) ...@@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
*/ */
int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
BUG_ON(!list_empty(&info->indirect_pages)); BUG_ON(!list_empty(&rinfo->indirect_pages));
for (i = 0; i < num; i++) { for (i = 0; i < num; i++) {
struct page *indirect_page = alloc_page(GFP_NOIO); struct page *indirect_page = alloc_page(GFP_NOIO);
if (!indirect_page) if (!indirect_page)
goto out_of_memory; goto out_of_memory;
list_add(&indirect_page->lru, &info->indirect_pages); list_add(&indirect_page->lru, &rinfo->indirect_pages);
} }
} }
for (i = 0; i < BLK_RING_SIZE(info); i++) { for (i = 0; i < BLK_RING_SIZE(info); i++) {
info->shadow[i].grants_used = kzalloc( rinfo->shadow[i].grants_used = kzalloc(
sizeof(info->shadow[i].grants_used[0]) * grants, sizeof(rinfo->shadow[i].grants_used[0]) * grants,
GFP_NOIO); GFP_NOIO);
info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO);
if (info->max_indirect_segments) if (info->max_indirect_segments)
info->shadow[i].indirect_grants = kzalloc( rinfo->shadow[i].indirect_grants = kzalloc(
sizeof(info->shadow[i].indirect_grants[0]) * sizeof(rinfo->shadow[i].indirect_grants[0]) *
INDIRECT_GREFS(grants), INDIRECT_GREFS(grants),
GFP_NOIO); GFP_NOIO);
if ((info->shadow[i].grants_used == NULL) || if ((rinfo->shadow[i].grants_used == NULL) ||
(info->shadow[i].sg == NULL) || (rinfo->shadow[i].sg == NULL) ||
(info->max_indirect_segments && (info->max_indirect_segments &&
(info->shadow[i].indirect_grants == NULL))) (rinfo->shadow[i].indirect_grants == NULL)))
goto out_of_memory; goto out_of_memory;
sg_init_table(info->shadow[i].sg, psegs); sg_init_table(rinfo->shadow[i].sg, psegs);
} }
...@@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info) ...@@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
out_of_memory: out_of_memory:
for (i = 0; i < BLK_RING_SIZE(info); i++) { for (i = 0; i < BLK_RING_SIZE(info); i++) {
kfree(info->shadow[i].grants_used); kfree(rinfo->shadow[i].grants_used);
info->shadow[i].grants_used = NULL; rinfo->shadow[i].grants_used = NULL;
kfree(info->shadow[i].sg); kfree(rinfo->shadow[i].sg);
info->shadow[i].sg = NULL; rinfo->shadow[i].sg = NULL;
kfree(info->shadow[i].indirect_grants); kfree(rinfo->shadow[i].indirect_grants);
info->shadow[i].indirect_grants = NULL; rinfo->shadow[i].indirect_grants = NULL;
} }
if (!list_empty(&info->indirect_pages)) { if (!list_empty(&rinfo->indirect_pages)) {
struct page *indirect_page, *n; struct page *indirect_page, *n;
list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
list_del(&indirect_page->lru); list_del(&indirect_page->lru);
__free_page(indirect_page); __free_page(indirect_page);
} }
...@@ -1927,7 +2287,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) ...@@ -1927,7 +2287,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
/* /*
* Gather all backend feature-* * Gather all backend feature-*
*/ */
static int blkfront_gather_backend_features(struct blkfront_info *info) static void blkfront_gather_backend_features(struct blkfront_info *info)
{ {
int err; int err;
int barrier, flush, discard, persistent; int barrier, flush, discard, persistent;
...@@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info) ...@@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info)
else else
info->max_indirect_segments = min(indirect_segments, info->max_indirect_segments = min(indirect_segments,
xen_blkif_max_segments); xen_blkif_max_segments);
return blkfront_setup_indirect(info);
} }
/* /*
...@@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info) ...@@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info)
unsigned long sector_size; unsigned long sector_size;
unsigned int physical_sector_size; unsigned int physical_sector_size;
unsigned int binfo; unsigned int binfo;
int err; int err, i;
switch (info->connected) { switch (info->connected) {
case BLKIF_STATE_CONNECTED: case BLKIF_STATE_CONNECTED:
...@@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info) ...@@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info)
if (err != 1) if (err != 1)
physical_sector_size = sector_size; physical_sector_size = sector_size;
err = blkfront_gather_backend_features(info); blkfront_gather_backend_features(info);
if (err) { for (i = 0; i < info->nr_rings; i++) {
xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", err = blkfront_setup_indirect(&info->rinfo[i]);
info->xbdev->otherend); if (err) {
return; xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
info->xbdev->otherend);
blkif_free(info, 0);
break;
}
} }
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
...@@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info) ...@@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info)
xenbus_switch_state(info->xbdev, XenbusStateConnected); xenbus_switch_state(info->xbdev, XenbusStateConnected);
/* Kick pending requests. */ /* Kick pending requests. */
spin_lock_irq(&info->io_lock);
info->connected = BLKIF_STATE_CONNECTED; info->connected = BLKIF_STATE_CONNECTED;
kick_pending_request_queues(info); for (i = 0; i < info->nr_rings; i++)
spin_unlock_irq(&info->io_lock); kick_pending_request_queues(&info->rinfo[i]);
add_disk(info->gd); add_disk(info->gd);
...@@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev, ...@@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev,
case XenbusStateInitWait: case XenbusStateInitWait:
if (dev->state != XenbusStateInitialising) if (dev->state != XenbusStateInitialising)
break; break;
if (talk_to_blkback(dev, info)) { if (talk_to_blkback(dev, info))
kfree(info);
dev_set_drvdata(&dev->dev, NULL);
break; break;
}
case XenbusStateInitialising: case XenbusStateInitialising:
case XenbusStateInitialised: case XenbusStateInitialised:
case XenbusStateReconfiguring: case XenbusStateReconfiguring:
...@@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev, ...@@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev,
break; break;
case XenbusStateConnected: case XenbusStateConnected:
if (dev->state != XenbusStateInitialised) {
if (talk_to_blkback(dev, info))
break;
}
blkfront_connect(info); blkfront_connect(info);
break; break;
...@@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = { ...@@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = {
static int __init xlblk_init(void) static int __init xlblk_init(void)
{ {
int ret; int ret;
int nr_cpus = num_online_cpus();
if (!xen_domain()) if (!xen_domain())
return -ENODEV; return -ENODEV;
...@@ -2288,7 +2651,13 @@ static int __init xlblk_init(void) ...@@ -2288,7 +2651,13 @@ static int __init xlblk_init(void)
if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
xen_blkif_max_ring_order = 0; xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
}
if (xen_blkif_max_queues > nr_cpus) {
pr_info("Invalid max_queues (%d), will use default max: %d.\n",
xen_blkif_max_queues, nr_cpus);
xen_blkif_max_queues = nr_cpus;
} }
if (!xen_has_pv_disk_devices()) if (!xen_has_pv_disk_devices())
......
...@@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c) ...@@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c)
do { do {
ret = btree_root(gc_root, c, &op, &writes, &stats); ret = btree_root(gc_root, c, &op, &writes, &stats);
closure_sync(&writes); closure_sync(&writes);
cond_resched();
if (ret && ret != -EAGAIN) if (ret && ret != -EAGAIN)
pr_warn("gc failed!"); pr_warn("gc failed!");
...@@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, ...@@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
rw_lock(true, b, b->level); rw_lock(true, b, b->level);
if (b->key.ptr[0] != btree_ptr || if (b->key.ptr[0] != btree_ptr ||
b->seq != seq + 1) b->seq != seq + 1) {
op->lock = b->level;
goto out; goto out;
}
} }
SET_KEY_PTRS(check_key, 1); SET_KEY_PTRS(check_key, 1);
......
...@@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, ...@@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
sysfs_create_link(&c->kobj, &d->kobj, d->name), sysfs_create_link(&c->kobj, &d->kobj, d->name),
"Couldn't create device <-> cache set symlinks"); "Couldn't create device <-> cache set symlinks");
clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
} }
static void bcache_device_detach(struct bcache_device *d) static void bcache_device_detach(struct bcache_device *d)
...@@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc) ...@@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc)
buf[SB_LABEL_SIZE] = '\0'; buf[SB_LABEL_SIZE] = '\0';
env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
if (atomic_xchg(&dc->running, 1)) if (atomic_xchg(&dc->running, 1)) {
kfree(env[1]);
kfree(env[2]);
return; return;
}
if (!d->c && if (!d->c &&
BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
...@@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ...@@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
else else
err = "device busy"; err = "device busy";
mutex_unlock(&bch_register_lock); mutex_unlock(&bch_register_lock);
if (attr == &ksysfs_register_quiet)
goto out;
} }
goto err; goto err;
} }
...@@ -1971,8 +1978,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ...@@ -1971,8 +1978,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
err_close: err_close:
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err: err:
if (attr != &ksysfs_register_quiet) pr_info("error opening %s: %s", path, err);
pr_info("error opening %s: %s", path, err);
ret = -EINVAL; ret = -EINVAL;
goto out; goto out;
} }
...@@ -2066,8 +2072,10 @@ static int __init bcache_init(void) ...@@ -2066,8 +2072,10 @@ static int __init bcache_init(void)
closure_debug_init(); closure_debug_init();
bcache_major = register_blkdev(0, "bcache"); bcache_major = register_blkdev(0, "bcache");
if (bcache_major < 0) if (bcache_major < 0) {
unregister_reboot_notifier(&reboot);
return bcache_major; return bcache_major;
}
if (!(bcache_wq = create_workqueue("bcache")) || if (!(bcache_wq = create_workqueue("bcache")) ||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
......
...@@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, ...@@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
static bool dirty_pred(struct keybuf *buf, struct bkey *k) static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{ {
struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
BUG_ON(KEY_INODE(k) != dc->disk.id);
return KEY_DIRTY(k); return KEY_DIRTY(k);
} }
...@@ -372,11 +376,24 @@ static void refill_full_stripes(struct cached_dev *dc) ...@@ -372,11 +376,24 @@ static void refill_full_stripes(struct cached_dev *dc)
} }
} }
/*
* Returns true if we scanned the entire disk
*/
static bool refill_dirty(struct cached_dev *dc) static bool refill_dirty(struct cached_dev *dc)
{ {
struct keybuf *buf = &dc->writeback_keys; struct keybuf *buf = &dc->writeback_keys;
struct bkey start = KEY(dc->disk.id, 0, 0);
struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
bool searched_from_start = false; struct bkey start_pos;
/*
* make sure keybuf pos is inside the range for this disk - at bringup
* we might not be attached yet so this disk's inode nr isn't
* initialized then
*/
if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
bkey_cmp(&buf->last_scanned, &end) > 0)
buf->last_scanned = start;
if (dc->partial_stripes_expensive) { if (dc->partial_stripes_expensive) {
refill_full_stripes(dc); refill_full_stripes(dc);
...@@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc) ...@@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc)
return false; return false;
} }
if (bkey_cmp(&buf->last_scanned, &end) >= 0) { start_pos = buf->last_scanned;
buf->last_scanned = KEY(dc->disk.id, 0, 0);
searched_from_start = true;
}
bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; if (bkey_cmp(&buf->last_scanned, &end) < 0)
return false;
/*
* If we get to the end start scanning again from the beginning, and
* only scan up to where we initially started scanning from:
*/
buf->last_scanned = start;
bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);
return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
} }
static int bch_writeback_thread(void *arg) static int bch_writeback_thread(void *arg)
......
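
The refill_dirty() change above can be illustrated with a small standalone sketch (plain C, not bcache code; collect(), the budget value and the numeric positions are stand-ins for bch_refill_keybuf() and bkeys): scan from the saved position to the end of the range, then, if the end was reached, restart from the beginning and stop where the first pass started, returning true only when the whole range was covered.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for bch_refill_keybuf(): advance *pos toward limit, stopping
 * early once a per-call "buffer" budget is used up. */
static void collect(unsigned long *pos, unsigned long limit)
{
	const unsigned long budget = 60;	/* pretend keybuf capacity */
	unsigned long left = limit - *pos;

	*pos += (left > budget) ? budget : left;
}

/* Mirrors the refill_dirty() flow; returns true if [start, end] was
 * fully scanned by this call. */
static bool refill(unsigned long *last_scanned,
		   unsigned long start, unsigned long end)
{
	unsigned long start_pos;

	/* clamp a stale position into this range, as the patch does */
	if (*last_scanned < start || *last_scanned > end)
		*last_scanned = start;

	start_pos = *last_scanned;
	collect(last_scanned, end);
	if (*last_scanned < end)
		return false;		/* buffer filled before the end */

	/* wrap around: second pass from start, only up to start_pos */
	*last_scanned = start;
	collect(last_scanned, start_pos);
	return *last_scanned >= start_pos;
}

int main(void)
{
	unsigned long pos = 70;	/* position left over from an earlier call */

	printf("%d\n", refill(&pos, 0, 100));	/* 70..100 then 0..60: 0 */
	printf("%d\n", refill(&pos, 0, 100));	/* 60..100 then 0..60: 1 */
	return 0;
}
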
...@@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, ...@@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
static inline void bch_writeback_queue(struct cached_dev *dc) static inline void bch_writeback_queue(struct cached_dev *dc)
{ {
wake_up_process(dc->writeback_thread); if (!IS_ERR_OR_NULL(dc->writeback_thread))
wake_up_process(dc->writeback_thread);
} }
static inline void bch_writeback_add(struct cached_dev *dc) static inline void bch_writeback_add(struct cached_dev *dc)
......
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
*/ */
#ifndef DRBD_H #ifndef DRBD_H
#define DRBD_H #define DRBD_H
#include <linux/connector.h>
#include <asm/types.h> #include <asm/types.h>
#ifdef __KERNEL__ #ifdef __KERNEL__
...@@ -52,7 +51,7 @@ ...@@ -52,7 +51,7 @@
#endif #endif
extern const char *drbd_buildtag(void); extern const char *drbd_buildtag(void);
#define REL_VERSION "8.4.5" #define REL_VERSION "8.4.6"
#define API_VERSION 1 #define API_VERSION 1
#define PRO_VERSION_MIN 86 #define PRO_VERSION_MIN 86
#define PRO_VERSION_MAX 101 #define PRO_VERSION_MAX 101
...@@ -339,6 +338,8 @@ enum drbd_state_rv { ...@@ -339,6 +338,8 @@ enum drbd_state_rv {
#define MDF_AL_CLEAN (1 << 7) #define MDF_AL_CLEAN (1 << 7)
#define MDF_AL_DISABLED (1 << 8) #define MDF_AL_DISABLED (1 << 8)
#define MAX_PEERS 32
enum drbd_uuid_index { enum drbd_uuid_index {
UI_CURRENT, UI_CURRENT,
UI_BITMAP, UI_BITMAP,
...@@ -349,14 +350,35 @@ enum drbd_uuid_index { ...@@ -349,14 +350,35 @@ enum drbd_uuid_index {
UI_EXTENDED_SIZE /* Everything. */ UI_EXTENDED_SIZE /* Everything. */
}; };
#define HISTORY_UUIDS MAX_PEERS
enum drbd_timeout_flag { enum drbd_timeout_flag {
UT_DEFAULT = 0, UT_DEFAULT = 0,
UT_DEGRADED = 1, UT_DEGRADED = 1,
UT_PEER_OUTDATED = 2, UT_PEER_OUTDATED = 2,
}; };
enum drbd_notification_type {
NOTIFY_EXISTS,
NOTIFY_CREATE,
NOTIFY_CHANGE,
NOTIFY_DESTROY,
NOTIFY_CALL,
NOTIFY_RESPONSE,
NOTIFY_CONTINUES = 0x8000,
NOTIFY_FLAGS = NOTIFY_CONTINUES,
};
#define UUID_JUST_CREATED ((__u64)4) #define UUID_JUST_CREATED ((__u64)4)
enum write_ordering_e {
WO_NONE,
WO_DRAIN_IO,
WO_BDEV_FLUSH,
WO_BIO_BARRIER
};
/* magic numbers used in meta data and network packets */ /* magic numbers used in meta data and network packets */
#define DRBD_MAGIC 0x83740267 #define DRBD_MAGIC 0x83740267
#define DRBD_MAGIC_BIG 0x835a #define DRBD_MAGIC_BIG 0x835a
......
...@@ -250,6 +250,76 @@ GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, ...@@ -250,6 +250,76 @@ GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach)
) )
GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info,
__u32_field(1, 0, res_role)
__flg_field(2, 0, res_susp)
__flg_field(3, 0, res_susp_nod)
__flg_field(4, 0, res_susp_fen)
/* __flg_field(5, 0, res_weak) */
)
GENL_struct(DRBD_NLA_DEVICE_INFO, 16, device_info,
__u32_field(1, 0, dev_disk_state)
)
GENL_struct(DRBD_NLA_CONNECTION_INFO, 17, connection_info,
__u32_field(1, 0, conn_connection_state)
__u32_field(2, 0, conn_role)
)
GENL_struct(DRBD_NLA_PEER_DEVICE_INFO, 18, peer_device_info,
__u32_field(1, 0, peer_repl_state)
__u32_field(2, 0, peer_disk_state)
__u32_field(3, 0, peer_resync_susp_user)
__u32_field(4, 0, peer_resync_susp_peer)
__u32_field(5, 0, peer_resync_susp_dependency)
)
GENL_struct(DRBD_NLA_RESOURCE_STATISTICS, 19, resource_statistics,
__u32_field(1, 0, res_stat_write_ordering)
)
GENL_struct(DRBD_NLA_DEVICE_STATISTICS, 20, device_statistics,
__u64_field(1, 0, dev_size) /* (sectors) */
__u64_field(2, 0, dev_read) /* (sectors) */
__u64_field(3, 0, dev_write) /* (sectors) */
__u64_field(4, 0, dev_al_writes) /* activity log writes (count) */
__u64_field(5, 0, dev_bm_writes) /* bitmap writes (count) */
__u32_field(6, 0, dev_upper_pending) /* application requests in progress */
__u32_field(7, 0, dev_lower_pending) /* backing device requests in progress */
__flg_field(8, 0, dev_upper_blocked)
__flg_field(9, 0, dev_lower_blocked)
__flg_field(10, 0, dev_al_suspended) /* activity log suspended */
__u64_field(11, 0, dev_exposed_data_uuid)
__u64_field(12, 0, dev_current_uuid)
__u32_field(13, 0, dev_disk_flags)
__bin_field(14, 0, history_uuids, HISTORY_UUIDS * sizeof(__u64))
)
GENL_struct(DRBD_NLA_CONNECTION_STATISTICS, 21, connection_statistics,
__flg_field(1, 0, conn_congested)
)
GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics,
__u64_field(1, 0, peer_dev_received) /* sectors */
__u64_field(2, 0, peer_dev_sent) /* sectors */
__u32_field(3, 0, peer_dev_pending) /* number of requests */
__u32_field(4, 0, peer_dev_unacked) /* number of requests */
__u64_field(5, 0, peer_dev_out_of_sync) /* sectors */
__u64_field(6, 0, peer_dev_resync_failed) /* sectors */
__u64_field(7, 0, peer_dev_bitmap_uuid)
__u32_field(9, 0, peer_dev_flags)
)
GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header,
__u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type)
)
GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info,
__str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32)
__u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status)
)
/* /*
* Notifications and commands (genlmsghdr->cmd) * Notifications and commands (genlmsghdr->cmd)
*/ */
...@@ -382,3 +452,82 @@ GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), ...@@ -382,3 +452,82 @@ GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_GET_RESOURCES, 30,
GENL_op_init(
.dumpit = drbd_adm_dump_resources,
),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_GET_DEVICES, 31,
GENL_op_init(
.dumpit = drbd_adm_dump_devices,
.done = drbd_adm_dump_devices_done,
),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_GET_CONNECTIONS, 32,
GENL_op_init(
.dumpit = drbd_adm_dump_connections,
.done = drbd_adm_dump_connections_done,
),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33,
GENL_op_init(
.dumpit = drbd_adm_dump_peer_devices,
.done = drbd_adm_dump_peer_devices_done,
),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
GENL_notification(
DRBD_RESOURCE_STATE, 34, events,
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_F_REQUIRED))
GENL_notification(
DRBD_DEVICE_STATE, 35, events,
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_F_REQUIRED))
GENL_notification(
DRBD_CONNECTION_STATE, 36, events,
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_F_REQUIRED))
GENL_notification(
DRBD_PEER_DEVICE_STATE, 37, events,
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_F_REQUIRED))
GENL_op(
DRBD_ADM_GET_INITIAL_STATE, 38,
GENL_op_init(
.dumpit = drbd_adm_get_initial_state,
),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY))
GENL_notification(
DRBD_HELPER, 40, events,
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_HELPER, DRBD_F_REQUIRED))
GENL_notification(
DRBD_INITIAL_STATE_DONE, 41, events,
GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED))
...@@ -135,6 +135,20 @@ static inline void *idr_find(struct idr *idr, int id) ...@@ -135,6 +135,20 @@ static inline void *idr_find(struct idr *idr, int id)
#define idr_for_each_entry(idp, entry, id) \ #define idr_for_each_entry(idp, entry, id) \
for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
/**
* idr_for_each_entry - continue iteration over an idr's elements of a given type
* @idp: idr handle
* @entry: the type * to use as cursor
* @id: id entry's key
*
* Continue to iterate over list of given type, continuing after
* the current position.
*/
#define idr_for_each_entry_continue(idp, entry, id) \
for ((entry) = idr_get_next((idp), &(id)); \
entry; \
++id, (entry) = idr_get_next((idp), &(id)))
/* /*
* IDA - IDR based id allocator, use when translation from id to * IDA - IDR based id allocator, use when translation from id to
* pointer isn't necessary. * pointer isn't necessary.
......
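
As a usage note for the new macro, here is a minimal sketch of resuming an idr walk at a caller-supplied id rather than at 0 (struct item and walk_from() are illustrative names, not part of this series; the macro itself is presumably added so the new DRBD dump callbacks can pick up iteration where a previous netlink dump packet left off):

/* Illustrative only: resume an idr walk at start_id instead of 0. */
#include <linux/idr.h>
#include <linux/printk.h>

struct item {
	int id;
};

static void walk_from(struct idr *idr, int start_id)
{
	struct item *entry;
	int id = start_id;

	/* Unlike idr_for_each_entry(), iteration begins at start_id (or the
	 * next allocated id above it) instead of at id 0. */
	idr_for_each_entry_continue(idr, entry, id)
		pr_info("item %d found at id %d\n", entry->id, id);
}
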
...@@ -264,7 +264,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); ...@@ -264,7 +264,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
extern void lc_committed(struct lru_cache *lc); extern void lc_committed(struct lru_cache *lc);
struct seq_file; struct seq_file;
extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
void (*detail) (struct seq_file *, struct lc_element *)); void (*detail) (struct seq_file *, struct lc_element *));
......
...@@ -27,6 +27,54 @@ ...@@ -27,6 +27,54 @@
typedef uint16_t blkif_vdev_t; typedef uint16_t blkif_vdev_t;
typedef uint64_t blkif_sector_t; typedef uint64_t blkif_sector_t;
/*
* Multiple hardware queues/rings:
* If supported, the backend will write the key "multi-queue-max-queues" to
* the directory for that vbd, and set its value to the maximum supported
* number of queues.
* Frontends that are aware of this feature and wish to use it can write the
* key "multi-queue-num-queues" with the number they wish to use, which must be
* greater than zero, and no more than the value reported by the backend in
* "multi-queue-max-queues".
*
* For frontends requesting just one queue, the usual event-channel and
* ring-ref keys are written as before, simplifying the backend processing
* to avoid distinguishing between a frontend that doesn't understand the
* multi-queue feature, and one that does, but requested only one queue.
*
* Frontends requesting two or more queues must not write the toplevel
* event-channel and ring-ref keys, instead writing those keys under sub-keys
 * having the name "queue-N" where N is the integer ID of the queue/ring to
 * which those keys belong. Queues are indexed from zero.
* For example, a frontend with two queues must write the following set of
* queue-related keys:
*
* /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
* /local/domain/1/device/vbd/0/queue-0 = ""
* /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>"
* /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
* /local/domain/1/device/vbd/0/queue-1 = ""
* /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>"
* /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
*
* It is also possible to use multiple queues/rings together with
* feature multi-page ring buffer.
 * For example, a frontend that requests two queues/rings, with each ring
 * buffer two pages in size, must write the following set of related keys:
*
* /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
* /local/domain/1/device/vbd/0/ring-page-order = "1"
* /local/domain/1/device/vbd/0/queue-0 = ""
* /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>"
* /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>"
* /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
* /local/domain/1/device/vbd/0/queue-1 = ""
* /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>"
* /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>"
* /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
*
*/
/* /*
* REQUEST CODES. * REQUEST CODES.
*/ */
......
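
To make the key layout above concrete, here is a minimal sketch of how a frontend might publish these nodes during ring setup (publish_queue_keys(), ring_ref[] and evtchn[] are illustrative names; the sketch assumes a single ring page per queue, and the real blkfront code is structured differently and also handles the multi-page ring-ref%u keys):

/* Sketch only; error handling trimmed to the essentials. Relies on
 * xenbus_printf() joining dev->nodename with the node argument, so the
 * per-queue keys land under ".../queue-N/". */
#include <linux/kernel.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>

static int publish_queue_keys(struct xenbus_transaction xbt,
			      struct xenbus_device *dev,
			      unsigned int nr_queues,
			      grant_ref_t *ring_ref, unsigned int *evtchn)
{
	char node[32];
	unsigned int i;
	int err;

	/* A single-queue frontend keeps writing the legacy toplevel keys. */
	if (nr_queues == 1) {
		err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u",
				    ring_ref[0]);
		if (err)
			return err;
		return xenbus_printf(xbt, dev->nodename, "event-channel",
				     "%u", evtchn[0]);
	}

	err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues",
			    "%u", nr_queues);
	if (err)
		return err;

	for (i = 0; i < nr_queues; i++) {
		snprintf(node, sizeof(node), "queue-%u/ring-ref", i);
		err = xenbus_printf(xbt, dev->nodename, node, "%u",
				    ring_ref[i]);
		if (err)
			return err;

		snprintf(node, sizeof(node), "queue-%u/event-channel", i);
		err = xenbus_printf(xbt, dev->nodename, node, "%u",
				    evtchn[i]);
		if (err)
			return err;
	}
	return 0;
}
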
...@@ -238,7 +238,7 @@ void lc_reset(struct lru_cache *lc) ...@@ -238,7 +238,7 @@ void lc_reset(struct lru_cache *lc)
* @seq: the seq_file to print into * @seq: the seq_file to print into
* @lc: the lru cache to print statistics of * @lc: the lru cache to print statistics of
*/ */
size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
{ {
/* NOTE: /* NOTE:
* total calls to lc_get are * total calls to lc_get are
...@@ -250,8 +250,6 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) ...@@ -250,8 +250,6 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n",
lc->name, lc->used, lc->nr_elements, lc->name, lc->used, lc->nr_elements,
lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); lc->hits, lc->misses, lc->starving, lc->locked, lc->changed);
return 0;
} }
static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
......