提交 9228ff90 编写于 作者: L Linus Torvalds

Merge branch 'for-3.8/drivers' of git://git.kernel.dk/linux-block

Pull block driver update from Jens Axboe:
 "Now that the core bits are in, here are the driver bits for 3.8.  The
  branch contains:

   - A huge pile of drbd bits that were dumped from the 3.7 merge
     window.  Following that, it was both made perfectly clear that
     there is going to be no more over-the-wall pulls and how the
     situation on individual pulls can be improved.

   - A few cleanups from Akinobu Mita for drbd and cciss.

   - Queue improvement for loop from Lukas.  This grew into adding a
     generic interface for waiting/checking an even with a specific
     lock, allowing this to be pulled out of md and now loop and drbd is
     also using it.

   - A few fixes for xen back/front block driver from Roger Pau Monne.

   - Partition improvements from Stephen Warren, allowing partiion UUID
     to be used as an identifier."

* 'for-3.8/drivers' of git://git.kernel.dk/linux-block: (609 commits)
  drbd: update Kconfig to match current dependencies
  drbd: Fix drbdsetup wait-connect, wait-sync etc... commands
  drbd: close race between drbd_set_role and drbd_connect
  drbd: respect no-md-barriers setting also when changed online via disk-options
  drbd: Remove obsolete check
  drbd: fixup after wait_even_lock_irq() addition to generic code
  loop: Limit the number of requests in the bio list
  wait: add wait_event_lock_irq() interface
  xen-blkfront: free allocated page
  xen-blkback: move free persistent grants code
  block: partition: msdos: provide UUIDs for partitions
  init: reduce PARTUUID min length to 1 from 36
  block: store partition_meta_info.uuid as a string
  cciss: use check_signature()
  cciss: cleanup bitops usage
  drbd: use copy_highpage
  drbd: if the replication link breaks during handshake, keep retrying
  drbd: check return of kmalloc in receive_uuids
  drbd: Broadcast sync progress no more often than once per second
  drbd: don't try to clear bits once the disk has failed
  ...
......@@ -743,7 +743,6 @@ void __init printk_all_partitions(void)
struct hd_struct *part;
char name_buf[BDEVNAME_SIZE];
char devt_buf[BDEVT_SIZE];
char uuid_buf[PARTITION_META_INFO_UUIDLTH * 2 + 5];
/*
* Don't show empty devices or things that have been
......@@ -762,16 +761,11 @@ void __init printk_all_partitions(void)
while ((part = disk_part_iter_next(&piter))) {
bool is_part0 = part == &disk->part0;
uuid_buf[0] = '\0';
if (part->info)
snprintf(uuid_buf, sizeof(uuid_buf), "%pU",
part->info->uuid);
printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
bdevt_str(part_devt(part), devt_buf),
(unsigned long long)part_nr_sects_read(part) >> 1
, disk_name(disk, part->partno, name_buf),
uuid_buf);
part->info ? part->info->uuid : "");
if (is_part0) {
if (disk->driverfs_dev != NULL &&
disk->driverfs_dev->driver != NULL)
......
......@@ -620,7 +620,6 @@ int efi_partition(struct parsed_partitions *state)
gpt_entry *ptes = NULL;
u32 i;
unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
u8 unparsed_guid[37];
if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
......@@ -649,11 +648,7 @@ int efi_partition(struct parsed_partitions *state)
state->parts[i + 1].flags = ADDPART_FLAG_RAID;
info = &state->parts[i + 1].info;
/* Instead of doing a manual swap to big endian, reuse the
* common ASCII hex format as the interim.
*/
efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
part_pack_uuid(unparsed_guid, info->uuid);
efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);
/* Naively convert UTF16-LE to 7 bits. */
label_max = min(sizeof(info->volname) - 1,
......
......@@ -94,6 +94,17 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
return ret;
}
static void set_info(struct parsed_partitions *state, int slot,
u32 disksig)
{
struct partition_meta_info *info = &state->parts[slot].info;
snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig,
slot);
info->volname[0] = 0;
state->parts[slot].has_info = true;
}
/*
* Create devices for each logical partition in an extended partition.
* The logical partitions form a linked list, with each entry being
......@@ -106,7 +117,8 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
*/
static void parse_extended(struct parsed_partitions *state,
sector_t first_sector, sector_t first_size)
sector_t first_sector, sector_t first_size,
u32 disksig)
{
struct partition *p;
Sector sect;
......@@ -166,6 +178,7 @@ static void parse_extended(struct parsed_partitions *state,
}
put_partition(state, state->next, next, size);
set_info(state, state->next, disksig);
if (SYS_IND(p) == LINUX_RAID_PARTITION)
state->parts[state->next].flags = ADDPART_FLAG_RAID;
loopct = 0;
......@@ -437,6 +450,7 @@ int msdos_partition(struct parsed_partitions *state)
struct partition *p;
struct fat_boot_sector *fb;
int slot;
u32 disksig;
data = read_part_sector(state, 0, &sect);
if (!data)
......@@ -491,6 +505,8 @@ int msdos_partition(struct parsed_partitions *state)
#endif
p = (struct partition *) (data + 0x1be);
disksig = le32_to_cpup((__le32 *)(data + 0x1b8));
/*
* Look for partitions in two passes:
* First find the primary and DOS-type extended partitions.
......@@ -515,11 +531,12 @@ int msdos_partition(struct parsed_partitions *state)
put_partition(state, slot, start, n);
strlcat(state->pp_buf, " <", PAGE_SIZE);
parse_extended(state, start, size);
parse_extended(state, start, size, disksig);
strlcat(state->pp_buf, " >", PAGE_SIZE);
continue;
}
put_partition(state, slot, start, size);
set_info(state, slot, disksig);
if (SYS_IND(p) == LINUX_RAID_PARTITION)
state->parts[slot].flags = ADDPART_FLAG_RAID;
if (SYS_IND(p) == DM6_PARTITION)
......
......@@ -41,8 +41,9 @@
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/io.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <linux/dma-mapping.h>
#include <linux/blkdev.h>
......@@ -978,8 +979,7 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h)
i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
if (i == h->nr_cmds)
return NULL;
} while (test_and_set_bit(i & (BITS_PER_LONG - 1),
h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0);
} while (test_and_set_bit(i, h->cmd_pool_bits) != 0);
c = h->cmd_pool + i;
memset(c, 0, sizeof(CommandList_struct));
cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct);
......@@ -1046,8 +1046,7 @@ static void cmd_free(ctlr_info_t *h, CommandList_struct *c)
int i;
i = c - h->cmd_pool;
clear_bit(i & (BITS_PER_LONG - 1),
h->cmd_pool_bits + (i / BITS_PER_LONG));
clear_bit(i, h->cmd_pool_bits);
h->nr_frees++;
}
......@@ -4268,10 +4267,7 @@ static void __devinit cciss_find_board_params(ctlr_info_t *h)
static inline bool CISS_signature_present(ctlr_info_t *h)
{
if ((readb(&h->cfgtable->Signature[0]) != 'C') ||
(readb(&h->cfgtable->Signature[1]) != 'I') ||
(readb(&h->cfgtable->Signature[2]) != 'S') ||
(readb(&h->cfgtable->Signature[3]) != 'S')) {
if (!check_signature(h->cfgtable->Signature, "CISS", 4)) {
dev_warn(&h->pdev->dev, "not a valid CISS config table\n");
return false;
}
......@@ -4812,8 +4808,7 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h)
{
h->cmd_pool_bits = kmalloc(
DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) *
h->cmd_pool_bits = kmalloc(BITS_TO_LONGS(h->nr_cmds) *
sizeof(unsigned long), GFP_KERNEL);
h->cmd_pool = pci_alloc_consistent(h->pdev,
h->nr_cmds * sizeof(CommandList_struct),
......@@ -5068,9 +5063,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
pci_set_drvdata(pdev, h);
/* command and error info recs zeroed out before
they are used */
memset(h->cmd_pool_bits, 0,
DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG)
* sizeof(unsigned long));
bitmap_zero(h->cmd_pool_bits, h->nr_cmds);
h->num_luns = 0;
h->highest_lun = -1;
......
......@@ -2,13 +2,14 @@
# DRBD device driver configuration
#
comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
depends on PROC_FS='n' || INET='n' || CONNECTOR='n'
comment "DRBD disabled because PROC_FS or INET not selected"
depends on PROC_FS='n' || INET='n'
config BLK_DEV_DRBD
tristate "DRBD Distributed Replicated Block Device support"
depends on PROC_FS && INET && CONNECTOR
depends on PROC_FS && INET
select LRU_CACHE
select LIBCRC32C
default n
help
......@@ -58,7 +59,8 @@ config DRBD_FAULT_INJECTION
32 data read
64 read ahead
128 kmalloc of bitmap
256 allocation of EE (epoch_entries)
256 allocation of peer_requests
512 insert data corruption on receiving side
fault_devs: bitmask of minor numbers
fault_rate: frequency in percent
......
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
drbd-y += drbd_interval.o drbd_state.o
drbd-y += drbd_nla.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
此差异已折叠。
......@@ -119,13 +119,9 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
if (!__ratelimit(&drbd_ratelimit_state))
return;
dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
current == mdev->receiver.task ? "receiver" :
current == mdev->asender.task ? "asender" :
current == mdev->worker.task ? "worker" : current->comm,
func, b->bm_why ?: "?",
b->bm_task == mdev->receiver.task ? "receiver" :
b->bm_task == mdev->asender.task ? "asender" :
b->bm_task == mdev->worker.task ? "worker" : "?");
drbd_task_to_thread_name(mdev->tconn, current),
func, b->bm_why ?: "?",
drbd_task_to_thread_name(mdev->tconn, b->bm_task));
}
void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
......@@ -142,13 +138,9 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
if (trylock_failed) {
dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
current == mdev->receiver.task ? "receiver" :
current == mdev->asender.task ? "asender" :
current == mdev->worker.task ? "worker" : current->comm,
why, b->bm_why ?: "?",
b->bm_task == mdev->receiver.task ? "receiver" :
b->bm_task == mdev->asender.task ? "asender" :
b->bm_task == mdev->worker.task ? "worker" : "?");
drbd_task_to_thread_name(mdev->tconn, current),
why, b->bm_why ?: "?",
drbd_task_to_thread_name(mdev->tconn, b->bm_task));
mutex_lock(&b->bm_change);
}
if (BM_LOCKED_MASK & b->bm_flags)
......@@ -196,6 +188,9 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
/* to mark for lazy writeout once syncer cleared all clearable bits,
* we if bits have been cleared since last IO. */
#define BM_PAGE_LAZY_WRITEOUT 28
/* pages marked with this "HINT" will be considered for writeout
* on activity log transactions */
#define BM_PAGE_HINT_WRITEOUT 27
/* store_page_idx uses non-atomic assignment. It is only used directly after
* allocating the page. All other bm_set_page_* and bm_clear_page_* need to
......@@ -227,8 +222,7 @@ static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
{
struct drbd_bitmap *b = mdev->bitmap;
void *addr = &page_private(b->bm_pages[page_nr]);
clear_bit(BM_PAGE_IO_LOCK, addr);
smp_mb__after_clear_bit();
clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
wake_up(&mdev->bitmap->bm_io_wait);
}
......@@ -246,6 +240,27 @@ static void bm_set_page_need_writeout(struct page *page)
set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}
/**
* drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
* @mdev: DRBD device.
* @page_nr: the bitmap page to mark with the "hint" flag
*
* From within an activity log transaction, we mark a few pages with these
* hints, then call drbd_bm_write_hinted(), which will only write out changed
* pages which are flagged with this mark.
*/
void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr)
{
struct page *page;
if (page_nr >= mdev->bitmap->bm_number_of_pages) {
dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n",
page_nr, (int)mdev->bitmap->bm_number_of_pages);
return;
}
page = mdev->bitmap->bm_pages[page_nr];
set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
}
static int bm_test_page_unchanged(struct page *page)
{
volatile const unsigned long *addr = &page_private(page);
......@@ -373,14 +388,16 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
return old_pages;
/* Trying kmalloc first, falling back to vmalloc.
* GFP_KERNEL is ok, as this is done when a lower level disk is
* "attached" to the drbd. Context is receiver thread or cqueue
* thread. As we have no disk yet, we are not in the IO path,
* not even the IO path of the peer. */
* GFP_NOIO, as this is called while drbd IO is "suspended",
* and during resize or attach on diskless Primary,
* we must not block on IO to ourselves.
* Context is receiver thread or dmsetup. */
bytes = sizeof(struct page *)*want;
new_pages = kzalloc(bytes, GFP_KERNEL);
new_pages = kzalloc(bytes, GFP_NOIO);
if (!new_pages) {
new_pages = vzalloc(bytes);
new_pages = __vmalloc(bytes,
GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL);
if (!new_pages)
return NULL;
vmalloced = 1;
......@@ -390,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
for (i = 0; i < have; i++)
new_pages[i] = old_pages[i];
for (; i < want; i++) {
page = alloc_page(GFP_HIGHUSER);
page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
if (!page) {
bm_free_pages(new_pages + have, i - have);
bm_vk_free(new_pages, vmalloced);
......@@ -439,7 +456,8 @@ int drbd_bm_init(struct drbd_conf *mdev)
sector_t drbd_bm_capacity(struct drbd_conf *mdev)
{
ERR_IF(!mdev->bitmap) return 0;
if (!expect(mdev->bitmap))
return 0;
return mdev->bitmap->bm_dev_capacity;
}
......@@ -447,7 +465,8 @@ sector_t drbd_bm_capacity(struct drbd_conf *mdev)
*/
void drbd_bm_cleanup(struct drbd_conf *mdev)
{
ERR_IF (!mdev->bitmap) return;
if (!expect(mdev->bitmap))
return;
bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
kfree(mdev->bitmap);
......@@ -610,7 +629,8 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
int err = 0, growing;
int opages_vmalloced;
ERR_IF(!b) return -ENOMEM;
if (!expect(b))
return -ENOMEM;
drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
......@@ -732,8 +752,10 @@ unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
unsigned long s;
unsigned long flags;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
s = b->bm_set;
......@@ -756,8 +778,10 @@ unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
size_t drbd_bm_words(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
return b->bm_words;
}
......@@ -765,7 +789,8 @@ size_t drbd_bm_words(struct drbd_conf *mdev)
unsigned long drbd_bm_bits(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return 0;
if (!expect(b))
return 0;
return b->bm_bits;
}
......@@ -786,8 +811,10 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
end = offset + number;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
if (number == 0)
return;
WARN_ON(offset >= b->bm_words);
......@@ -831,8 +858,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
end = offset + number;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
if ((offset >= b->bm_words) ||
......@@ -860,8 +889,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
void drbd_bm_set_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0xff, b->bm_words);
......@@ -874,8 +905,10 @@ void drbd_bm_set_all(struct drbd_conf *mdev)
void drbd_bm_clear_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0, b->bm_words);
......@@ -889,7 +922,8 @@ struct bm_aio_ctx {
unsigned int done;
unsigned flags;
#define BM_AIO_COPY_PAGES 1
#define BM_WRITE_ALL_PAGES 2
#define BM_AIO_WRITE_HINTED 2
#define BM_WRITE_ALL_PAGES 4
int error;
struct kref kref;
};
......@@ -977,17 +1011,11 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
bm_set_page_unchanged(b->bm_pages[page_nr]);
if (ctx->flags & BM_AIO_COPY_PAGES) {
void *src, *dest;
page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
dest = kmap_atomic(page);
src = kmap_atomic(b->bm_pages[page_nr]);
memcpy(dest, src, PAGE_SIZE);
kunmap_atomic(src);
kunmap_atomic(dest);
copy_highpage(page, b->bm_pages[page_nr]);
bm_store_page_idx(page, page_nr);
} else
page = b->bm_pages[page_nr];
bio->bi_bdev = mdev->ldev->md_bdev;
bio->bi_sector = on_disk_sector;
/* bio_add_page of a single page to an empty bio will always succeed,
......@@ -1060,6 +1088,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (rw & WRITE) {
if ((flags & BM_AIO_WRITE_HINTED) &&
!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
&page_private(b->bm_pages[i])))
continue;
if (!(flags & BM_WRITE_ALL_PAGES) &&
bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
......@@ -1088,13 +1121,15 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
* "in_flight reached zero, all done" event.
*/
if (!atomic_dec_and_test(&ctx->in_flight))
wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done);
else
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
rw == WRITE ? "WRITE" : "READ",
count, jiffies - now);
/* summary for global bitmap IO */
if (flags == 0)
dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
rw == WRITE ? "WRITE" : "READ",
count, jiffies - now);
if (ctx->error) {
dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
......@@ -1103,7 +1138,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
}
if (atomic_read(&ctx->in_flight))
err = -EIO; /* Disk failed during IO... */
err = -EIO; /* Disk timeout/force-detach during IO... */
now = jiffies;
if (rw == WRITE) {
......@@ -1115,8 +1150,9 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
}
now = b->bm_set;
dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
if (flags == 0)
dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
return err;
......@@ -1179,9 +1215,17 @@ int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
}
/**
* drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
* @mdev: DRBD device.
*/
int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local)
{
return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
}
/**
* drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
* drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
* @mdev: DRBD device.
* @idx: bitmap page index
*
......@@ -1222,11 +1266,11 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc
}
bm_page_io_async(ctx, idx, WRITE_SYNC);
wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done);
if (ctx->error)
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
/* that should force detach, so the in memory bitmap will be
/* that causes us to detach, so the in memory bitmap will be
* gone in a moment as well. */
mdev->bm_writ_cnt++;
......@@ -1289,8 +1333,10 @@ static unsigned long bm_find_next(struct drbd_conf *mdev,
struct drbd_bitmap *b = mdev->bitmap;
unsigned long i = DRBD_END_OF_BITMAP;
ERR_IF(!b) return i;
ERR_IF(!b->bm_pages) return i;
if (!expect(b))
return i;
if (!expect(b->bm_pages))
return i;
spin_lock_irq(&b->bm_lock);
if (BM_DONT_TEST & b->bm_flags)
......@@ -1391,8 +1437,10 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
struct drbd_bitmap *b = mdev->bitmap;
int c = 0;
ERR_IF(!b) return 1;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 1;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
......@@ -1423,13 +1471,21 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
{
int i;
int bits;
int changed = 0;
unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
b->bm_set += BITS_PER_LONG - bits;
changed += BITS_PER_LONG - bits;
}
kunmap_atomic(paddr);
if (changed) {
/* We only need lazy writeout, the information is still in the
* remote bitmap as well, and is reconstructed during the next
* bitmap exchange, if lost locally due to a crash. */
bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
b->bm_set += changed;
}
}
/* Same thing as drbd_bm_set_bits,
......@@ -1524,8 +1580,10 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
unsigned long *p_addr;
int i;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
......@@ -1559,8 +1617,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
* robust in case we screwed up elsewhere, in that case pretend there
* was one dirty bit in the requested area, so we won't try to do a
* local read there (no bitmap probably implies no disk) */
ERR_IF(!b) return 1;
ERR_IF(!b->bm_pages) return 1;
if (!expect(b))
return 1;
if (!expect(b->bm_pages))
return 1;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
......@@ -1573,11 +1633,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
bm_unmap(p_addr);
p_addr = bm_map_pidx(b, idx);
}
ERR_IF (bitnr >= b->bm_bits) {
dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
} else {
if (expect(bitnr < b->bm_bits))
c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
}
else
dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
}
if (p_addr)
bm_unmap(p_addr);
......@@ -1607,8 +1666,10 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
unsigned long flags;
unsigned long *p_addr, *bm;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
......@@ -1630,47 +1691,3 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
spin_unlock_irqrestore(&b->bm_lock, flags);
return count;
}
/* Set all bits covered by the AL-extent al_enr.
* Returns number of bits changed. */
unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr, *bm;
unsigned long weight;
unsigned long s, e;
int count, i, do_now;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
spin_lock_irq(&b->bm_lock);
if (BM_DONT_SET & b->bm_flags)
bm_print_lock_info(mdev);
weight = b->bm_set;
s = al_enr * BM_WORDS_PER_AL_EXT;
e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
/* assert that s and e are on the same page */
D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
== s >> (PAGE_SHIFT - LN2_BPL + 3));
count = 0;
if (s < b->bm_words) {
i = do_now = e-s;
p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
bm = p_addr + MLPP(s);
while (i--) {
count += hweight_long(*bm);
*bm = -1UL;
bm++;
}
bm_unmap(p_addr);
b->bm_set += do_now*BITS_PER_LONG - count;
if (e == b->bm_words)
b->bm_set -= bm_clear_surplus(b);
} else {
dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
}
weight = b->bm_set - weight;
spin_unlock_irq(&b->bm_lock);
return weight;
}
此差异已折叠。
#include <asm/bug.h>
#include <linux/rbtree_augmented.h>
#include "drbd_interval.h"
/**
* interval_end - return end of @node
*/
static inline
sector_t interval_end(struct rb_node *node)
{
struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
return this->end;
}
/**
* compute_subtree_last - compute end of @node
*
* The end of an interval is the highest (start + (size >> 9)) value of this
* node and of its children. Called for @node and its parents whenever the end
* may have changed.
*/
static inline sector_t
compute_subtree_last(struct drbd_interval *node)
{
sector_t max = node->sector + (node->size >> 9);
if (node->rb.rb_left) {
sector_t left = interval_end(node->rb.rb_left);
if (left > max)
max = left;
}
if (node->rb.rb_right) {
sector_t right = interval_end(node->rb.rb_right);
if (right > max)
max = right;
}
return max;
}
static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
{
while (rb != stop) {
struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb);
sector_t subtree_last = compute_subtree_last(node);
if (node->end == subtree_last)
break;
node->end = subtree_last;
rb = rb_parent(&node->rb);
}
}
static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
{
struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
new->end = old->end;
}
static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
{
struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
new->end = old->end;
old->end = compute_subtree_last(old);
}
static const struct rb_augment_callbacks augment_callbacks = {
augment_propagate,
augment_copy,
augment_rotate,
};
/**
* drbd_insert_interval - insert a new interval into a tree
*/
bool
drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
{
struct rb_node **new = &root->rb_node, *parent = NULL;
BUG_ON(!IS_ALIGNED(this->size, 512));
while (*new) {
struct drbd_interval *here =
rb_entry(*new, struct drbd_interval, rb);
parent = *new;
if (this->sector < here->sector)
new = &(*new)->rb_left;
else if (this->sector > here->sector)
new = &(*new)->rb_right;
else if (this < here)
new = &(*new)->rb_left;
else if (this > here)
new = &(*new)->rb_right;
else
return false;
}
rb_link_node(&this->rb, parent, new);
rb_insert_augmented(&this->rb, root, &augment_callbacks);
return true;
}
/**
* drbd_contains_interval - check if a tree contains a given interval
* @sector: start sector of @interval
* @interval: may not be a valid pointer
*
* Returns if the tree contains the node @interval with start sector @start.
* Does not dereference @interval until @interval is known to be a valid object
* in @tree. Returns %false if @interval is in the tree but with a different
* sector number.
*/
bool
drbd_contains_interval(struct rb_root *root, sector_t sector,
struct drbd_interval *interval)
{
struct rb_node *node = root->rb_node;
while (node) {
struct drbd_interval *here =
rb_entry(node, struct drbd_interval, rb);
if (sector < here->sector)
node = node->rb_left;
else if (sector > here->sector)
node = node->rb_right;
else if (interval < here)
node = node->rb_left;
else if (interval > here)
node = node->rb_right;
else
return true;
}
return false;
}
/**
* drbd_remove_interval - remove an interval from a tree
*/
void
drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
{
rb_erase_augmented(&this->rb, root, &augment_callbacks);
}
/**
* drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
* @sector: start sector
* @size: size, aligned to 512 bytes
*
* Returns an interval overlapping with [sector, sector + size), or NULL if
* there is none. When there is more than one overlapping interval in the
* tree, the interval with the lowest start sector is returned, and all other
* overlapping intervals will be on the right side of the tree, reachable with
* rb_next().
*/
struct drbd_interval *
drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
{
struct rb_node *node = root->rb_node;
struct drbd_interval *overlap = NULL;
sector_t end = sector + (size >> 9);
BUG_ON(!IS_ALIGNED(size, 512));
while (node) {
struct drbd_interval *here =
rb_entry(node, struct drbd_interval, rb);
if (node->rb_left &&
sector < interval_end(node->rb_left)) {
/* Overlap if any must be on left side */
node = node->rb_left;
} else if (here->sector < end &&
sector < here->sector + (here->size >> 9)) {
overlap = here;
break;
} else if (sector >= here->sector) {
/* Overlap if any must be on right side */
node = node->rb_right;
} else
break;
}
return overlap;
}
struct drbd_interval *
drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
{
sector_t end = sector + (size >> 9);
struct rb_node *node;
for (;;) {
node = rb_next(&i->rb);
if (!node)
return NULL;
i = rb_entry(node, struct drbd_interval, rb);
if (i->sector >= end)
return NULL;
if (sector < i->sector + (i->size >> 9))
return i;
}
}
#ifndef __DRBD_INTERVAL_H
#define __DRBD_INTERVAL_H
#include <linux/types.h>
#include <linux/rbtree.h>
struct drbd_interval {
struct rb_node rb;
sector_t sector; /* start sector of the interval */
unsigned int size; /* size in bytes */
sector_t end; /* highest interval end in subtree */
int local:1 /* local or remote request? */;
int waiting:1;
};
static inline void drbd_clear_interval(struct drbd_interval *i)
{
RB_CLEAR_NODE(&i->rb);
}
static inline bool drbd_interval_empty(struct drbd_interval *i)
{
return RB_EMPTY_NODE(&i->rb);
}
extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
extern bool drbd_contains_interval(struct rb_root *, sector_t,
struct drbd_interval *);
extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
unsigned int);
extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
unsigned int);
#define drbd_for_each_overlap(i, root, sector, size) \
for (i = drbd_find_overlap(root, sector, size); \
i; \
i = drbd_next_overlap(i, sector, size))
#endif /* __DRBD_INTERVAL_H */
此差异已折叠。
此差异已折叠。
#include "drbd_wrappers.h"
#include <linux/kernel.h>
#include <net/netlink.h>
#include <linux/drbd_genl_api.h>
#include "drbd_nla.h"
static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
{
struct nlattr *head = nla_data(nla);
int len = nla_len(nla);
int rem;
/*
* validate_nla (called from nla_parse_nested) ignores attributes
* beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
* In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
* flag set also, check and remove that flag before calling
* nla_parse_nested.
*/
nla_for_each_attr(nla, head, len, rem) {
if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
if (nla_type(nla) > maxtype)
return -EOPNOTSUPP;
}
}
return 0;
}
int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy)
{
int err;
err = drbd_nla_check_mandatory(maxtype, nla);
if (!err)
err = nla_parse_nested(tb, maxtype, nla, policy);
return err;
}
struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
{
int err;
/*
* If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
* we don't know about that attribute, reject all the nested
* attributes.
*/
err = drbd_nla_check_mandatory(maxtype, nla);
if (err)
return ERR_PTR(err);
return nla_find_nested(nla, attrtype);
}
#ifndef __DRBD_NLA_H
#define __DRBD_NLA_H
extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy);
extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
#endif /* __DRBD_NLA_H */
......@@ -167,18 +167,24 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
* we convert to sectors in the display below. */
unsigned long bm_bits = drbd_bm_bits(mdev);
unsigned long bit_pos;
unsigned long long stop_sector = 0;
if (mdev->state.conn == C_VERIFY_S ||
mdev->state.conn == C_VERIFY_T)
mdev->state.conn == C_VERIFY_T) {
bit_pos = bm_bits - mdev->ov_left;
else
if (verify_can_do_stop_sector(mdev))
stop_sector = mdev->ov_stop_sector;
} else
bit_pos = mdev->bm_resync_fo;
/* Total sectors may be slightly off for oddly
* sized devices. So what. */
seq_printf(seq,
"\t%3d%% sector pos: %llu/%llu\n",
"\t%3d%% sector pos: %llu/%llu",
(int)(bit_pos / (bm_bits/100+1)),
(unsigned long long)bit_pos * BM_SECT_PER_BIT,
(unsigned long long)bm_bits * BM_SECT_PER_BIT);
if (stop_sector != 0 && stop_sector != ULLONG_MAX)
seq_printf(seq, " stop sector: %llu", stop_sector);
seq_printf(seq, "\n");
}
}
......@@ -194,9 +200,11 @@ static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
static int drbd_seq_show(struct seq_file *seq, void *v)
{
int i, hole = 0;
int i, prev_i = -1;
const char *sn;
struct drbd_conf *mdev;
struct net_conf *nc;
char wp;
static char write_ordering_chars[] = {
[WO_none] = 'n',
......@@ -227,16 +235,11 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
oos .. known out-of-sync kB
*/
for (i = 0; i < minor_count; i++) {
mdev = minor_to_mdev(i);
if (!mdev) {
hole = 1;
continue;
}
if (hole) {
hole = 0;
rcu_read_lock();
idr_for_each_entry(&minors, mdev, i) {
if (prev_i != i - 1)
seq_printf(seq, "\n");
}
prev_i = i;
sn = drbd_conn_str(mdev->state.conn);
......@@ -248,6 +251,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
/* reset mdev->congestion_reason */
bdi_rw_congested(&mdev->rq_queue->backing_dev_info);
nc = rcu_dereference(mdev->tconn->net_conf);
wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
......@@ -257,9 +262,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
drbd_role_str(mdev->state.peer),
drbd_disk_str(mdev->state.disk),
drbd_disk_str(mdev->state.pdsk),
(mdev->net_conf == NULL ? ' ' :
(mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
is_susp(mdev->state) ? 's' : 'r',
wp,
drbd_suspended(mdev) ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
......@@ -276,8 +280,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
atomic_read(&mdev->rs_pending_cnt),
atomic_read(&mdev->unacked_cnt),
atomic_read(&mdev->ap_bio_cnt),
mdev->epochs,
write_ordering_chars[mdev->write_ordering]
mdev->tconn->epochs,
write_ordering_chars[mdev->tconn->write_ordering]
);
seq_printf(seq, " oos:%llu\n",
Bit2KB((unsigned long long)
......@@ -302,6 +306,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
}
}
}
rcu_read_unlock();
return 0;
}
......
此差异已折叠。
此差异已折叠。
......@@ -77,40 +77,41 @@
*/
enum drbd_req_event {
created,
to_be_send,
to_be_submitted,
CREATED,
TO_BE_SENT,
TO_BE_SUBMITTED,
/* XXX yes, now I am inconsistent...
* these are not "events" but "actions"
* oh, well... */
queue_for_net_write,
queue_for_net_read,
queue_for_send_oos,
send_canceled,
send_failed,
handed_over_to_network,
oos_handed_to_network,
connection_lost_while_pending,
read_retry_remote_canceled,
recv_acked_by_peer,
write_acked_by_peer,
write_acked_by_peer_and_sis, /* and set_in_sync */
conflict_discarded_by_peer,
neg_acked,
barrier_acked, /* in protocol A and B */
data_received, /* (remote read) */
read_completed_with_error,
read_ahead_completed_with_error,
write_completed_with_error,
abort_disk_io,
completed_ok,
resend,
fail_frozen_disk_io,
restart_frozen_disk_io,
nothing, /* for tracing only */
QUEUE_FOR_NET_WRITE,
QUEUE_FOR_NET_READ,
QUEUE_FOR_SEND_OOS,
SEND_CANCELED,
SEND_FAILED,
HANDED_OVER_TO_NETWORK,
OOS_HANDED_TO_NETWORK,
CONNECTION_LOST_WHILE_PENDING,
READ_RETRY_REMOTE_CANCELED,
RECV_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
CONFLICT_RESOLVED,
POSTPONE_WRITE,
NEG_ACKED,
BARRIER_ACKED, /* in protocol A and B */
DATA_RECEIVED, /* (remote read) */
READ_COMPLETED_WITH_ERROR,
READ_AHEAD_COMPLETED_WITH_ERROR,
WRITE_COMPLETED_WITH_ERROR,
ABORT_DISK_IO,
COMPLETED_OK,
RESEND,
FAIL_FROZEN_DISK_IO,
RESTART_FROZEN_DISK_IO,
NOTHING,
};
/* encoding of request states for now. we don't actually need that many bits.
......@@ -142,8 +143,8 @@ enum drbd_req_state_bits {
* recv_ack (B) or implicit "ack" (A),
* still waiting for the barrier ack.
* master_bio may already be completed and invalidated.
* 11100: write_acked (C),
* data_received (for remote read, any protocol)
* 11100: write acked (C),
* data received (for remote read, any protocol)
* or finally the barrier ack has arrived (B,A)...
* request can be freed
* 01100: neg-acked (write, protocol C)
......@@ -198,6 +199,22 @@ enum drbd_req_state_bits {
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
/* The peer has sent a retry ACK */
__RQ_POSTPONED,
/* would have been completed,
* but was not, because of drbd_suspended() */
__RQ_COMPLETION_SUSP,
/* We expect a receive ACK (wire proto B) */
__RQ_EXP_RECEIVE_ACK,
/* We expect a write ACK (wite proto C) */
__RQ_EXP_WRITE_ACK,
/* waiting for a barrier ack, did an extra kref_get */
__RQ_EXP_BARR_ACK,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
......@@ -219,56 +236,16 @@ enum drbd_req_state_bits {
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
/* For waking up the frozen transfer log mod_req() has to return if the request
should be counted in the epoch object*/
#define MR_WRITE_SHIFT 0
#define MR_WRITE (1 << MR_WRITE_SHIFT)
#define MR_READ_SHIFT 1
#define MR_READ (1 << MR_READ_SHIFT)
/* epoch entries */
static inline
struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
BUG_ON(mdev->ee_hash_s == 0);
return mdev->ee_hash +
((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
}
/* transfer log (drbd_request objects) */
static inline
struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
BUG_ON(mdev->tl_hash_s == 0);
return mdev->tl_hash +
((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
}
/* application reads (drbd_request objects) */
static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
return mdev->app_reads_hash
+ ((unsigned int)(sector) % APP_R_HSIZE);
}
/* when we receive the answer for a read request,
* verify that we actually know about it */
static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
u64 id, sector_t sector)
{
struct hlist_head *slot = ar_hash_slot(mdev, sector);
struct hlist_node *n;
struct drbd_request *req;
hlist_for_each_entry(req, n, slot, collision) {
if ((unsigned long)req == (unsigned long)id) {
D_ASSERT(req->sector == sector);
return req;
}
}
return NULL;
}
#define MR_WRITE 1
#define MR_READ 2
static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
{
......@@ -278,41 +255,10 @@ static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bi
req->private_bio = bio;
bio->bi_private = req;
bio->bi_end_io = drbd_endio_pri;
bio->bi_end_io = drbd_request_endio;
bio->bi_next = NULL;
}
static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
struct bio *bio_src)
{
struct drbd_request *req =
mempool_alloc(drbd_request_mempool, GFP_NOIO);
if (likely(req)) {
drbd_req_make_private_bio(req, bio_src);
req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
req->mdev = mdev;
req->master_bio = bio_src;
req->epoch = 0;
req->sector = bio_src->bi_sector;
req->size = bio_src->bi_size;
INIT_HLIST_NODE(&req->collision);
INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list);
}
return req;
}
static inline void drbd_req_free(struct drbd_request *req)
{
mempool_free(req, drbd_request_mempool);
}
static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
}
/* Short lived temporary struct on the stack.
* We could squirrel the error to be returned into
* bio->bi_size, or similar. But that would be too ugly. */
......@@ -321,6 +267,7 @@ struct bio_and_error {
int error;
};
extern void drbd_req_destroy(struct kref *kref);
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
......@@ -328,13 +275,17 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m);
extern void request_timer_fn(unsigned long data);
extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
extern void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what);
extern void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what);
/* this is in drbd_main.c */
extern void drbd_restart_request(struct drbd_request *req);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
int rv;
......@@ -354,13 +305,13 @@ static inline int req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
unsigned long flags;
struct drbd_conf *mdev = req->mdev;
struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
int rv;
spin_lock_irqsave(&mdev->req_lock, flags);
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
rv = __req_mod(req, what, &m);
spin_unlock_irqrestore(&mdev->req_lock, flags);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
if (m.bio)
complete_master_bio(mdev, &m);
......@@ -368,7 +319,7 @@ static inline int req_mod(struct drbd_request *req,
return rv;
}
static inline bool drbd_should_do_remote(union drbd_state s)
static inline bool drbd_should_do_remote(union drbd_dev_state s)
{
return s.pdsk == D_UP_TO_DATE ||
(s.pdsk >= D_INCONSISTENT &&
......@@ -378,7 +329,7 @@ static inline bool drbd_should_do_remote(union drbd_state s)
That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
states. */
}
static inline bool drbd_should_send_oos(union drbd_state s)
static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
{
return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
......
此差异已折叠。
#ifndef DRBD_STATE_H
#define DRBD_STATE_H
struct drbd_conf;
struct drbd_tconn;
/**
* DOC: DRBD State macros
*
* These macros are used to express state changes in easily readable form.
*
* The NS macros expand to a mask and a value, that can be bit ored onto the
* current state as soon as the spinlock (req_lock) was taken.
*
* The _NS macros are used for state functions that get called with the
* spinlock. These macros expand directly to the new state value.
*
* Besides the basic forms NS() and _NS() additional _?NS[23] are defined
* to express state changes that affect more than one aspect of the state.
*
* E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
* Means that the network connection was established and that the peer
* is in secondary role.
*/
#define role_MASK R_MASK
#define peer_MASK R_MASK
#define disk_MASK D_MASK
#define pdsk_MASK D_MASK
#define conn_MASK C_MASK
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
#define susp_nod_MASK 1
#define susp_fen_MASK 1
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T = (S); val; })
#define NS2(T1, S1, T2, S2) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val.T3 = (S3); val; })
#define _NS(D, T, S) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
__ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
enum chg_state_flags {
CS_HARD = 1 << 0,
CS_VERBOSE = 1 << 1,
CS_WAIT_COMPLETE = 1 << 2,
CS_SERIALIZE = 1 << 3,
CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */
CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */
CS_DC_PEER = 1 << 6,
CS_DC_CONN = 1 << 7,
CS_DC_DISK = 1 << 8,
CS_DC_PDSK = 1 << 9,
CS_DC_SUSP = 1 << 10,
CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
CS_IGN_OUTD_FAIL = 1 << 11,
};
/* drbd_dev_state and drbd_state are different types. This is to stress the
small difference. There is no suspended flag (.susp), and no suspended
while fence handler runs flas (susp_fen). */
union drbd_dev_state {
struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned conn:5 ; /* 17/32 cstates */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned _unused:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned peer_isp:1 ;
unsigned user_isp:1 ;
unsigned _pad:11; /* 0 unused */
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned _pad:11;
unsigned user_isp:1 ;
unsigned peer_isp:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned _unused:1 ;
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned conn:5 ; /* 17/32 cstates */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
#else
# error "this endianess is not supported"
#endif
};
unsigned int i;
};
extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
enum chg_state_flags f,
union drbd_state mask,
union drbd_state val);
extern void drbd_force_state(struct drbd_conf *, union drbd_state,
union drbd_state);
extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
union drbd_state,
union drbd_state,
enum chg_state_flags);
extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
enum chg_state_flags,
struct completion *done);
extern void print_st_err(struct drbd_conf *, union drbd_state,
union drbd_state, int);
enum drbd_state_rv
_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags);
enum drbd_state_rv
conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags);
extern void drbd_resume_al(struct drbd_conf *mdev);
extern bool conn_all_vols_unconf(struct drbd_tconn *tconn);
/**
* drbd_request_state() - Reqest a state change
* @mdev: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
*
* This is the most graceful way of requesting a state change. It is verbose
* quite verbose in case the state change is not possible, and all those
* state changes are globally serialized.
*/
static inline int drbd_request_state(struct drbd_conf *mdev,
union drbd_state mask,
union drbd_state val)
{
return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
}
enum drbd_role conn_highest_role(struct drbd_tconn *tconn);
enum drbd_role conn_highest_peer(struct drbd_tconn *tconn);
enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn);
enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn);
enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn);
enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn);
#endif
......@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = {
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
[-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
};
const char *drbd_conn_str(enum drbd_conns s)
......
此差异已折叠。
......@@ -3,6 +3,7 @@
#include <linux/ctype.h>
#include <linux/mm.h>
#include "drbd_int.h"
/* see get_sb_bdev and bd_claim */
extern char *drbd_sec_holder;
......@@ -20,8 +21,8 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
/* bi_end_io handlers */
extern void drbd_md_io_complete(struct bio *bio, int error);
extern void drbd_endio_sec(struct bio *bio, int error);
extern void drbd_endio_pri(struct bio *bio, int error);
extern void drbd_peer_request_endio(struct bio *bio, int error);
extern void drbd_request_endio(struct bio *bio, int error);
/*
* used to submit our private bio
......@@ -45,12 +46,6 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev,
generic_make_request(bio);
}
static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
{
return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
== CRYPTO_ALG_TYPE_HASH;
}
#ifndef __CHECKER__
# undef __cond_lock
# define __cond_lock(x,c) (c)
......
......@@ -463,6 +463,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
*/
static void loop_add_bio(struct loop_device *lo, struct bio *bio)
{
lo->lo_bio_count++;
bio_list_add(&lo->lo_bio_list, bio);
}
......@@ -471,6 +472,7 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio)
*/
static struct bio *loop_get_bio(struct loop_device *lo)
{
lo->lo_bio_count--;
return bio_list_pop(&lo->lo_bio_list);
}
......@@ -489,6 +491,10 @@ static void loop_make_request(struct request_queue *q, struct bio *old_bio)
goto out;
if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
goto out;
if (lo->lo_bio_count >= q->nr_congestion_on)
wait_event_lock_irq(lo->lo_req_wait,
lo->lo_bio_count < q->nr_congestion_off,
lo->lo_lock);
loop_add_bio(lo, old_bio);
wake_up(&lo->lo_event);
spin_unlock_irq(&lo->lo_lock);
......@@ -546,6 +552,8 @@ static int loop_thread(void *data)
continue;
spin_lock_irq(&lo->lo_lock);
bio = loop_get_bio(lo);
if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
wake_up(&lo->lo_req_wait);
spin_unlock_irq(&lo->lo_lock);
BUG_ON(!bio);
......@@ -873,6 +881,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
lo->transfer = transfer_none;
lo->ioctl = NULL;
lo->lo_sizelimit = 0;
lo->lo_bio_count = 0;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
......@@ -1673,6 +1682,7 @@ static int loop_add(struct loop_device **l, int i)
lo->lo_number = i;
lo->lo_thread = NULL;
init_waitqueue_head(&lo->lo_event);
init_waitqueue_head(&lo->lo_req_wait);
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i << part_shift;
......
此差异已折叠。
......@@ -34,6 +34,7 @@
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/io.h>
#include <linux/rbtree.h>
#include <asm/setup.h>
#include <asm/pgalloc.h>
#include <asm/hypervisor.h>
......@@ -160,10 +161,21 @@ struct xen_vbd {
sector_t size;
unsigned int flush_support:1;
unsigned int discard_secure:1;
unsigned int feature_gnt_persistent:1;
unsigned int overflow_max_grants:1;
};
struct backend_info;
struct persistent_gnt {
struct page *page;
grant_ref_t gnt;
grant_handle_t handle;
uint64_t dev_bus_addr;
struct rb_node node;
};
struct xen_blkif {
/* Unique identifier for this interface. */
domid_t domid;
......@@ -190,6 +202,10 @@ struct xen_blkif {
struct task_struct *xenblkd;
unsigned int waiting_reqs;
/* tree to store persistent grants */
struct rb_root persistent_gnts;
unsigned int persistent_gnt_c;
/* statistics */
unsigned long st_print;
int st_rd_req;
......
......@@ -117,6 +117,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
atomic_set(&blkif->drain, 0);
blkif->st_print = jiffies;
init_waitqueue_head(&blkif->waiting_to_free);
blkif->persistent_gnts.rb_node = NULL;
return blkif;
}
......@@ -672,6 +673,13 @@ static void connect(struct backend_info *be)
xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", 1);
if (err) {
xenbus_dev_fatal(dev, err, "writing %s/feature-persistent",
dev->nodename);
goto abort;
}
err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
(unsigned long long)vbd_sz(&be->blkif->vbd));
if (err) {
......@@ -720,6 +728,7 @@ static int connect_ring(struct backend_info *be)
struct xenbus_device *dev = be->dev;
unsigned long ring_ref;
unsigned int evtchn;
unsigned int pers_grants;
char protocol[64] = "";
int err;
......@@ -749,8 +758,18 @@ static int connect_ring(struct backend_info *be)
xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
return -1;
}
pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n",
ring_ref, evtchn, be->blkif->blk_protocol, protocol);
err = xenbus_gather(XBT_NIL, dev->otherend,
"feature-persistent", "%u",
&pers_grants, NULL);
if (err)
pers_grants = 0;
be->blkif->vbd.feature_gnt_persistent = pers_grants;
be->blkif->vbd.overflow_max_grants = 0;
pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
ring_ref, evtchn, be->blkif->blk_protocol, protocol,
pers_grants ? "persistent grants" : "");
/* Map the shared frame, irq etc. */
err = xen_blkif_map(be->blkif, ring_ref, evtchn);
......
此差异已折叠。
......@@ -452,7 +452,7 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
spin_lock_irq(&mddev->write_lock);
wait_event_lock_irq(mddev->sb_wait,
!mddev->flush_bio,
mddev->write_lock, /*nothing*/);
mddev->write_lock);
mddev->flush_bio = bio;
spin_unlock_irq(&mddev->write_lock);
......
......@@ -551,32 +551,6 @@ struct md_thread {
#define THREAD_WAKEUP 0
#define __wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
wait_queue_t __wait; \
init_waitqueue_entry(&__wait, current); \
\
add_wait_queue(&wq, &__wait); \
for (;;) { \
set_current_state(TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
spin_unlock_irq(&lock); \
cmd; \
schedule(); \
spin_lock_irq(&lock); \
} \
current->state = TASK_RUNNING; \
remove_wait_queue(&wq, &__wait); \
} while (0)
#define wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
if (condition) \
break; \
__wait_event_lock_irq(wq, condition, lock, cmd); \
} while (0)
static inline void safe_put_page(struct page *p)
{
if (p) put_page(p);
......
......@@ -822,7 +822,7 @@ static void raise_barrier(struct r1conf *conf)
/* Wait until no block IO is waiting */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
conf->resync_lock, );
conf->resync_lock);
/* block any new IO from starting */
conf->barrier++;
......@@ -830,7 +830,7 @@ static void raise_barrier(struct r1conf *conf)
/* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
conf->resync_lock, );
conf->resync_lock);
spin_unlock_irq(&conf->resync_lock);
}
......@@ -864,8 +864,7 @@ static void wait_barrier(struct r1conf *conf)
(conf->nr_pending &&
current->bio_list &&
!bio_list_empty(current->bio_list)),
conf->resync_lock,
);
conf->resync_lock);
conf->nr_waiting--;
}
conf->nr_pending++;
......@@ -898,10 +897,10 @@ static void freeze_array(struct r1conf *conf)
spin_lock_irq(&conf->resync_lock);
conf->barrier++;
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
flush_pending_writes(conf));
wait_event_lock_irq_cmd(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
flush_pending_writes(conf));
spin_unlock_irq(&conf->resync_lock);
}
static void unfreeze_array(struct r1conf *conf)
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -53,10 +53,13 @@ struct loop_device {
spinlock_t lo_lock;
struct bio_list lo_bio_list;
unsigned int lo_bio_count;
int lo_state;
struct mutex lo_ctl_mutex;
struct task_struct *lo_thread;
wait_queue_head_t lo_event;
/* wait queue for incoming requests */
wait_queue_head_t lo_req_wait;
struct request_queue *lo_queue;
struct gendisk *lo_disk;
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册