提交 69cf2d85 编写于 作者: E Ed Cashin 提交者: Linus Torvalds

aoe: become I/O request queue handler for increased user control

To allow users to choose an elevator algorithm for their particular
workloads, change from a make_request-style driver to an
I/O-request-queue-handler-style driver.

We have to do a couple of things that might be surprising.  We manipulate
the page _count directly on the assumption that we still have no guarantee
that users of the block layer are prohibited from submitting bios
containing pages with zero reference counts.[1] If such a prohibition now
exists, I can get rid of the _count manipulation.

Just as before this patch, we keep track of the sk_buffs that the
network layer has not yet finished with, and cap the resources we use
with a "pool" of skbs.[2]

Now that the block layer maintains the disk stats, the aoe driver's
diskstats function can go away.

1. https://lkml.org/lkml/2007/3/1/374
2. https://lkml.org/lkml/2007/7/6/241

Signed-off-by: Ed Cashin <ecashin@coraid.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 896831f5
...@@ -90,7 +90,7 @@ enum { ...@@ -90,7 +90,7 @@ enum {
MIN_BUFS = 16, MIN_BUFS = 16,
NTARGETS = 8, NTARGETS = 8,
NAOEIFS = 8, NAOEIFS = 8,
NSKBPOOLMAX = 128, NSKBPOOLMAX = 256,
NFACTIVE = 17, NFACTIVE = 17,
TIMERTICK = HZ / 10, TIMERTICK = HZ / 10,
...@@ -100,30 +100,26 @@ enum { ...@@ -100,30 +100,26 @@ enum {
}; };
struct buf { struct buf {
struct list_head bufs;
ulong stime; /* for disk stats */
ulong flags;
ulong nframesout; ulong nframesout;
ulong resid; ulong resid;
ulong bv_resid; ulong bv_resid;
ulong bv_off;
sector_t sector; sector_t sector;
struct bio *bio; struct bio *bio;
struct bio_vec *bv; struct bio_vec *bv;
struct request *rq;
}; };
struct frame { struct frame {
struct list_head head; struct list_head head;
u32 tag; u32 tag;
ulong waited; ulong waited;
struct buf *buf;
struct aoetgt *t; /* parent target I belong to */ struct aoetgt *t; /* parent target I belong to */
char *bufaddr;
ulong bcnt;
sector_t lba; sector_t lba;
struct sk_buff *skb; /* command skb freed on module exit */ struct sk_buff *skb; /* command skb freed on module exit */
struct sk_buff *r_skb; /* response skb for async processing */ struct sk_buff *r_skb; /* response skb for async processing */
struct buf *buf;
struct bio_vec *bv; struct bio_vec *bv;
ulong bcnt;
ulong bv_off; ulong bv_off;
}; };
...@@ -161,6 +157,7 @@ struct aoedev { ...@@ -161,6 +157,7 @@ struct aoedev {
u16 rttavg; /* round trip average of requests/responses */ u16 rttavg; /* round trip average of requests/responses */
u16 mintimer; u16 mintimer;
u16 fw_ver; /* version of blade's firmware */ u16 fw_ver; /* version of blade's firmware */
ulong ref;
struct work_struct work;/* disk create work struct */ struct work_struct work;/* disk create work struct */
struct gendisk *gd; struct gendisk *gd;
struct request_queue *blkq; struct request_queue *blkq;
...@@ -168,11 +165,13 @@ struct aoedev { ...@@ -168,11 +165,13 @@ struct aoedev {
sector_t ssize; sector_t ssize;
struct timer_list timer; struct timer_list timer;
spinlock_t lock; spinlock_t lock;
struct sk_buff_head sendq;
struct sk_buff_head skbpool; struct sk_buff_head skbpool;
mempool_t *bufpool; /* for deadlock-free Buf allocation */ mempool_t *bufpool; /* for deadlock-free Buf allocation */
struct list_head bufq; /* queue of bios to work on */ struct { /* pointers to work in progress */
struct buf *inprocess; /* the one we're currently working on */ struct buf *buf;
struct bio *nxbio;
struct request *rq;
} ip;
struct aoetgt *targets[NTARGETS]; struct aoetgt *targets[NTARGETS];
struct aoetgt **tgt; /* target in use when working */ struct aoetgt **tgt; /* target in use when working */
struct aoetgt *htgt; /* target needing rexmit assistance */ struct aoetgt *htgt; /* target needing rexmit assistance */
...@@ -209,6 +208,8 @@ void aoecmd_exit(void); ...@@ -209,6 +208,8 @@ void aoecmd_exit(void);
int aoecmd_init(void); int aoecmd_init(void);
struct sk_buff *aoecmd_ata_id(struct aoedev *); struct sk_buff *aoecmd_ata_id(struct aoedev *);
void aoe_freetframe(struct frame *); void aoe_freetframe(struct frame *);
void aoe_flush_iocq(void);
void aoe_end_request(struct aoedev *, struct request *, int);
int aoedev_init(void); int aoedev_init(void);
void aoedev_exit(void); void aoedev_exit(void);
...@@ -216,7 +217,8 @@ struct aoedev *aoedev_by_aoeaddr(int maj, int min); ...@@ -216,7 +217,8 @@ struct aoedev *aoedev_by_aoeaddr(int maj, int min);
struct aoedev *aoedev_by_sysminor_m(ulong sysminor); struct aoedev *aoedev_by_sysminor_m(ulong sysminor);
void aoedev_downdev(struct aoedev *d); void aoedev_downdev(struct aoedev *d);
int aoedev_flush(const char __user *str, size_t size); int aoedev_flush(const char __user *str, size_t size);
void aoe_failbuf(struct aoedev *d, struct buf *buf); void aoe_failbuf(struct aoedev *, struct buf *);
void aoedev_put(struct aoedev *);
int aoenet_init(void); int aoenet_init(void);
void aoenet_exit(void); void aoenet_exit(void);
......
...@@ -161,68 +161,22 @@ aoeblk_release(struct gendisk *disk, fmode_t mode) ...@@ -161,68 +161,22 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
} }
static void static void
aoeblk_make_request(struct request_queue *q, struct bio *bio) aoeblk_request(struct request_queue *q)
{ {
struct sk_buff_head queue;
struct aoedev *d; struct aoedev *d;
struct buf *buf; struct request *rq;
ulong flags;
blk_queue_bounce(q, &bio);
if (bio == NULL) {
printk(KERN_ERR "aoe: bio is NULL\n");
BUG();
return;
}
d = bio->bi_bdev->bd_disk->private_data;
if (d == NULL) {
printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
BUG();
bio_endio(bio, -ENXIO);
return;
} else if (bio->bi_io_vec == NULL) {
printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
BUG();
bio_endio(bio, -ENXIO);
return;
}
buf = mempool_alloc(d->bufpool, GFP_NOIO);
if (buf == NULL) {
printk(KERN_INFO "aoe: buf allocation failure\n");
bio_endio(bio, -ENOMEM);
return;
}
memset(buf, 0, sizeof(*buf));
INIT_LIST_HEAD(&buf->bufs);
buf->stime = jiffies;
buf->bio = bio;
buf->resid = bio->bi_size;
buf->sector = bio->bi_sector;
buf->bv = &bio->bi_io_vec[bio->bi_idx];
buf->bv_resid = buf->bv->bv_len;
WARN_ON(buf->bv_resid == 0);
buf->bv_off = buf->bv->bv_offset;
spin_lock_irqsave(&d->lock, flags);
d = q->queuedata;
if ((d->flags & DEVFL_UP) == 0) { if ((d->flags & DEVFL_UP) == 0) {
pr_info_ratelimited("aoe: device %ld.%d is not up\n", pr_info_ratelimited("aoe: device %ld.%d is not up\n",
d->aoemajor, d->aoeminor); d->aoemajor, d->aoeminor);
spin_unlock_irqrestore(&d->lock, flags); while ((rq = blk_peek_request(q))) {
mempool_free(buf, d->bufpool); blk_start_request(rq);
bio_endio(bio, -ENXIO); aoe_end_request(d, rq, 1);
}
return; return;
} }
list_add_tail(&buf->bufs, &d->bufq);
aoecmd_work(d); aoecmd_work(d);
__skb_queue_head_init(&queue);
skb_queue_splice_init(&d->sendq, &queue);
spin_unlock_irqrestore(&d->lock, flags);
aoenet_xmit(&queue);
} }
static int static int
...@@ -254,34 +208,46 @@ aoeblk_gdalloc(void *vp) ...@@ -254,34 +208,46 @@ aoeblk_gdalloc(void *vp)
{ {
struct aoedev *d = vp; struct aoedev *d = vp;
struct gendisk *gd; struct gendisk *gd;
enum { KB = 1024, MB = KB * KB, READ_AHEAD = MB, }; mempool_t *mp;
struct request_queue *q;
enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
ulong flags; ulong flags;
gd = alloc_disk(AOE_PARTITIONS); gd = alloc_disk(AOE_PARTITIONS);
if (gd == NULL) { if (gd == NULL) {
printk(KERN_ERR pr_err("aoe: cannot allocate disk structure for %ld.%d\n",
"aoe: cannot allocate disk structure for %ld.%d\n",
d->aoemajor, d->aoeminor); d->aoemajor, d->aoeminor);
goto err; goto err;
} }
d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache); mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab,
if (d->bufpool == NULL) { buf_pool_cache);
if (mp == NULL) {
printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n", printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
d->aoemajor, d->aoeminor); d->aoemajor, d->aoeminor);
goto err_disk; goto err_disk;
} }
q = blk_init_queue(aoeblk_request, &d->lock);
if (q == NULL) {
pr_err("aoe: cannot allocate block queue for %ld.%d\n",
d->aoemajor, d->aoeminor);
mempool_destroy(mp);
goto err_disk;
}
d->blkq = blk_alloc_queue(GFP_KERNEL); d->blkq = blk_alloc_queue(GFP_KERNEL);
if (!d->blkq) if (!d->blkq)
goto err_mempool; goto err_mempool;
blk_queue_make_request(d->blkq, aoeblk_make_request);
d->blkq->backing_dev_info.name = "aoe"; d->blkq->backing_dev_info.name = "aoe";
if (bdi_init(&d->blkq->backing_dev_info)) if (bdi_init(&d->blkq->backing_dev_info))
goto err_blkq; goto err_blkq;
spin_lock_irqsave(&d->lock, flags); spin_lock_irqsave(&d->lock, flags);
blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS); blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
d->blkq->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
d->bufpool = mp;
d->blkq = gd->queue = q;
q->queuedata = d;
d->gd = gd;
gd->major = AOE_MAJOR; gd->major = AOE_MAJOR;
gd->first_minor = d->sysminor * AOE_PARTITIONS; gd->first_minor = d->sysminor * AOE_PARTITIONS;
gd->fops = &aoe_bdops; gd->fops = &aoe_bdops;
...@@ -290,8 +256,6 @@ aoeblk_gdalloc(void *vp) ...@@ -290,8 +256,6 @@ aoeblk_gdalloc(void *vp)
snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d", snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
d->aoemajor, d->aoeminor); d->aoemajor, d->aoeminor);
gd->queue = d->blkq;
d->gd = gd;
d->flags &= ~DEVFL_GDALLOC; d->flags &= ~DEVFL_GDALLOC;
d->flags |= DEVFL_UP; d->flags |= DEVFL_UP;
......
...@@ -106,6 +106,7 @@ revalidate(const char __user *str, size_t size) ...@@ -106,6 +106,7 @@ revalidate(const char __user *str, size_t size)
spin_lock_irqsave(&d->lock, flags); spin_lock_irqsave(&d->lock, flags);
goto loop; goto loop;
} }
aoedev_put(d);
if (skb) { if (skb) {
struct sk_buff_head queue; struct sk_buff_head queue;
__skb_queue_head_init(&queue); __skb_queue_head_init(&queue);
......
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
static void ktcomplete(struct frame *, struct sk_buff *); static void ktcomplete(struct frame *, struct sk_buff *);
static struct buf *nextbuf(struct aoedev *);
static int aoe_deadsecs = 60 * 3; static int aoe_deadsecs = 60 * 3;
module_param(aoe_deadsecs, int, 0644); module_param(aoe_deadsecs, int, 0644);
MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
...@@ -283,17 +285,20 @@ aoecmd_ata_rw(struct aoedev *d) ...@@ -283,17 +285,20 @@ aoecmd_ata_rw(struct aoedev *d)
struct bio_vec *bv; struct bio_vec *bv;
struct aoetgt *t; struct aoetgt *t;
struct sk_buff *skb; struct sk_buff *skb;
struct sk_buff_head queue;
ulong bcnt, fbcnt; ulong bcnt, fbcnt;
char writebit, extbit; char writebit, extbit;
writebit = 0x10; writebit = 0x10;
extbit = 0x4; extbit = 0x4;
buf = nextbuf(d);
if (buf == NULL)
return 0;
f = newframe(d); f = newframe(d);
if (f == NULL) if (f == NULL)
return 0; return 0;
t = *d->tgt; t = *d->tgt;
buf = d->inprocess;
bv = buf->bv; bv = buf->bv;
bcnt = t->ifp->maxbcnt; bcnt = t->ifp->maxbcnt;
if (bcnt == 0) if (bcnt == 0)
...@@ -312,7 +317,7 @@ aoecmd_ata_rw(struct aoedev *d) ...@@ -312,7 +317,7 @@ aoecmd_ata_rw(struct aoedev *d)
fbcnt -= buf->bv_resid; fbcnt -= buf->bv_resid;
buf->resid -= buf->bv_resid; buf->resid -= buf->bv_resid;
if (buf->resid == 0) { if (buf->resid == 0) {
d->inprocess = NULL; d->ip.buf = NULL;
break; break;
} }
buf->bv++; buf->bv++;
...@@ -364,8 +369,11 @@ aoecmd_ata_rw(struct aoedev *d) ...@@ -364,8 +369,11 @@ aoecmd_ata_rw(struct aoedev *d)
skb->dev = t->ifp->nd; skb->dev = t->ifp->nd;
skb = skb_clone(skb, GFP_ATOMIC); skb = skb_clone(skb, GFP_ATOMIC);
if (skb) if (skb) {
__skb_queue_tail(&d->sendq, skb); __skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
}
return 1; return 1;
} }
...@@ -415,6 +423,7 @@ static void ...@@ -415,6 +423,7 @@ static void
resend(struct aoedev *d, struct frame *f) resend(struct aoedev *d, struct frame *f)
{ {
struct sk_buff *skb; struct sk_buff *skb;
struct sk_buff_head queue;
struct aoe_hdr *h; struct aoe_hdr *h;
struct aoe_atahdr *ah; struct aoe_atahdr *ah;
struct aoetgt *t; struct aoetgt *t;
...@@ -444,7 +453,9 @@ resend(struct aoedev *d, struct frame *f) ...@@ -444,7 +453,9 @@ resend(struct aoedev *d, struct frame *f)
skb = skb_clone(skb, GFP_ATOMIC); skb = skb_clone(skb, GFP_ATOMIC);
if (skb == NULL) if (skb == NULL)
return; return;
__skb_queue_tail(&d->sendq, skb); __skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
} }
static int static int
...@@ -554,7 +565,6 @@ ata_scnt(unsigned char *packet) { ...@@ -554,7 +565,6 @@ ata_scnt(unsigned char *packet) {
static void static void
rexmit_timer(ulong vp) rexmit_timer(ulong vp)
{ {
struct sk_buff_head queue;
struct aoedev *d; struct aoedev *d;
struct aoetgt *t, **tt, **te; struct aoetgt *t, **tt, **te;
struct aoeif *ifp; struct aoeif *ifp;
...@@ -603,6 +613,12 @@ rexmit_timer(ulong vp) ...@@ -603,6 +613,12 @@ rexmit_timer(ulong vp)
} }
} }
if (!list_empty(&flist)) { /* retransmissions necessary */
n = d->rttavg <<= 1;
if (n > MAXTIMER)
d->rttavg = MAXTIMER;
}
/* process expired frames */ /* process expired frames */
while (!list_empty(&flist)) { while (!list_empty(&flist)) {
pos = flist.next; pos = flist.next;
...@@ -641,45 +657,131 @@ rexmit_timer(ulong vp) ...@@ -641,45 +657,131 @@ rexmit_timer(ulong vp)
resend(d, f); resend(d, f);
} }
if (!skb_queue_empty(&d->sendq)) { if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) {
n = d->rttavg <<= 1;
if (n > MAXTIMER)
d->rttavg = MAXTIMER;
}
if (d->flags & DEVFL_KICKME || d->htgt) {
d->flags &= ~DEVFL_KICKME; d->flags &= ~DEVFL_KICKME;
aoecmd_work(d); d->blkq->request_fn(d->blkq);
} }
__skb_queue_head_init(&queue);
skb_queue_splice_init(&d->sendq, &queue);
d->timer.expires = jiffies + TIMERTICK; d->timer.expires = jiffies + TIMERTICK;
add_timer(&d->timer); add_timer(&d->timer);
spin_unlock_irqrestore(&d->lock, flags); spin_unlock_irqrestore(&d->lock, flags);
}
aoenet_xmit(&queue); static unsigned long
rqbiocnt(struct request *r)
{
struct bio *bio;
unsigned long n = 0;
__rq_for_each_bio(bio, r)
n++;
return n;
}
/*
 * Workaround: bump the reference count of every page in a bio.
 *
 * The network layer sometimes does a put_page on pages it is handed,
 * and we have no certainty that users of the block layer never submit
 * bios containing zero-count pages.  If that ever becomes certain,
 * this can be removed.  See
 * http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
 * discussion.
 *
 * get_page cannot be used because it insists on a positive page
 * count as a precondition, so _count is incremented directly.
 */
static void
bio_pageinc(struct bio *bio)
{
	struct bio_vec *seg;
	struct page *pg;
	int idx;

	bio_for_each_segment(seg, bio, idx) {
		pg = seg->bv_page;
		/* The kernel no longer allows a non-zero page count on
		 * non-head members of compound pages.  That has never
		 * been seen here, so a tail page is treated as fatal.
		 */
		if (unlikely(PageCompound(pg)) &&
		    compound_trans_head(pg) != pg) {
			pr_crit("page tail used for block I/O\n");
			BUG();
		}
		atomic_inc(&pg->_count);
	}
}
static void
bio_pagedec(struct bio *bio)
{
struct bio_vec *bv;
int i;
bio_for_each_segment(bv, bio, i)
atomic_dec(&bv->bv_page->_count);
}
/*
 * Initialise buf to track the transfer of one bio that belongs to
 * request rq.  The buf is zeroed, the bio's pages get an extra
 * reference via bio_pageinc, and the cursor fields are pointed at the
 * bio's current segment (bi_io_vec[bi_idx]).
 */
static void
bufinit(struct buf *buf, struct request *rq, struct bio *bio)
{
	struct bio_vec *first;

	memset(buf, 0, sizeof *buf);
	bio_pageinc(bio);

	/* start at the bio's current segment */
	first = &bio->bi_io_vec[bio->bi_idx];
	buf->bv = first;
	buf->bv_resid = first->bv_len;
	WARN_ON(buf->bv_resid == 0);

	/* whole-bio bookkeeping */
	buf->rq = rq;
	buf->bio = bio;
	buf->sector = bio->bi_sector;
	buf->resid = bio->bi_size;
}
/*
 * Return the buf the device should work on next.
 *
 * If a buf is already in process (d->ip.buf) it is returned as is.
 * Otherwise a new buf is allocated for the next bio of the in-process
 * request, starting a fresh request from the block queue when none is
 * in process.
 *
 * Returns NULL when the device has no queue yet, when the queue has no
 * request to offer, or when the buf allocation fails.
 *
 * NOTE(review): the GFP_ATOMIC allocation suggests the caller holds
 * d->lock or is otherwise in atomic context -- confirm against callers.
 */
static struct buf *
nextbuf(struct aoedev *d)
{
struct request *rq;
struct request_queue *q;
struct buf *buf;
struct bio *bio;
q = d->blkq;
if (q == NULL)
return NULL; /* initializing */
/* a partially issued buf takes priority over starting a new one */
if (d->ip.buf)
return d->ip.buf;
rq = d->ip.rq;
if (rq == NULL) {
/* no request in process: take the next one off the queue */
rq = blk_peek_request(q);
if (rq == NULL)
return NULL;
blk_start_request(rq);
d->ip.rq = rq;
d->ip.nxbio = rq->bio;
/* rq->special is used as a counter of the request's bios */
rq->special = (void *) rqbiocnt(rq);
}
buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
if (buf == NULL) {
/* d->ip.rq and d->ip.nxbio stay set, so a later call resumes here */
pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
return NULL;
}
bio = d->ip.nxbio;
bufinit(buf, rq, bio);
/* advance to the following bio of the request, if any */
bio = bio->bi_next;
d->ip.nxbio = bio;
/* last bio handed out: the next call will fetch a new request */
if (bio == NULL)
d->ip.rq = NULL;
return d->ip.buf = buf;
} }
/* enters with d->lock held */ /* enters with d->lock held */
void void
aoecmd_work(struct aoedev *d) aoecmd_work(struct aoedev *d)
{ {
struct buf *buf;
loop:
if (d->htgt && !sthtith(d)) if (d->htgt && !sthtith(d))
return; return;
if (d->inprocess == NULL) { while (aoecmd_ata_rw(d))
if (list_empty(&d->bufq)) ;
return;
buf = container_of(d->bufq.next, struct buf, bufs);
list_del(d->bufq.next);
d->inprocess = buf;
}
if (aoecmd_ata_rw(d))
goto loop;
} }
/* this function performs work that has been deferred until sleeping is OK /* this function performs work that has been deferred until sleeping is OK
...@@ -802,25 +904,6 @@ gettgt(struct aoedev *d, char *addr) ...@@ -802,25 +904,6 @@ gettgt(struct aoedev *d, char *addr)
return NULL; return NULL;
} }
static inline void
diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
{
unsigned long n_sect = bio->bi_size >> 9;
const int rw = bio_data_dir(bio);
struct hd_struct *part;
int cpu;
cpu = part_stat_lock();
part = disk_map_sector_rcu(disk, sector);
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
part_stat_add(cpu, part, sectors[rw], n_sect);
part_stat_add(cpu, part, io_ticks, duration);
part_stat_unlock();
}
static void static void
bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt) bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt)
{ {
...@@ -842,6 +925,43 @@ bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt) ...@@ -842,6 +925,43 @@ bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt)
goto loop; goto loop;
} }
/*
 * Complete request rq on device d, one bio at a time.
 *
 * Each bio is ended with -EIO when fastfail is set or when its
 * BIO_UPTODATE flag is clear, and with 0 otherwise.  The loop keeps
 * going for as long as __blk_end_request returns non-zero.  If rq is
 * the device's in-process request, that pointer is cleared first.
 * Unless fastfail is set, the queue's request_fn is invoked afterwards.
 */
void
aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
{
	struct request_queue *queue = d->blkq;
	struct bio *cur;
	int err;

	if (d->ip.rq == rq)
		d->ip.rq = NULL;

	for (;;) {
		cur = rq->bio;
		if (!fastfail && test_bit(BIO_UPTODATE, &cur->bi_flags))
			err = 0;
		else
			err = -EIO;
		if (!__blk_end_request(rq, err, cur->bi_size))
			break;
	}

	/* cf. http://lkml.org/lkml/2006/10/31/28 */
	if (!fastfail)
		queue->request_fn(queue);
}
/*
 * Retire buf: clear it as the device's in-process buf if it is, drop
 * the page references taken for its bio, and return it to the device's
 * bufpool.  The owning request's counter in rq->special is then
 * decremented, and the request is ended once it reaches zero.
 */
static void
aoe_end_buf(struct aoedev *d, struct buf *buf)
{
	struct request *owner = buf->rq;	/* read before buf is freed */
	unsigned long left;

	if (d->ip.buf == buf)
		d->ip.buf = NULL;

	bio_pagedec(buf->bio);
	mempool_free(buf, d->bufpool);

	left = (unsigned long) owner->special - 1;
	owner->special = (void *) left;
	if (left == 0)
		aoe_end_request(d, owner, 0);
}
static void static void
ktiocomplete(struct frame *f) ktiocomplete(struct frame *f)
{ {
...@@ -876,7 +996,7 @@ ktiocomplete(struct frame *f) ...@@ -876,7 +996,7 @@ ktiocomplete(struct frame *f)
ahout->cmdstat, ahin->cmdstat, ahout->cmdstat, ahin->cmdstat,
d->aoemajor, d->aoeminor); d->aoemajor, d->aoeminor);
noskb: if (buf) noskb: if (buf)
buf->flags |= BUFFL_FAIL; clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
goto badrsp; goto badrsp;
} }
...@@ -887,7 +1007,7 @@ noskb: if (buf) ...@@ -887,7 +1007,7 @@ noskb: if (buf)
if (skb->len < n) { if (skb->len < n) {
pr_err("aoe: runt data size in read. skb->len=%d need=%ld\n", pr_err("aoe: runt data size in read. skb->len=%d need=%ld\n",
skb->len, n); skb->len, n);
buf->flags |= BUFFL_FAIL; clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
break; break;
} }
bvcpy(f->bv, f->bv_off, skb, n); bvcpy(f->bv, f->bv_off, skb, n);
...@@ -927,18 +1047,13 @@ noskb: if (buf) ...@@ -927,18 +1047,13 @@ noskb: if (buf)
aoe_freetframe(f); aoe_freetframe(f);
if (buf && --buf->nframesout == 0 && buf->resid == 0) { if (buf && --buf->nframesout == 0 && buf->resid == 0)
struct bio *bio = buf->bio; aoe_end_buf(d, buf);
diskstats(d->gd, bio, jiffies - buf->stime, buf->sector); aoecmd_work(d);
n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
mempool_free(buf, d->bufpool); spin_unlock_irq(&d->lock);
spin_unlock_irq(&d->lock); aoedev_put(d);
if (n != -EIO)
bio_flush_dcache_pages(buf->bio);
bio_endio(bio, n);
} else
spin_unlock_irq(&d->lock);
dev_kfree_skb(skb); dev_kfree_skb(skb);
} }
...@@ -1061,12 +1176,14 @@ aoecmd_ata_rsp(struct sk_buff *skb) ...@@ -1061,12 +1176,14 @@ aoecmd_ata_rsp(struct sk_buff *skb)
printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n", printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
d->aoemajor, d->aoeminor, h->src); d->aoemajor, d->aoeminor, h->src);
spin_unlock_irqrestore(&d->lock, flags); spin_unlock_irqrestore(&d->lock, flags);
aoedev_put(d);
return skb; return skb;
} }
f = getframe(t, n); f = getframe(t, n);
if (f == NULL) { if (f == NULL) {
calc_rttavg(d, -tsince(n)); calc_rttavg(d, -tsince(n));
spin_unlock_irqrestore(&d->lock, flags); spin_unlock_irqrestore(&d->lock, flags);
aoedev_put(d);
snprintf(ebuf, sizeof ebuf, snprintf(ebuf, sizeof ebuf,
"%15s e%d.%d tag=%08x@%08lx\n", "%15s e%d.%d tag=%08x@%08lx\n",
"unexpected rsp", "unexpected rsp",
...@@ -1185,8 +1302,10 @@ aoecmd_cfg_rsp(struct sk_buff *skb) ...@@ -1185,8 +1302,10 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
struct aoeif *ifp; struct aoeif *ifp;
ulong flags, sysminor, aoemajor; ulong flags, sysminor, aoemajor;
struct sk_buff *sl; struct sk_buff *sl;
struct sk_buff_head queue;
u16 n; u16 n;
sl = NULL;
h = (struct aoe_hdr *) skb_mac_header(skb); h = (struct aoe_hdr *) skb_mac_header(skb);
ch = (struct aoe_cfghdr *) (h+1); ch = (struct aoe_cfghdr *) (h+1);
...@@ -1223,10 +1342,8 @@ aoecmd_cfg_rsp(struct sk_buff *skb) ...@@ -1223,10 +1342,8 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
t = gettgt(d, h->src); t = gettgt(d, h->src);
if (!t) { if (!t) {
t = addtgt(d, h->src, n); t = addtgt(d, h->src, n);
if (!t) { if (!t)
spin_unlock_irqrestore(&d->lock, flags); goto bail;
return;
}
} }
ifp = getif(t, skb->dev); ifp = getif(t, skb->dev);
if (!ifp) { if (!ifp) {
...@@ -1235,8 +1352,7 @@ aoecmd_cfg_rsp(struct sk_buff *skb) ...@@ -1235,8 +1352,7 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
printk(KERN_INFO printk(KERN_INFO
"aoe: device addif failure; " "aoe: device addif failure; "
"too many interfaces?\n"); "too many interfaces?\n");
spin_unlock_irqrestore(&d->lock, flags); goto bail;
return;
} }
} }
if (ifp->maxbcnt) { if (ifp->maxbcnt) {
...@@ -1257,18 +1373,14 @@ aoecmd_cfg_rsp(struct sk_buff *skb) ...@@ -1257,18 +1373,14 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
} }
/* don't change users' perspective */ /* don't change users' perspective */
if (d->nopen) { if (d->nopen == 0) {
spin_unlock_irqrestore(&d->lock, flags); d->fw_ver = be16_to_cpu(ch->fwver);
return; sl = aoecmd_ata_id(d);
} }
d->fw_ver = be16_to_cpu(ch->fwver); bail:
sl = aoecmd_ata_id(d);
spin_unlock_irqrestore(&d->lock, flags); spin_unlock_irqrestore(&d->lock, flags);
aoedev_put(d);
if (sl) { if (sl) {
struct sk_buff_head queue;
__skb_queue_head_init(&queue); __skb_queue_head_init(&queue);
__skb_queue_tail(&queue, sl); __skb_queue_tail(&queue, sl);
aoenet_xmit(&queue); aoenet_xmit(&queue);
...@@ -1297,8 +1409,19 @@ aoecmd_cleanslate(struct aoedev *d) ...@@ -1297,8 +1409,19 @@ aoecmd_cleanslate(struct aoedev *d)
} }
} }
static void void
flush_iocq(void) aoe_failbuf(struct aoedev *d, struct buf *buf)
{
if (buf == NULL)
return;
buf->resid = 0;
clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
if (buf->nframesout == 0)
aoe_end_buf(d, buf);
}
void
aoe_flush_iocq(void)
{ {
struct frame *f; struct frame *f;
struct aoedev *d; struct aoedev *d;
...@@ -1324,6 +1447,7 @@ flush_iocq(void) ...@@ -1324,6 +1447,7 @@ flush_iocq(void)
aoe_freetframe(f); aoe_freetframe(f);
spin_unlock_irqrestore(&d->lock, flags); spin_unlock_irqrestore(&d->lock, flags);
dev_kfree_skb(skb); dev_kfree_skb(skb);
aoedev_put(d);
} }
} }
...@@ -1344,5 +1468,5 @@ void ...@@ -1344,5 +1468,5 @@ void
aoecmd_exit(void) aoecmd_exit(void)
{ {
aoe_ktstop(&kts); aoe_ktstop(&kts);
flush_iocq(); aoe_flush_iocq();
} }
...@@ -19,6 +19,17 @@ static void skbpoolfree(struct aoedev *d); ...@@ -19,6 +19,17 @@ static void skbpoolfree(struct aoedev *d);
static struct aoedev *devlist; static struct aoedev *devlist;
static DEFINE_SPINLOCK(devlist_lock); static DEFINE_SPINLOCK(devlist_lock);
/*
* Users who grab a pointer to the device with aoedev_by_aoeaddr or
* aoedev_by_sysminor_m automatically get a reference count and must
* be responsible for performing an aoedev_put. With the addition of
* async kthread processing I'm no longer confident that we can
* guarantee consistency in the face of device flushes.
*
* For the time being, we only bother to add extra references for
* frames sitting on the iocq. When the kthreads finish processing
* these frames, they will aoedev_put the device.
*/
struct aoedev * struct aoedev *
aoedev_by_aoeaddr(int maj, int min) aoedev_by_aoeaddr(int maj, int min)
{ {
...@@ -28,13 +39,25 @@ aoedev_by_aoeaddr(int maj, int min) ...@@ -28,13 +39,25 @@ aoedev_by_aoeaddr(int maj, int min)
spin_lock_irqsave(&devlist_lock, flags); spin_lock_irqsave(&devlist_lock, flags);
for (d=devlist; d; d=d->next) for (d=devlist; d; d=d->next)
if (d->aoemajor == maj && d->aoeminor == min) if (d->aoemajor == maj && d->aoeminor == min) {
d->ref++;
break; break;
}
spin_unlock_irqrestore(&devlist_lock, flags); spin_unlock_irqrestore(&devlist_lock, flags);
return d; return d;
} }
/* Drop one reference on device d; the counter is modified while
 * holding devlist_lock with interrupts saved and restored.
 */
void
aoedev_put(struct aoedev *d)
{
	ulong irqfl;

	spin_lock_irqsave(&devlist_lock, irqfl);
	--d->ref;
	spin_unlock_irqrestore(&devlist_lock, irqfl);
}
static void static void
dummy_timer(ulong vp) dummy_timer(ulong vp)
{ {
...@@ -47,21 +70,26 @@ dummy_timer(ulong vp) ...@@ -47,21 +70,26 @@ dummy_timer(ulong vp)
add_timer(&d->timer); add_timer(&d->timer);
} }
void static void
aoe_failbuf(struct aoedev *d, struct buf *buf) aoe_failip(struct aoedev *d)
{ {
struct request *rq;
struct bio *bio; struct bio *bio;
unsigned long n;
aoe_failbuf(d, d->ip.buf);
if (buf == NULL) rq = d->ip.rq;
if (rq == NULL)
return; return;
buf->flags |= BUFFL_FAIL; while ((bio = d->ip.nxbio)) {
if (buf->nframesout == 0) { clear_bit(BIO_UPTODATE, &bio->bi_flags);
if (buf == d->inprocess) /* ensure we only process this once */ d->ip.nxbio = bio->bi_next;
d->inprocess = NULL; n = (unsigned long) rq->special;
bio = buf->bio; rq->special = (void *) --n;
mempool_free(buf, d->bufpool);
bio_endio(bio, -EIO);
} }
if ((unsigned long) rq->special == 0)
aoe_end_request(d, rq, 0);
} }
void void
...@@ -70,8 +98,11 @@ aoedev_downdev(struct aoedev *d) ...@@ -70,8 +98,11 @@ aoedev_downdev(struct aoedev *d)
struct aoetgt *t, **tt, **te; struct aoetgt *t, **tt, **te;
struct frame *f; struct frame *f;
struct list_head *head, *pos, *nx; struct list_head *head, *pos, *nx;
struct request *rq;
int i; int i;
d->flags &= ~DEVFL_UP;
/* clean out active buffers on all targets */ /* clean out active buffers on all targets */
tt = d->targets; tt = d->targets;
te = tt + NTARGETS; te = tt + NTARGETS;
...@@ -92,22 +123,20 @@ aoedev_downdev(struct aoedev *d) ...@@ -92,22 +123,20 @@ aoedev_downdev(struct aoedev *d)
t->nout = 0; t->nout = 0;
} }
/* clean out the in-process buffer (if any) */ /* clean out the in-process request (if any) */
aoe_failbuf(d, d->inprocess); aoe_failip(d);
d->inprocess = NULL;
d->htgt = NULL; d->htgt = NULL;
/* clean out all pending I/O */ /* fast fail all pending I/O */
while (!list_empty(&d->bufq)) { if (d->blkq) {
struct buf *buf = container_of(d->bufq.next, struct buf, bufs); while ((rq = blk_peek_request(d->blkq))) {
list_del(d->bufq.next); blk_start_request(rq);
aoe_failbuf(d, buf); aoe_end_request(d, rq, 1);
}
} }
if (d->gd) if (d->gd)
set_capacity(d->gd, 0); set_capacity(d->gd, 0);
d->flags &= ~DEVFL_UP;
} }
static void static void
...@@ -120,6 +149,7 @@ aoedev_freedev(struct aoedev *d) ...@@ -120,6 +149,7 @@ aoedev_freedev(struct aoedev *d)
aoedisk_rm_sysfs(d); aoedisk_rm_sysfs(d);
del_gendisk(d->gd); del_gendisk(d->gd);
put_disk(d->gd); put_disk(d->gd);
blk_cleanup_queue(d->blkq);
} }
t = d->targets; t = d->targets;
e = t + NTARGETS; e = t + NTARGETS;
...@@ -128,7 +158,6 @@ aoedev_freedev(struct aoedev *d) ...@@ -128,7 +158,6 @@ aoedev_freedev(struct aoedev *d)
if (d->bufpool) if (d->bufpool)
mempool_destroy(d->bufpool); mempool_destroy(d->bufpool);
skbpoolfree(d); skbpoolfree(d);
blk_cleanup_queue(d->blkq);
kfree(d); kfree(d);
} }
...@@ -155,7 +184,8 @@ aoedev_flush(const char __user *str, size_t cnt) ...@@ -155,7 +184,8 @@ aoedev_flush(const char __user *str, size_t cnt)
spin_lock(&d->lock); spin_lock(&d->lock);
if ((!all && (d->flags & DEVFL_UP)) if ((!all && (d->flags & DEVFL_UP))
|| (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
|| d->nopen) { || d->nopen
|| d->ref) {
spin_unlock(&d->lock); spin_unlock(&d->lock);
dd = &d->next; dd = &d->next;
continue; continue;
...@@ -176,12 +206,15 @@ aoedev_flush(const char __user *str, size_t cnt) ...@@ -176,12 +206,15 @@ aoedev_flush(const char __user *str, size_t cnt)
return 0; return 0;
} }
/* I'm not really sure that this is a realistic problem, but if the /* This has been confirmed to occur once with Tms=3*1000 due to the
network driver goes gonzo let's just leak memory after complaining. */ * driver changing link and not processing its transmit ring. The
* problem is hard enough to solve by returning an error that I'm
* still punting on "solving" this.
*/
static void static void
skbfree(struct sk_buff *skb) skbfree(struct sk_buff *skb)
{ {
enum { Sms = 100, Tms = 3*1000}; enum { Sms = 250, Tms = 30 * 1000};
int i = Tms / Sms; int i = Tms / Sms;
if (skb == NULL) if (skb == NULL)
...@@ -222,8 +255,10 @@ aoedev_by_sysminor_m(ulong sysminor) ...@@ -222,8 +255,10 @@ aoedev_by_sysminor_m(ulong sysminor)
spin_lock_irqsave(&devlist_lock, flags); spin_lock_irqsave(&devlist_lock, flags);
for (d=devlist; d; d=d->next) for (d=devlist; d; d=d->next)
if (d->sysminor == sysminor) if (d->sysminor == sysminor) {
d->ref++;
break; break;
}
if (d) if (d)
goto out; goto out;
d = kcalloc(1, sizeof *d, GFP_ATOMIC); d = kcalloc(1, sizeof *d, GFP_ATOMIC);
...@@ -231,7 +266,6 @@ aoedev_by_sysminor_m(ulong sysminor) ...@@ -231,7 +266,6 @@ aoedev_by_sysminor_m(ulong sysminor)
goto out; goto out;
INIT_WORK(&d->work, aoecmd_sleepwork); INIT_WORK(&d->work, aoecmd_sleepwork);
spin_lock_init(&d->lock); spin_lock_init(&d->lock);
skb_queue_head_init(&d->sendq);
skb_queue_head_init(&d->skbpool); skb_queue_head_init(&d->skbpool);
init_timer(&d->timer); init_timer(&d->timer);
d->timer.data = (ulong) d; d->timer.data = (ulong) d;
...@@ -240,7 +274,7 @@ aoedev_by_sysminor_m(ulong sysminor) ...@@ -240,7 +274,7 @@ aoedev_by_sysminor_m(ulong sysminor)
add_timer(&d->timer); add_timer(&d->timer);
d->bufpool = NULL; /* defer to aoeblk_gdalloc */ d->bufpool = NULL; /* defer to aoeblk_gdalloc */
d->tgt = d->targets; d->tgt = d->targets;
INIT_LIST_HEAD(&d->bufq); d->ref = 1;
d->sysminor = sysminor; d->sysminor = sysminor;
d->aoemajor = AOEMAJOR(sysminor); d->aoemajor = AOEMAJOR(sysminor);
d->aoeminor = AOEMINOR(sysminor); d->aoeminor = AOEMINOR(sysminor);
...@@ -274,6 +308,7 @@ aoedev_exit(void) ...@@ -274,6 +308,7 @@ aoedev_exit(void)
struct aoedev *d; struct aoedev *d;
ulong flags; ulong flags;
aoe_flush_iocq();
while ((d = devlist)) { while ((d = devlist)) {
devlist = d->next; devlist = d->next;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册